]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/OSDMonitor.cc
import ceph 16.2.6
[ceph.git] / ceph / src / mon / OSDMonitor.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19#include <algorithm>
224ce89b 20#include <boost/algorithm/string.hpp>
11fdf7f2 21#include <experimental/iterator>
224ce89b 22#include <locale>
7c673cae
FG
23#include <sstream>
24
31f18b77
FG
25#include "mon/OSDMonitor.h"
26#include "mon/Monitor.h"
27#include "mon/MDSMonitor.h"
31f18b77
FG
28#include "mon/MgrStatMonitor.h"
29#include "mon/AuthMonitor.h"
f67539c2 30#include "mon/KVMonitor.h"
7c673cae 31
31f18b77
FG
32#include "mon/MonitorDBStore.h"
33#include "mon/Session.h"
7c673cae
FG
34
35#include "crush/CrushWrapper.h"
36#include "crush/CrushTester.h"
37#include "crush/CrushTreeDumper.h"
38
39#include "messages/MOSDBeacon.h"
40#include "messages/MOSDFailure.h"
41#include "messages/MOSDMarkMeDown.h"
9f95a23c 42#include "messages/MOSDMarkMeDead.h"
7c673cae
FG
43#include "messages/MOSDFull.h"
44#include "messages/MOSDMap.h"
45#include "messages/MMonGetOSDMap.h"
46#include "messages/MOSDBoot.h"
47#include "messages/MOSDAlive.h"
48#include "messages/MPoolOp.h"
49#include "messages/MPoolOpReply.h"
50#include "messages/MOSDPGCreate.h"
11fdf7f2 51#include "messages/MOSDPGCreate2.h"
7c673cae
FG
52#include "messages/MOSDPGCreated.h"
53#include "messages/MOSDPGTemp.h"
11fdf7f2 54#include "messages/MOSDPGReadyToMerge.h"
7c673cae
FG
55#include "messages/MMonCommand.h"
56#include "messages/MRemoveSnaps.h"
57#include "messages/MOSDScrub.h"
58#include "messages/MRoute.h"
9f95a23c
TL
59#include "messages/MMonGetPurgedSnaps.h"
60#include "messages/MMonGetPurgedSnapsReply.h"
7c673cae
FG
61
62#include "common/TextTable.h"
63#include "common/Timer.h"
64#include "common/ceph_argparse.h"
65#include "common/perf_counters.h"
eafe8130 66#include "common/PriorityCache.h"
7c673cae 67#include "common/strtol.h"
11fdf7f2 68#include "common/numa.h"
7c673cae
FG
69
70#include "common/config.h"
71#include "common/errno.h"
72
73#include "erasure-code/ErasureCodePlugin.h"
74#include "compressor/Compressor.h"
75#include "common/Checksummer.h"
76
77#include "include/compat.h"
11fdf7f2 78#include "include/ceph_assert.h"
7c673cae
FG
79#include "include/stringify.h"
80#include "include/util.h"
81#include "common/cmdparse.h"
82#include "include/str_list.h"
83#include "include/str_map.h"
224ce89b 84#include "include/scope_guard.h"
eafe8130 85#include "perfglue/heap_profiler.h"
7c673cae 86
28e407b8
AA
87#include "auth/cephx/CephxKeyServer.h"
88#include "osd/OSDCap.h"
89
7c673cae
FG
90#include "json_spirit/json_spirit_reader.h"
91
c07f9fc5
FG
92#include <boost/algorithm/string/predicate.hpp>
93
f67539c2
TL
94using std::dec;
95using std::hex;
96using std::list;
97using std::map;
98using std::make_pair;
99using std::ostringstream;
100using std::pair;
101using std::set;
102using std::string;
103using std::stringstream;
104using std::to_string;
105using std::vector;
106
107using ceph::bufferlist;
108using ceph::decode;
109using ceph::encode;
110using ceph::ErasureCodeInterfaceRef;
111using ceph::ErasureCodePluginRegistry;
112using ceph::ErasureCodeProfile;
113using ceph::Formatter;
114using ceph::JSONFormatter;
115using ceph::make_message;
116
7c673cae 117#define dout_subsys ceph_subsys_mon
3efd9988
FG
118static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
119static const string OSD_METADATA_PREFIX("osd_metadata");
11fdf7f2 120static const string OSD_SNAP_PREFIX("osd_snap");
7c673cae 121
9f95a23c
TL
122/*
123
124 OSD snapshot metadata
125 ---------------------
126
127 -- starting with mimic, removed in octopus --
128
129 "removed_epoch_%llu_%08lx" % (pool, epoch)
130 -> interval_set<snapid_t>
131
132 "removed_snap_%llu_%016llx" % (pool, last_snap)
133 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
134
135
136 -- starting with mimic --
137
138 "purged_snap_%llu_%016llx" % (pool, last_snap)
139 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
140
141 - note that the {removed,purged}_snap put the last snap in they key so
142 that we can use forward iteration only to search for an epoch in an
143 interval. e.g., to test if epoch N is removed/purged, we'll find a key
144 >= N that either does or doesn't contain the given snap.
145
146
147 -- starting with octopus --
148
149 "purged_epoch_%08lx" % epoch
150 -> map<int64_t,interval_set<snapid_t>>
151
152 */
153using namespace TOPNSPC::common;
c07f9fc5
FG
154namespace {
155
eafe8130
TL
156struct OSDMemCache : public PriorityCache::PriCache {
157 OSDMonitor *osdmon;
158 int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
159 int64_t committed_bytes = 0;
160 double cache_ratio = 0;
161
162 OSDMemCache(OSDMonitor *m) : osdmon(m) {};
163
164 virtual uint64_t _get_used_bytes() const = 0;
165
166 virtual int64_t request_cache_bytes(
167 PriorityCache::Priority pri, uint64_t total_cache) const {
168 int64_t assigned = get_cache_bytes(pri);
169
170 switch (pri) {
171 // All cache items are currently set to have PRI1 priority
172 case PriorityCache::Priority::PRI1:
173 {
174 int64_t request = _get_used_bytes();
175 return (request > assigned) ? request - assigned : 0;
176 }
177 default:
178 break;
179 }
180 return -EOPNOTSUPP;
181 }
182
183 virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
184 return cache_bytes[pri];
185 }
186
187 virtual int64_t get_cache_bytes() const {
188 int64_t total = 0;
189
190 for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
191 PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
192 total += get_cache_bytes(pri);
193 }
194 return total;
195 }
196
197 virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
198 cache_bytes[pri] = bytes;
199 }
200 virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
201 cache_bytes[pri] += bytes;
202 }
203 virtual int64_t commit_cache_size(uint64_t total_cache) {
204 committed_bytes = PriorityCache::get_chunk(
205 get_cache_bytes(), total_cache);
206 return committed_bytes;
207 }
208 virtual int64_t get_committed_size() const {
209 return committed_bytes;
210 }
211 virtual double get_cache_ratio() const {
212 return cache_ratio;
213 }
214 virtual void set_cache_ratio(double ratio) {
215 cache_ratio = ratio;
216 }
217 virtual string get_cache_name() const = 0;
218};
219
220struct IncCache : public OSDMemCache {
221 IncCache(OSDMonitor *m) : OSDMemCache(m) {};
222
223 virtual uint64_t _get_used_bytes() const {
224 return osdmon->inc_osd_cache.get_bytes();
225 }
226
227 virtual string get_cache_name() const {
228 return "OSDMap Inc Cache";
229 }
230
231 uint64_t _get_num_osdmaps() const {
232 return osdmon->inc_osd_cache.get_size();
233 }
234};
235
236struct FullCache : public OSDMemCache {
237 FullCache(OSDMonitor *m) : OSDMemCache(m) {};
238
239 virtual uint64_t _get_used_bytes() const {
240 return osdmon->full_osd_cache.get_bytes();
241 }
242
243 virtual string get_cache_name() const {
244 return "OSDMap Full Cache";
245 }
246
247 uint64_t _get_num_osdmaps() const {
248 return osdmon->full_osd_cache.get_size();
249 }
250};
251
252std::shared_ptr<IncCache> inc_cache;
253std::shared_ptr<FullCache> full_cache;
254
c07f9fc5
FG
255const uint32_t MAX_POOL_APPLICATIONS = 4;
256const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
257const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
258
28e407b8
AA
259bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
260 // Note: this doesn't include support for the application tag match
261 if ((grant.spec.allow & OSD_CAP_W) != 0) {
262 auto& match = grant.match;
263 if (match.is_match_all()) {
264 return true;
11fdf7f2 265 } else if (pool_name != nullptr &&
28e407b8
AA
266 !match.pool_namespace.pool_name.empty() &&
267 match.pool_namespace.pool_name == *pool_name) {
268 return true;
269 }
270 }
271 return false;
272}
273
274bool is_unmanaged_snap_op_permitted(CephContext* cct,
275 const KeyServer& key_server,
276 const EntityName& entity_name,
277 const MonCap& mon_caps,
11fdf7f2 278 const entity_addr_t& peer_socket_addr,
28e407b8
AA
279 const std::string* pool_name)
280{
281 typedef std::map<std::string, std::string> CommandArgs;
282
11fdf7f2 283 if (mon_caps.is_capable(
92f5a8d4 284 cct, entity_name, "osd",
11fdf7f2
TL
285 "osd pool op unmanaged-snap",
286 (pool_name == nullptr ?
287 CommandArgs{} /* pool DNE, require unrestricted cap */ :
288 CommandArgs{{"poolname", *pool_name}}),
289 false, true, false,
290 peer_socket_addr)) {
28e407b8
AA
291 return true;
292 }
293
294 AuthCapsInfo caps_info;
295 if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
296 caps_info)) {
297 dout(10) << "unable to locate OSD cap data for " << entity_name
298 << " in auth db" << dendl;
299 return false;
300 }
301
302 string caps_str;
303 if (caps_info.caps.length() > 0) {
11fdf7f2 304 auto p = caps_info.caps.cbegin();
28e407b8
AA
305 try {
306 decode(caps_str, p);
f67539c2 307 } catch (const ceph::buffer::error &err) {
28e407b8
AA
308 derr << "corrupt OSD cap data for " << entity_name << " in auth db"
309 << dendl;
310 return false;
311 }
312 }
313
314 OSDCap osd_cap;
315 if (!osd_cap.parse(caps_str, nullptr)) {
316 dout(10) << "unable to parse OSD cap data for " << entity_name
317 << " in auth db" << dendl;
318 return false;
319 }
320
321 // if the entity has write permissions in one or all pools, permit
322 // usage of unmanaged-snapshots
323 if (osd_cap.allow_all()) {
324 return true;
325 }
326
327 for (auto& grant : osd_cap.grants) {
328 if (grant.profile.is_valid()) {
329 for (auto& profile_grant : grant.profile_grants) {
330 if (is_osd_writable(profile_grant, pool_name)) {
331 return true;
332 }
333 }
334 } else if (is_osd_writable(grant, pool_name)) {
335 return true;
336 }
337 }
338
339 return false;
340}
341
c07f9fc5
FG
342} // anonymous namespace
343
522d829b
TL
344void LastEpochClean::Lec::report(unsigned pg_num, ps_t ps,
345 epoch_t last_epoch_clean)
7c673cae 346{
522d829b
TL
347 if (ps >= pg_num) {
348 // removed PG
349 return;
7c673cae 350 }
522d829b 351 epoch_by_pg.resize(pg_num, 0);
7c673cae
FG
352 const auto old_lec = epoch_by_pg[ps];
353 if (old_lec >= last_epoch_clean) {
354 // stale lec
355 return;
356 }
357 epoch_by_pg[ps] = last_epoch_clean;
358 if (last_epoch_clean < floor) {
359 floor = last_epoch_clean;
360 } else if (last_epoch_clean > floor) {
361 if (old_lec == floor) {
362 // probably should increase floor?
363 auto new_floor = std::min_element(std::begin(epoch_by_pg),
364 std::end(epoch_by_pg));
365 floor = *new_floor;
366 }
367 }
368 if (ps != next_missing) {
369 return;
370 }
371 for (; next_missing < epoch_by_pg.size(); next_missing++) {
372 if (epoch_by_pg[next_missing] == 0) {
373 break;
374 }
375 }
376}
377
378void LastEpochClean::remove_pool(uint64_t pool)
379{
380 report_by_pool.erase(pool);
381}
382
522d829b
TL
383void LastEpochClean::report(unsigned pg_num, const pg_t& pg,
384 epoch_t last_epoch_clean)
7c673cae
FG
385{
386 auto& lec = report_by_pool[pg.pool()];
522d829b 387 return lec.report(pg_num, pg.ps(), last_epoch_clean);
7c673cae
FG
388}
389
390epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
391{
392 auto floor = latest.get_epoch();
393 for (auto& pool : latest.get_pools()) {
394 auto reported = report_by_pool.find(pool.first);
395 if (reported == report_by_pool.end()) {
396 return 0;
397 }
398 if (reported->second.next_missing < pool.second.get_pg_num()) {
399 return 0;
400 }
401 if (reported->second.floor < floor) {
402 floor = reported->second.floor;
403 }
404 }
405 return floor;
406}
407
1911f103
TL
408void LastEpochClean::dump(Formatter *f) const
409{
410 f->open_array_section("per_pool");
411
f67539c2 412 for (auto& [pool, lec] : report_by_pool) {
1911f103 413 f->open_object_section("pool");
f67539c2
TL
414 f->dump_unsigned("poolid", pool);
415 f->dump_unsigned("floor", lec.floor);
1911f103
TL
416 f->close_section();
417 }
418
419 f->close_section();
420}
7c673cae 421
11fdf7f2
TL
422class C_UpdateCreatingPGs : public Context {
423public:
7c673cae
FG
424 OSDMonitor *osdmon;
425 utime_t start;
426 epoch_t epoch;
427 C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
428 osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
429 void finish(int r) override {
430 if (r >= 0) {
431 utime_t end = ceph_clock_now();
432 dout(10) << "osdmap epoch " << epoch << " mapping took "
433 << (end - start) << " seconds" << dendl;
434 osdmon->update_creating_pgs();
435 osdmon->check_pg_creates_subs();
436 }
437 }
438};
439
440#undef dout_prefix
441#define dout_prefix _prefix(_dout, mon, osdmap)
f67539c2
TL
442static ostream& _prefix(std::ostream *_dout, Monitor &mon, const OSDMap& osdmap) {
443 return *_dout << "mon." << mon.name << "@" << mon.rank
444 << "(" << mon.get_state_name()
7c673cae
FG
445 << ").osd e" << osdmap.get_epoch() << " ";
446}
447
448OSDMonitor::OSDMonitor(
449 CephContext *cct,
f67539c2
TL
450 Monitor &mn,
451 Paxos &p,
7c673cae
FG
452 const string& service_name)
453 : PaxosService(mn, p, service_name),
454 cct(cct),
11fdf7f2
TL
455 inc_osd_cache(g_conf()->mon_osd_cache_size),
456 full_osd_cache(g_conf()->mon_osd_cache_size),
457 has_osdmap_manifest(false),
f67539c2 458 mapper(mn.cct, &mn.cpu_tp)
eafe8130
TL
459{
460 inc_cache = std::make_shared<IncCache>(this);
461 full_cache = std::make_shared<FullCache>(this);
462 cct->_conf.add_observer(this);
463 int r = _set_cache_sizes();
464 if (r < 0) {
465 derr << __func__ << " using default osd cache size - mon_osd_cache_size ("
466 << g_conf()->mon_osd_cache_size
467 << ") without priority cache management"
468 << dendl;
469 }
470}
471
472const char **OSDMonitor::get_tracked_conf_keys() const
473{
474 static const char* KEYS[] = {
475 "mon_memory_target",
476 "mon_memory_autotune",
477 "rocksdb_cache_size",
478 NULL
479 };
480 return KEYS;
481}
482
483void OSDMonitor::handle_conf_change(const ConfigProxy& conf,
484 const std::set<std::string> &changed)
485{
486 dout(10) << __func__ << " " << changed << dendl;
487
488 if (changed.count("mon_memory_autotune")) {
489 _set_cache_autotuning();
490 }
491 if (changed.count("mon_memory_target") ||
492 changed.count("rocksdb_cache_size")) {
493 int r = _update_mon_cache_settings();
494 if (r < 0) {
495 derr << __func__ << " mon_memory_target:"
496 << g_conf()->mon_memory_target
497 << " rocksdb_cache_size:"
498 << g_conf()->rocksdb_cache_size
92f5a8d4 499 << ". Unable to update cache size."
eafe8130
TL
500 << dendl;
501 }
502 }
503}
504
505void OSDMonitor::_set_cache_autotuning()
506{
507 if (!g_conf()->mon_memory_autotune && pcm != nullptr) {
508 // Disable cache autotuning
509 std::lock_guard l(balancer_lock);
510 pcm = nullptr;
511 }
512
513 if (g_conf()->mon_memory_autotune && pcm == nullptr) {
514 int r = register_cache_with_pcm();
515 if (r < 0) {
516 dout(10) << __func__
517 << " Error while registering osdmon caches with pcm."
518 << " Cache auto tuning not enabled."
519 << dendl;
520 mon_memory_autotune = false;
521 } else {
522 mon_memory_autotune = true;
523 }
524 }
525}
526
527int OSDMonitor::_update_mon_cache_settings()
528{
529 if (g_conf()->mon_memory_target <= 0 ||
530 g_conf()->mon_memory_target < mon_memory_min ||
531 g_conf()->rocksdb_cache_size <= 0) {
532 return -EINVAL;
533 }
534
92f5a8d4
TL
535 if (pcm == nullptr && rocksdb_binned_kv_cache == nullptr) {
536 derr << __func__ << " not using pcm and rocksdb" << dendl;
537 return -EINVAL;
538 }
539
eafe8130
TL
540 uint64_t old_mon_memory_target = mon_memory_target;
541 uint64_t old_rocksdb_cache_size = rocksdb_cache_size;
542
543 // Set the new pcm memory cache sizes
544 mon_memory_target = g_conf()->mon_memory_target;
545 rocksdb_cache_size = g_conf()->rocksdb_cache_size;
546
547 uint64_t base = mon_memory_base;
548 double fragmentation = mon_memory_fragmentation;
549 uint64_t target = mon_memory_target;
550 uint64_t min = mon_memory_min;
551 uint64_t max = min;
552
553 uint64_t ltarget = (1.0 - fragmentation) * target;
554 if (ltarget > base + min) {
555 max = ltarget - base;
556 }
557
558 int r = _set_cache_ratios();
559 if (r < 0) {
560 derr << __func__ << " Cache ratios for pcm could not be set."
561 << " Review the kv (rocksdb) and mon_memory_target sizes."
562 << dendl;
563 mon_memory_target = old_mon_memory_target;
564 rocksdb_cache_size = old_rocksdb_cache_size;
565 return -EINVAL;
566 }
567
568 if (mon_memory_autotune && pcm != nullptr) {
569 std::lock_guard l(balancer_lock);
570 // set pcm cache levels
571 pcm->set_target_memory(target);
572 pcm->set_min_memory(min);
573 pcm->set_max_memory(max);
574 // tune memory based on new values
575 pcm->tune_memory();
576 pcm->balance();
577 _set_new_cache_sizes();
92f5a8d4 578 dout(1) << __func__ << " Updated mon cache setting."
eafe8130
TL
579 << " target: " << target
580 << " min: " << min
581 << " max: " << max
582 << dendl;
583 }
584 return 0;
585}
586
587int OSDMonitor::_set_cache_sizes()
588{
589 if (g_conf()->mon_memory_autotune) {
590 // set the new osdmon cache targets to be managed by pcm
591 mon_osd_cache_size = g_conf()->mon_osd_cache_size;
592 rocksdb_cache_size = g_conf()->rocksdb_cache_size;
593 mon_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
594 mon_memory_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
595 mon_memory_target = g_conf()->mon_memory_target;
596 mon_memory_min = g_conf()->mon_osd_cache_size_min;
597 if (mon_memory_target <= 0 || mon_memory_min <= 0) {
598 derr << __func__ << " mon_memory_target:" << mon_memory_target
599 << " mon_memory_min:" << mon_memory_min
600 << ". Invalid size option(s) provided."
601 << dendl;
602 return -EINVAL;
603 }
604 // Set the initial inc and full LRU cache sizes
605 inc_osd_cache.set_bytes(mon_memory_min);
606 full_osd_cache.set_bytes(mon_memory_min);
607 mon_memory_autotune = g_conf()->mon_memory_autotune;
608 }
609 return 0;
610}
7c673cae
FG
611
612bool OSDMonitor::_have_pending_crush()
613{
614 return pending_inc.crush.length() > 0;
615}
616
617CrushWrapper &OSDMonitor::_get_stable_crush()
618{
619 return *osdmap.crush;
620}
621
622void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush)
623{
624 bufferlist bl;
625 if (pending_inc.crush.length())
626 bl = pending_inc.crush;
627 else
628 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
629
11fdf7f2 630 auto p = bl.cbegin();
7c673cae
FG
631 newcrush.decode(p);
632}
633
634void OSDMonitor::create_initial()
635{
f67539c2 636 dout(10) << "create_initial for " << mon.monmap->fsid << dendl;
7c673cae
FG
637
638 OSDMap newmap;
639
640 bufferlist bl;
f67539c2 641 mon.store->get("mkfs", "osdmap", bl);
7c673cae
FG
642
643 if (bl.length()) {
644 newmap.decode(bl);
f67539c2 645 newmap.set_fsid(mon.monmap->fsid);
7c673cae 646 } else {
f67539c2 647 newmap.build_simple(cct, 0, mon.monmap->fsid, 0);
7c673cae
FG
648 }
649 newmap.set_epoch(1);
650 newmap.created = newmap.modified = ceph_clock_now();
651
652 // new clusters should sort bitwise by default.
653 newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);
654
11fdf7f2
TL
655 newmap.flags |=
656 CEPH_OSDMAP_RECOVERY_DELETES |
657 CEPH_OSDMAP_PURGED_SNAPDIRS |
658 CEPH_OSDMAP_PGLOG_HARDLIMIT;
659 newmap.full_ratio = g_conf()->mon_osd_full_ratio;
660 if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
661 newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
662 if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
663 newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
664 if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;
665
7c673cae 666 // new cluster should require latest by default
f67539c2
TL
667 if (g_conf().get_val<bool>("mon_debug_no_require_pacific")) {
668 if (g_conf().get_val<bool>("mon_debug_no_require_octopus")) {
669 derr << __func__ << " mon_debug_no_require_pacific and octopus=true" << dendl;
9f95a23c 670 newmap.require_osd_release = ceph_release_t::nautilus;
f67539c2
TL
671 } else {
672 derr << __func__ << " mon_debug_no_require_pacific=true" << dendl;
673 newmap.require_osd_release = ceph_release_t::octopus;
11fdf7f2 674 }
31f18b77 675 } else {
f67539c2
TL
676 newmap.require_osd_release = ceph_release_t::pacific;
677 }
678
679 if (newmap.require_osd_release >= ceph_release_t::octopus) {
9f95a23c
TL
680 ceph_release_t r = ceph_release_from_name(
681 g_conf()->mon_osd_initial_require_min_compat_client);
682 if (!r) {
11fdf7f2 683 ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
31f18b77
FG
684 }
685 newmap.require_min_compat_client = r;
7c673cae
FG
686 }
687
688 // encode into pending incremental
28e407b8 689 uint64_t features = newmap.get_encoding_features();
7c673cae 690 newmap.encode(pending_inc.fullmap,
28e407b8 691 features | CEPH_FEATURE_RESERVED);
7c673cae
FG
692 pending_inc.full_crc = newmap.get_crc();
693 dout(20) << " full crc " << pending_inc.full_crc << dendl;
694}
695
11fdf7f2 696void OSDMonitor::get_store_prefixes(std::set<string>& s) const
7c673cae
FG
697{
698 s.insert(service_name);
699 s.insert(OSD_PG_CREATING_PREFIX);
3efd9988 700 s.insert(OSD_METADATA_PREFIX);
11fdf7f2 701 s.insert(OSD_SNAP_PREFIX);
7c673cae
FG
702}
703
704void OSDMonitor::update_from_paxos(bool *need_bootstrap)
705{
11fdf7f2
TL
706 // we really don't care if the version has been updated, because we may
707 // have trimmed without having increased the last committed; yet, we may
708 // need to update the in-memory manifest.
709 load_osdmap_manifest();
710
7c673cae
FG
711 version_t version = get_last_committed();
712 if (version == osdmap.epoch)
713 return;
11fdf7f2 714 ceph_assert(version > osdmap.epoch);
7c673cae
FG
715
716 dout(15) << "update_from_paxos paxos e " << version
717 << ", my e " << osdmap.epoch << dendl;
718
f67539c2
TL
719 int prev_num_up_osd = osdmap.num_up_osd;
720
31f18b77
FG
721 if (mapping_job) {
722 if (!mapping_job->is_done()) {
723 dout(1) << __func__ << " mapping job "
724 << mapping_job.get() << " did not complete, "
725 << mapping_job->shards << " left, canceling" << dendl;
726 mapping_job->abort();
727 }
728 mapping_job.reset();
729 }
7c673cae 730
224ce89b
WB
731 load_health();
732
7c673cae
FG
733 /*
734 * We will possibly have a stashed latest that *we* wrote, and we will
735 * always be sure to have the oldest full map in the first..last range
736 * due to encode_trim_extra(), which includes the oldest full map in the trim
737 * transaction.
738 *
739 * encode_trim_extra() does not however write the full map's
740 * version to 'full_latest'. This is only done when we are building the
741 * full maps from the incremental versions. But don't panic! We make sure
742 * that the following conditions find whichever full map version is newer.
743 */
744 version_t latest_full = get_version_latest_full();
745 if (latest_full == 0 && get_first_committed() > 1)
746 latest_full = get_first_committed();
747
748 if (get_first_committed() > 1 &&
749 latest_full < get_first_committed()) {
750 // the monitor could be just sync'ed with its peer, and the latest_full key
751 // is not encoded in the paxos commits in encode_pending(), so we need to
752 // make sure we get it pointing to a proper version.
753 version_t lc = get_last_committed();
754 version_t fc = get_first_committed();
755
756 dout(10) << __func__ << " looking for valid full map in interval"
757 << " [" << fc << ", " << lc << "]" << dendl;
758
759 latest_full = 0;
760 for (version_t v = lc; v >= fc; v--) {
761 string full_key = "full_" + stringify(v);
f67539c2 762 if (mon.store->exists(get_service_name(), full_key)) {
7c673cae
FG
763 dout(10) << __func__ << " found latest full map v " << v << dendl;
764 latest_full = v;
765 break;
766 }
767 }
768
11fdf7f2 769 ceph_assert(latest_full > 0);
7c673cae
FG
770 auto t(std::make_shared<MonitorDBStore::Transaction>());
771 put_version_latest_full(t, latest_full);
f67539c2 772 mon.store->apply_transaction(t);
7c673cae
FG
773 dout(10) << __func__ << " updated the on-disk full map version to "
774 << latest_full << dendl;
775 }
776
777 if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
778 bufferlist latest_bl;
779 get_version_full(latest_full, latest_bl);
11fdf7f2 780 ceph_assert(latest_bl.length() != 0);
7c673cae 781 dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
11fdf7f2 782 osdmap = OSDMap();
7c673cae
FG
783 osdmap.decode(latest_bl);
784 }
785
11fdf7f2 786 bufferlist bl;
f67539c2 787 if (!mon.store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
11fdf7f2
TL
788 auto p = bl.cbegin();
789 std::lock_guard<std::mutex> l(creating_pgs_lock);
790 creating_pgs.decode(p);
791 dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
792 << creating_pgs.last_scan_epoch
793 << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
31f18b77 794 } else {
11fdf7f2
TL
795 dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
796 << dendl;
31f18b77
FG
797 }
798
7c673cae
FG
799 // walk through incrementals
800 MonitorDBStore::TransactionRef t;
801 size_t tx_size = 0;
802 while (version > osdmap.epoch) {
803 bufferlist inc_bl;
804 int err = get_version(osdmap.epoch+1, inc_bl);
11fdf7f2
TL
805 ceph_assert(err == 0);
806 ceph_assert(inc_bl.length());
eafe8130
TL
807 // set priority cache manager levels if the osdmap is
808 // being populated for the first time.
809 if (mon_memory_autotune && pcm == nullptr) {
810 int r = register_cache_with_pcm();
811 if (r < 0) {
812 dout(10) << __func__
813 << " Error while registering osdmon caches with pcm."
814 << " Proceeding without cache auto tuning."
815 << dendl;
816 }
817 }
7c673cae
FG
818
819 dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
820 << dendl;
821 OSDMap::Incremental inc(inc_bl);
822 err = osdmap.apply_incremental(inc);
11fdf7f2 823 ceph_assert(err == 0);
7c673cae
FG
824
825 if (!t)
826 t.reset(new MonitorDBStore::Transaction);
827
828 // Write out the full map for all past epochs. Encode the full
829 // map with the same features as the incremental. If we don't
830 // know, use the quorum features. If we don't know those either,
831 // encode with all features.
832 uint64_t f = inc.encode_features;
833 if (!f)
f67539c2 834 f = mon.get_quorum_con_features();
7c673cae
FG
835 if (!f)
836 f = -1;
837 bufferlist full_bl;
838 osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
839 tx_size += full_bl.length();
840
841 bufferlist orig_full_bl;
842 get_version_full(osdmap.epoch, orig_full_bl);
843 if (orig_full_bl.length()) {
844 // the primary provided the full map
11fdf7f2 845 ceph_assert(inc.have_crc);
7c673cae
FG
846 if (inc.full_crc != osdmap.crc) {
847 // This will happen if the mons were running mixed versions in
848 // the past or some other circumstance made the full encoded
849 // maps divergent. Reloading here will bring us back into
850 // sync with the primary for this and all future maps. OSDs
851 // will also be brought back into sync when they discover the
852 // crc mismatch and request a full map from a mon.
853 derr << __func__ << " full map CRC mismatch, resetting to canonical"
854 << dendl;
11fdf7f2
TL
855
856 dout(20) << __func__ << " my (bad) full osdmap:\n";
857 JSONFormatter jf(true);
858 jf.dump_object("osdmap", osdmap);
859 jf.flush(*_dout);
860 *_dout << "\nhexdump:\n";
861 full_bl.hexdump(*_dout);
862 *_dout << dendl;
863
7c673cae
FG
864 osdmap = OSDMap();
865 osdmap.decode(orig_full_bl);
11fdf7f2
TL
866
867 dout(20) << __func__ << " canonical full osdmap:\n";
868 JSONFormatter jf(true);
869 jf.dump_object("osdmap", osdmap);
870 jf.flush(*_dout);
871 *_dout << "\nhexdump:\n";
872 orig_full_bl.hexdump(*_dout);
873 *_dout << dendl;
7c673cae
FG
874 }
875 } else {
11fdf7f2 876 ceph_assert(!inc.have_crc);
7c673cae
FG
877 put_version_full(t, osdmap.epoch, full_bl);
878 }
879 put_version_latest_full(t, osdmap.epoch);
880
881 // share
882 dout(1) << osdmap << dendl;
883
884 if (osdmap.epoch == 1) {
885 t->erase("mkfs", "osdmap");
886 }
887
11fdf7f2 888 if (tx_size > g_conf()->mon_sync_max_payload_size*2) {
f67539c2 889 mon.store->apply_transaction(t);
7c673cae
FG
890 t = MonitorDBStore::TransactionRef();
891 tx_size = 0;
892 }
f67539c2
TL
893 for (const auto [osd, state] : inc.new_state) {
894 if (state & CEPH_OSD_UP) {
11fdf7f2 895 // could be marked up *or* down, but we're too lazy to check which
f67539c2 896 last_osd_report.erase(osd);
11fdf7f2 897 }
f67539c2
TL
898 if (state & CEPH_OSD_OUT) {
899 // could be marked in *or* out, but we can safely drop it
900 osd_epochs.erase(osd);
901 }
902 }
903 for (const auto [osd, weight] : inc.new_weight) {
904 if (weight == CEPH_OSD_OUT) {
905 // manually marked out, so drop it
906 osd_epochs.erase(osd);
7c673cae
FG
907 }
908 }
909 }
910
911 if (t) {
f67539c2 912 mon.store->apply_transaction(t);
7c673cae
FG
913 }
914
f67539c2 915 bool marked_osd_down = false;
7c673cae
FG
916 for (int o = 0; o < osdmap.get_max_osd(); o++) {
917 if (osdmap.is_out(o))
918 continue;
919 auto found = down_pending_out.find(o);
920 if (osdmap.is_down(o)) {
921 // populate down -> out map
922 if (found == down_pending_out.end()) {
923 dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
924 down_pending_out[o] = ceph_clock_now();
f67539c2 925 marked_osd_down = true;
7c673cae
FG
926 }
927 } else {
928 if (found != down_pending_out.end()) {
929 dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
930 down_pending_out.erase(found);
931 }
932 }
933 }
934 // XXX: need to trim MonSession connected with a osd whose id > max_osd?
935
7c673cae
FG
936 check_osdmap_subs();
937 check_pg_creates_subs();
938
939 share_map_with_random_osd();
940 update_logger();
7c673cae
FG
941 process_failures();
942
943 // make sure our feature bits reflect the latest map
944 update_msgr_features();
945
f67539c2 946 if (!mon.is_leader()) {
7c673cae
FG
947 // will be called by on_active() on the leader, avoid doing so twice
948 start_mapping();
949 }
f67539c2
TL
950 if (osdmap.stretch_mode_enabled) {
951 dout(20) << "Stretch mode enabled in this map" << dendl;
b3b6e05e 952 mon.try_engage_stretch_mode();
f67539c2
TL
953 if (osdmap.degraded_stretch_mode) {
954 dout(20) << "Degraded stretch mode set in this map" << dendl;
955 if (!osdmap.recovering_stretch_mode) {
956 mon.set_degraded_stretch_mode();
957 if (prev_num_up_osd < osdmap.num_up_osd &&
958 (osdmap.num_up_osd / (double)osdmap.num_osd) >
959 cct->_conf.get_val<double>("mon_stretch_cluster_recovery_ratio")) {
960 // TODO: This works for 2-site clusters when the OSD maps are appropriately
961 // trimmed and everything is "normal" but not if you have a lot of out OSDs
962 // you're ignoring or in some really degenerate failure cases
963 dout(10) << "Enabling recovery stretch mode in this map" << dendl;
964 mon.go_recovery_stretch_mode();
965 }
b3b6e05e
TL
966 } else {
967 mon.set_recovery_stretch_mode();
f67539c2 968 }
b3b6e05e
TL
969 } else {
970 mon.set_healthy_stretch_mode();
f67539c2
TL
971 }
972 if (marked_osd_down &&
973 (!osdmap.degraded_stretch_mode || osdmap.recovering_stretch_mode)) {
974 dout(20) << "Checking degraded stretch mode due to osd changes" << dendl;
975 mon.maybe_go_degraded_stretch_mode();
976 }
f67539c2 977 }
7c673cae
FG
978}
979
eafe8130
TL
// Register the monitor's osdmap caches (inc + full) and the rocksdb
// binned KV cache with a shared PriorityCache::Manager so their sizes
// are auto-tuned within the configured memory budget.
//
// Returns 0 on success, or -EINVAL when the configured sizes are
// invalid, the backing store is not rocksdb, or the cache ratios
// could not be computed.
int OSDMonitor::register_cache_with_pcm()
{
  // Both a target and a floor must be configured for tuning to make sense.
  if (mon_memory_target <= 0 || mon_memory_min <= 0) {
    derr << __func__ << " Invalid memory size specified for mon caches."
         << " Caches will not be auto-tuned."
         << dendl;
    return -EINVAL;
  }
  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  // For calculating total target memory, consider rocksdb cache size.
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;  // fallback if the target leaves no headroom

  // Apply the same logic as in bluestore to set the max amount
  // of memory to use for cache. Assume base memory for OSDMaps
  // and then add in some overhead for fragmentation.
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  rocksdb_binned_kv_cache = mon.store->get_priority_cache();
  if (!rocksdb_binned_kv_cache) {
    // Only rocksdb exposes a priority cache we can tune.
    derr << __func__ << " not using rocksdb" << dendl;
    return -EINVAL;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    return -EINVAL;
  }

  // Hand all three caches to the manager; it balances them against
  // the [min, max] envelope computed above.
  pcm = std::make_shared<PriorityCache::Manager>(
      cct, min, max, target, true);
  pcm->insert("kv", rocksdb_binned_kv_cache, true);
  pcm->insert("inc", inc_cache, true);
  pcm->insert("full", full_cache, true);
  dout(1) << __func__ << " pcm target: " << target
          << " pcm max: " << max
          << " pcm min: " << min
          << " inc_osd_cache size: " << inc_osd_cache.get_size()
          << dendl;
  return 0;
}
1029
// Split the cache memory budget between the kv (rocksdb) cache and the
// two osdmap caches: kv gets rocksdb_cache_size/mon_memory_target of
// the budget, and the remainder is divided evenly between the inc and
// full caches.
//
// Returns 0 on success; -EINVAL (restoring the previous kv ratio) if
// the kv share alone would consume the whole budget.
int OSDMonitor::_set_cache_ratios()
{
  // Remember the old value so we can roll back on error.
  double old_cache_kv_ratio = cache_kv_ratio;

  // Set the cache ratios for kv(rocksdb), inc and full caches
  cache_kv_ratio = (double)rocksdb_cache_size / (double)mon_memory_target;
  if (cache_kv_ratio >= 1.0) {
    derr << __func__ << " Cache kv ratio (" << cache_kv_ratio
         << ") must be in range [0,<1.0]."
         << dendl;
    cache_kv_ratio = old_cache_kv_ratio;
    return -EINVAL;
  }
  rocksdb_binned_kv_cache->set_cache_ratio(cache_kv_ratio);
  // Whatever kv does not take is split 50/50 between inc and full.
  cache_inc_ratio = cache_full_ratio = (1.0 - cache_kv_ratio) / 2;
  inc_cache->set_cache_ratio(cache_inc_ratio);
  full_cache->set_cache_ratio(cache_full_ratio);

  dout(1) << __func__ << " kv ratio " << cache_kv_ratio
          << " inc ratio " << cache_inc_ratio
          << " full ratio " << cache_full_ratio
          << dendl;
  return 0;
}
1054
7c673cae
FG
1055void OSDMonitor::start_mapping()
1056{
1057 // initiate mapping job
1058 if (mapping_job) {
1059 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
1060 << dendl;
1061 mapping_job->abort();
1062 }
224ce89b
WB
1063 if (!osdmap.get_pools().empty()) {
1064 auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
1065 mapping_job = mapping.start_update(osdmap, mapper,
11fdf7f2 1066 g_conf()->mon_osd_mapping_pgs_per_chunk);
224ce89b
WB
1067 dout(10) << __func__ << " started mapping job " << mapping_job.get()
1068 << " at " << fin->start << dendl;
1069 mapping_job->set_finish_event(fin);
1070 } else {
1071 dout(10) << __func__ << " no pools, no mapping job" << dendl;
1072 mapping_job = nullptr;
1073 }
7c673cae
FG
1074}
1075
1076void OSDMonitor::update_msgr_features()
1077{
f67539c2
TL
1078 const int types[] = {
1079 entity_name_t::TYPE_OSD,
1080 entity_name_t::TYPE_CLIENT,
1081 entity_name_t::TYPE_MDS,
1082 entity_name_t::TYPE_MON
1083 };
1084 for (int type : types) {
7c673cae 1085 uint64_t mask;
f67539c2
TL
1086 uint64_t features = osdmap.get_features(type, &mask);
1087 if ((mon.messenger->get_policy(type).features_required & mask) != features) {
7c673cae 1088 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
f67539c2 1089 ceph::net::Policy p = mon.messenger->get_policy(type);
7c673cae 1090 p.features_required = (p.features_required & ~mask) | features;
f67539c2 1091 mon.messenger->set_policy(type, p);
7c673cae
FG
1092 }
1093 }
1094}
1095
1096void OSDMonitor::on_active()
1097{
1098 update_logger();
1099
f67539c2
TL
1100 if (mon.is_leader()) {
1101 mon.clog->debug() << "osdmap " << osdmap;
81eedcae
TL
1102 if (!priority_convert) {
1103 // Only do this once at start-up
1104 convert_pool_priorities();
1105 priority_convert = true;
1106 }
7c673cae
FG
1107 } else {
1108 list<MonOpRequestRef> ls;
1109 take_all_failures(ls);
1110 while (!ls.empty()) {
1111 MonOpRequestRef op = ls.front();
1112 op->mark_osdmon_event(__func__);
1113 dispatch(op);
1114 ls.pop_front();
1115 }
1116 }
1117 start_mapping();
1118}
1119
// On restart, forget when each OSD last reported in so stale
// timestamps are not carried into the new incarnation.
void OSDMonitor::on_restart()
{
  last_osd_report.clear();
}
1124
// Tear down transient state on shutdown: cancel any in-flight parallel
// mapping computation and drop queued failure reports/waiters.
void OSDMonitor::on_shutdown()
{
  dout(10) << __func__ << dendl;
  if (mapping_job) {
    dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
             << dendl;
    mapping_job->abort();
  }

  // discard failure info, waiters
  list<MonOpRequestRef> ls;
  take_all_failures(ls);
  ls.clear();
}
1139
// Publish the current osdmap's OSD counts (total/up/in) and epoch to
// the cluster-wide perf counters.
void OSDMonitor::update_logger()
{
  dout(10) << "update_logger" << dendl;

  mon.cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
  mon.cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
  mon.cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
  mon.cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
}
1149
7c673cae
FG
// Begin a new pending incremental at osdmap.epoch+1: reset the
// per-epoch pending state, backfill sane full/backfillfull/nearfull
// ratios if the current map has none, and rewrite legacy CRUSH
// "ruleset" ids into real rule ids.
void OSDMonitor::create_pending()
{
  pending_inc = OSDMap::Incremental(osdmap.epoch+1);
  pending_inc.fsid = mon.monmap->fsid;
  pending_metadata.clear();
  pending_metadata_rm.clear();
  pending_pseudo_purged_snaps.clear();

  dout(10) << "create_pending e " << pending_inc.epoch << dendl;

  // safety checks (this shouldn't really happen)
  {
    if (osdmap.backfillfull_ratio <= 0) {
      pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
      // config may express the ratio as a percentage; normalize to [0,1]
      if (pending_inc.new_backfillfull_ratio > 1.0)
        pending_inc.new_backfillfull_ratio /= 100;
      dout(1) << __func__ << " setting backfillfull_ratio = "
              << pending_inc.new_backfillfull_ratio << dendl;
    }
    if (osdmap.full_ratio <= 0) {
      pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio;
      if (pending_inc.new_full_ratio > 1.0)
        pending_inc.new_full_ratio /= 100;
      dout(1) << __func__ << " setting full_ratio = "
              << pending_inc.new_full_ratio << dendl;
    }
    if (osdmap.nearfull_ratio <= 0) {
      pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
      if (pending_inc.new_nearfull_ratio > 1.0)
        pending_inc.new_nearfull_ratio /= 100;
      dout(1) << __func__ << " setting nearfull_ratio = "
              << pending_inc.new_nearfull_ratio << dendl;
    }
  }

  // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
  // structure.
  if (osdmap.crush->has_legacy_rule_ids()) {
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    // First, for all pools, work out which rule they really used
    // by resolving ruleset to rule.
    for (const auto &i : osdmap.get_pools()) {
      const auto pool_id = i.first;
      const auto &pool = i.second;
      int new_rule_id = newcrush.find_rule(pool.crush_rule,
                                           pool.type, pool.size);

      dout(1) << __func__ << " rewriting pool "
              << osdmap.get_pool_name(pool_id) << " crush ruleset "
              << pool.crush_rule << " -> rule id " << new_rule_id << dendl;
      if (pending_inc.new_pools.count(pool_id) == 0) {
        // load the original pool info before modifying it
        pending_inc.new_pools[pool_id] = pool;
      }
      pending_inc.new_pools[pool_id].crush_rule = new_rule_id;
    }

    // Now, go ahead and renumber all the rules so that their
    // rule_id field corresponds to their position in the array
    auto old_to_new = newcrush.renumber_rules();
    dout(1) << __func__ << " Rewrote " << old_to_new << " crush IDs:" << dendl;
    for (const auto &i : old_to_new) {
      dout(1) << __func__ << " " << i.first << " -> " << i.second << dendl;
    }
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
  }
}
1219
// Take a snapshot of creating_pgs and advance it for the incremental
// 'inc' that is about to be committed:
//  - scan for pools added/removed since the last scan epoch,
//  - drop pgs already reported created, and pgs absent from 'nextmap',
//  - move queued pg ranges into the active creating set, capped at
//    mon_osd_max_creating_pgs,
//  - (octopus+ quorum only) roll each creating pg's history and
//    past_intervals forward, mirroring PG::start_peering_interval().
// Returns the updated snapshot for the caller to persist/encode.
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
                               const OSDMap& nextmap)
{
  dout(10) << __func__ << dendl;
  creating_pgs_t pending_creatings;
  {
    // snapshot under the lock; we work on the copy from here on
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    unsigned queued = 0;
    queued += scan_for_creating_pgs(osdmap.get_pools(),
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    queued += scan_for_creating_pgs(inc.new_pools,
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
               << " pg removed because containing pool deleted: "
               << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
             << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // filter out any pgs that shouldn't exist.
  {
    auto i = pending_creatings.pgs.begin();
    while (i != pending_creatings.pgs.end()) {
      if (!nextmap.pg_exists(i->first)) {
        dout(10) << __func__ << " removing pg " << i->first
                 << " which should not exist" << dendl;
        i = pending_creatings.pgs.erase(i);
      } else {
        ++i;
      }
    }
  }

  // process queue
  unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  // pull queued pg ranges into the active creating set until we hit the cap
  while (pending_creatings.pgs.size() < max &&
         !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
             << " created " << p->second.created
             << " modified " << p->second.modified
             << " [" << p->second.start << "-" << p->second.end << ")"
             << dendl;
    int64_t n = std::min<int64_t>(max - pending_creatings.pgs.size(),
                                  p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(
        pgid,
        creating_pgs_t::pg_create_info(inc.epoch,
                                       p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
               << " now [" << p->second.start << "-" << p->second.end << ")"
               << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
           << " pools" << dendl;

  if (mon.monmap->min_mon_release >= ceph_release_t::octopus) {
    // walk creating pgs' history and past_intervals forward
    for (auto& i : pending_creatings.pgs) {
      // this mirrors PG::start_peering_interval()
      pg_t pgid = i.first;

      // this is a bit imprecise, but sufficient?
      struct min_size_predicate_t : public IsPGRecoverablePredicate {
        const pg_pool_t *pi;
        bool operator()(const set<pg_shard_t> &have) const {
          return have.size() >= pi->min_size;
        }
        explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
      } min_size_predicate(nextmap.get_pg_pool(pgid.pool()));

      vector<int> up, acting;
      int up_primary, acting_primary;
      nextmap.pg_to_up_acting_osds(
        pgid, &up, &up_primary, &acting, &acting_primary);
      if (i.second.history.epoch_created == 0) {
        // new pg entry, set it up
        i.second.up = up;
        i.second.acting = acting;
        i.second.up_primary = up_primary;
        i.second.acting_primary = acting_primary;
        i.second.history = pg_history_t(i.second.create_epoch,
                                        i.second.create_stamp);
        dout(10) << __func__ << " pg " << pgid << " just added, "
                 << " up " << i.second.up
                 << " p " << i.second.up_primary
                 << " acting " << i.second.acting
                 << " p " << i.second.acting_primary
                 << " history " << i.second.history
                 << " past_intervals " << i.second.past_intervals
                 << dendl;
      } else {
        std::stringstream debug;
        // returns true when a new interval starts; update history then
        if (PastIntervals::check_new_interval(
              i.second.acting_primary, acting_primary,
              i.second.acting, acting,
              i.second.up_primary, up_primary,
              i.second.up, up,
              i.second.history.same_interval_since,
              i.second.history.last_epoch_clean,
              &nextmap,
              &osdmap,
              pgid,
              min_size_predicate,
              &i.second.past_intervals,
              &debug)) {
          epoch_t e = inc.epoch;
          i.second.history.same_interval_since = e;
          if (i.second.up != up) {
            i.second.history.same_up_since = e;
          }
          if (i.second.acting_primary != acting_primary) {
            i.second.history.same_primary_since = e;
          }
          if (pgid.is_split(
                osdmap.get_pg_num(pgid.pool()),
                nextmap.get_pg_num(pgid.pool()),
                nullptr)) {
            i.second.history.last_epoch_split = e;
          }
          dout(10) << __func__ << " pg " << pgid << " new interval,"
                   << " up " << i.second.up << " -> " << up
                   << " p " << i.second.up_primary << " -> " << up_primary
                   << " acting " << i.second.acting << " -> " << acting
                   << " p " << i.second.acting_primary << " -> "
                   << acting_primary
                   << " history " << i.second.history
                   << " past_intervals " << i.second.past_intervals
                   << dendl;
          dout(20) << " debug: " << debug.str() << dendl;
          i.second.up = up;
          i.second.acting = acting;
          i.second.up_primary = up_primary;
          i.second.acting_primary = acting_primary;
        }
      }
    }
  }
  dout(10) << __func__
           << " " << (pending_creatings.pgs.size() - total)
           << "/" << pending_creatings.pgs.size()
           << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
1406
// Prime pg_temp entries in pending_inc for pgs whose mapping will
// change in the next epoch.  Depending on how much changed we either
// recompute every pg in parallel ("all": new crush map, newly-up osds,
// weight increases, or a large estimate) or walk only the pgs touching
// "interesting" osds (marked down, or weight-reduced), bounded by
// mon_osd_prime_pg_temp_max_time.
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  if (pending_inc.crush.length()) {
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  // osds whose UP bit is toggling while currently up (i.e. going down)
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    if ((p->second & CEPH_OSD_UP) &&
        osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  for (auto p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (osdmap.exists(p->first) && p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
               << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;

  if (!all) {
    // rough cost estimate: pgs on the first osd times the osd count;
    // if that is a large fraction of all pgs, a full pass is cheaper
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
        g_conf()->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds >= "
               << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total "
               << mapping.get_num_pgs() << " pgs, all"
               << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds" << dendl;
    }
  }

  // materialize the next epoch's map so we can compare mappings
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    // parallel full pass, bounded by mon_osd_prime_pg_temp_max_time
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk, {});
    if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
               << g_conf()->mon_osd_prime_pg_temp_max_time
               << ", stopping" << dendl;
      job.abort();
    }
  } else {
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf()->mon_osd_prime_pg_temp_max_time;
    const int chunk = 1000;  // check the clock every 'chunk' pgs
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
        if (!did_pgs.insert(pgid).second) {
          continue;  // already handled via another osd
        }
        prime_pg_temp(next, pgid);
        if (--n <= 0) {
          n = chunk;
          if (ceph_clock_now() > stop) {
            dout(10) << __func__ << " consumed more than "
                     << g_conf()->mon_osd_prime_pg_temp_max_time
                     << " seconds, stopping"
                     << dendl;
            return;
          }
        }
      }
    }
  }
}
1509
// Consider one pg for pg_temp priming against the candidate map
// 'next': if its acting set would change, record the *current* acting
// set as a pg_temp in pending_inc (or an empty set to clear pg_temp
// when next_up == next_acting).  Skips pgs that are still being
// created, do not exist yet, or where priming cannot help.
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  // TODO: remove this creating_pgs direct access?
  if (creating_pgs.pgs.count(pgid)) {
    return;
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  // current-epoch mapping (from the background mapping job)
  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  // next-epoch mapping
  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
                            &next_acting, &next_acting_primary);
  if (acting == next_acting &&
      !(up != acting && next_up == next_acting))
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  if (next_up == next_acting) {
    // an empty pg_temp entry clears any existing pg_temp mapping
    acting.clear();
    dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
             << dendl;
  }

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
           << " -> " << next_up << "/" << next_acting
           << ", priming " << acting
           << dendl;
  {
    std::lock_guard l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    // (map::emplace is a no-op when the key already exists)
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
1557
1558/**
1559 * @note receiving a transaction in this function gives a fair amount of
1560 * freedom to the service implementation if it does need it. It shouldn't.
1561 */
1562void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
1563{
1564 dout(10) << "encode_pending e " << pending_inc.epoch
1565 << dendl;
1566
11fdf7f2
TL
1567 if (do_prune(t)) {
1568 dout(1) << __func__ << " osdmap full prune encoded e"
1569 << pending_inc.epoch << dendl;
1570 }
1571
7c673cae
FG
1572 // finalize up pending_inc
1573 pending_inc.modified = ceph_clock_now();
1574
f67539c2 1575 int r = pending_inc.propagate_base_properties_to_tiers(cct, osdmap);
11fdf7f2 1576 ceph_assert(r == 0);
7c673cae
FG
1577
1578 if (mapping_job) {
1579 if (!mapping_job->is_done()) {
1580 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1581 << mapping_job.get() << " did not complete, "
1582 << mapping_job->shards << " left" << dendl;
1583 mapping_job->abort();
1584 } else if (mapping.get_epoch() < osdmap.get_epoch()) {
1585 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1586 << mapping_job.get() << " is prior epoch "
1587 << mapping.get_epoch() << dendl;
1588 } else {
11fdf7f2 1589 if (g_conf()->mon_osd_prime_pg_temp) {
7c673cae
FG
1590 maybe_prime_pg_temp();
1591 }
1592 }
11fdf7f2 1593 } else if (g_conf()->mon_osd_prime_pg_temp) {
7c673cae
FG
1594 dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
1595 << dendl;
1596 }
1597 mapping_job.reset();
1598
c07f9fc5
FG
 1599 // ensure we don't have blank new_state updates. these are interpreted as
1600 // CEPH_OSD_UP (and almost certainly not what we want!).
1601 auto p = pending_inc.new_state.begin();
1602 while (p != pending_inc.new_state.end()) {
1603 if (p->second == 0) {
1604 dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
1605 p = pending_inc.new_state.erase(p);
1606 } else {
11fdf7f2
TL
1607 if (p->second & CEPH_OSD_UP) {
1608 pending_inc.new_last_up_change = pending_inc.modified;
1609 }
c07f9fc5
FG
1610 ++p;
1611 }
1612 }
11fdf7f2
TL
1613 if (!pending_inc.new_up_client.empty()) {
1614 pending_inc.new_last_up_change = pending_inc.modified;
1615 }
1616 for (auto& i : pending_inc.new_weight) {
9f95a23c 1617 if (i.first >= osdmap.max_osd) {
11fdf7f2
TL
1618 if (i.second) {
1619 // new osd is already marked in
1620 pending_inc.new_last_in_change = pending_inc.modified;
9f95a23c 1621 break;
11fdf7f2
TL
1622 }
1623 } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
1624 // existing osd marked in or out
1625 pending_inc.new_last_in_change = pending_inc.modified;
9f95a23c 1626 break;
11fdf7f2
TL
1627 }
1628 }
7c673cae
FG
1629
1630 {
1631 OSDMap tmp;
1632 tmp.deepish_copy_from(osdmap);
1633 tmp.apply_incremental(pending_inc);
1634
11fdf7f2
TL
1635 // clean pg_temp mappings
1636 OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);
1637
1638 // clean inappropriate pg_upmap/pg_upmap_items (if any)
494da23a
TL
1639 {
1640 // check every upmapped pg for now
1641 // until we could reliably identify certain cases to ignore,
1642 // which is obviously the hard part TBD..
1643 vector<pg_t> pgs_to_check;
1644 tmp.get_upmap_pgs(&pgs_to_check);
9f95a23c
TL
1645 if (pgs_to_check.size() <
1646 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk * 2)) {
494da23a
TL
1647 // not enough pgs, do it inline
1648 tmp.clean_pg_upmaps(cct, &pending_inc);
1649 } else {
1650 CleanUpmapJob job(cct, tmp, pending_inc);
1651 mapper.queue(&job, g_conf()->mon_clean_pg_upmaps_per_chunk, pgs_to_check);
1652 job.wait();
1653 }
1654 }
11fdf7f2
TL
1655
1656 // update creating pgs first so that we can remove the created pgid and
1657 // process the pool flag removal below in the same osdmap epoch.
1658 auto pending_creatings = update_pending_pgs(pending_inc, tmp);
1659 bufferlist creatings_bl;
9f95a23c 1660 uint64_t features = CEPH_FEATURES_ALL;
f67539c2 1661 if (mon.monmap->min_mon_release < ceph_release_t::octopus) {
9f95a23c
TL
1662 dout(20) << __func__ << " encoding pending pgs without octopus features"
1663 << dendl;
1664 features &= ~CEPH_FEATURE_SERVER_OCTOPUS;
1665 }
1666 encode(pending_creatings, creatings_bl, features);
11fdf7f2
TL
1667 t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
1668
1669 // remove any old (or incompat) POOL_CREATING flags
1670 for (auto& i : tmp.get_pools()) {
9f95a23c 1671 if (tmp.require_osd_release < ceph_release_t::nautilus) {
11fdf7f2
TL
1672 // pre-nautilus OSDMaps shouldn't get this flag.
1673 if (pending_inc.new_pools.count(i.first)) {
1674 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1675 }
1676 }
1677 if (i.second.has_flag(pg_pool_t::FLAG_CREATING) &&
1678 !pending_creatings.still_creating_pool(i.first)) {
1679 dout(10) << __func__ << " done creating pool " << i.first
1680 << ", clearing CREATING flag" << dendl;
1681 if (pending_inc.new_pools.count(i.first) == 0) {
1682 pending_inc.new_pools[i.first] = i.second;
1683 }
1684 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
3efd9988 1685 }
11fdf7f2
TL
1686 }
1687
11fdf7f2
TL
1688 // collect which pools are currently affected by
1689 // the near/backfill/full osd(s),
1690 // and set per-pool near/backfill/full flag instead
1691 set<int64_t> full_pool_ids;
1692 set<int64_t> backfillfull_pool_ids;
1693 set<int64_t> nearfull_pool_ids;
1694 tmp.get_full_pools(cct,
1695 &full_pool_ids,
1696 &backfillfull_pool_ids,
3efd9988 1697 &nearfull_pool_ids);
11fdf7f2
TL
1698 if (full_pool_ids.empty() ||
1699 backfillfull_pool_ids.empty() ||
1700 nearfull_pool_ids.empty()) {
1701 // normal case - no nearfull, backfillfull or full osds
3efd9988
FG
1702 // try cancel any improper nearfull/backfillfull/full pool
1703 // flags first
11fdf7f2
TL
1704 for (auto &pool: tmp.get_pools()) {
1705 auto p = pool.first;
1706 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
1707 nearfull_pool_ids.empty()) {
1708 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1709 << "'s nearfull flag" << dendl;
1710 if (pending_inc.new_pools.count(p) == 0) {
1711 // load original pool info first!
1712 pending_inc.new_pools[p] = pool.second;
1713 }
1714 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1715 }
1716 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
1717 backfillfull_pool_ids.empty()) {
1718 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1719 << "'s backfillfull flag" << dendl;
1720 if (pending_inc.new_pools.count(p) == 0) {
1721 pending_inc.new_pools[p] = pool.second;
1722 }
1723 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1724 }
1725 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
1726 full_pool_ids.empty()) {
1727 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1728 // set by EQUOTA, skipping
1729 continue;
1730 }
1731 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1732 << "'s full flag" << dendl;
1733 if (pending_inc.new_pools.count(p) == 0) {
1734 pending_inc.new_pools[p] = pool.second;
1735 }
1736 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1737 }
3efd9988 1738 }
11fdf7f2
TL
1739 }
1740 if (!full_pool_ids.empty()) {
1741 dout(10) << __func__ << " marking pool(s) " << full_pool_ids
1742 << " as full" << dendl;
1743 for (auto &p: full_pool_ids) {
1744 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
1745 continue;
1746 }
1747 if (pending_inc.new_pools.count(p) == 0) {
1748 pending_inc.new_pools[p] = tmp.pools[p];
1749 }
1750 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
1751 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1752 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1753 }
1754 // cancel FLAG_FULL for pools which are no longer full too
1755 for (auto &pool: tmp.get_pools()) {
1756 auto p = pool.first;
1757 if (full_pool_ids.count(p)) {
1758 // skip pools we have just marked as full above
1759 continue;
1760 }
1761 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
1762 tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1763 // don't touch if currently is not full
1764 // or is running out of quota (and hence considered as full)
1765 continue;
1766 }
1767 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1768 << "'s full flag" << dendl;
1769 if (pending_inc.new_pools.count(p) == 0) {
1770 pending_inc.new_pools[p] = pool.second;
1771 }
1772 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
3efd9988 1773 }
11fdf7f2
TL
1774 }
1775 if (!backfillfull_pool_ids.empty()) {
1776 for (auto &p: backfillfull_pool_ids) {
1777 if (full_pool_ids.count(p)) {
1778 // skip pools we have already considered as full above
1779 continue;
1780 }
1781 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1782 // make sure FLAG_FULL is truly set, so we are safe not
1783 // to set a extra (redundant) FLAG_BACKFILLFULL flag
1784 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1785 continue;
1786 }
1787 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1788 // don't bother if pool is already marked as backfillfull
1789 continue;
1790 }
1791 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1792 << "'s as backfillfull" << dendl;
1793 if (pending_inc.new_pools.count(p) == 0) {
1794 pending_inc.new_pools[p] = tmp.pools[p];
1795 }
1796 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
1797 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1798 }
1799 // cancel FLAG_BACKFILLFULL for pools
1800 // which are no longer backfillfull too
1801 for (auto &pool: tmp.get_pools()) {
1802 auto p = pool.first;
1803 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1804 // skip pools we have just marked as backfillfull/full above
1805 continue;
1806 }
1807 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1808 // and don't touch if currently is not backfillfull
1809 continue;
1810 }
1811 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1812 << "'s backfillfull flag" << dendl;
1813 if (pending_inc.new_pools.count(p) == 0) {
1814 pending_inc.new_pools[p] = pool.second;
1815 }
1816 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
3efd9988 1817 }
11fdf7f2
TL
1818 }
1819 if (!nearfull_pool_ids.empty()) {
1820 for (auto &p: nearfull_pool_ids) {
1821 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1822 continue;
1823 }
1824 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1825 // make sure FLAG_FULL is truly set, so we are safe not
1826 // to set a extra (redundant) FLAG_NEARFULL flag
1827 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1828 continue;
1829 }
1830 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1831 // don't bother if pool is already marked as nearfull
1832 continue;
1833 }
1834 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1835 << "'s as nearfull" << dendl;
1836 if (pending_inc.new_pools.count(p) == 0) {
1837 pending_inc.new_pools[p] = tmp.pools[p];
1838 }
1839 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
1840 }
1841 // cancel FLAG_NEARFULL for pools
1842 // which are no longer nearfull too
1843 for (auto &pool: tmp.get_pools()) {
1844 auto p = pool.first;
1845 if (full_pool_ids.count(p) ||
1846 backfillfull_pool_ids.count(p) ||
1847 nearfull_pool_ids.count(p)) {
1848 // skip pools we have just marked as
1849 // nearfull/backfillfull/full above
1850 continue;
1851 }
1852 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1853 // and don't touch if currently is not nearfull
1854 continue;
1855 }
1856 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1857 << "'s nearfull flag" << dendl;
1858 if (pending_inc.new_pools.count(p) == 0) {
1859 pending_inc.new_pools[p] = pool.second;
1860 }
1861 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
7c673cae 1862 }
11fdf7f2 1863 }
7c673cae 1864
11fdf7f2 1865 // min_compat_client?
9f95a23c 1866 if (!tmp.require_min_compat_client) {
11fdf7f2
TL
1867 auto mv = tmp.get_min_compat_client();
1868 dout(1) << __func__ << " setting require_min_compat_client to currently "
9f95a23c 1869 << "required " << mv << dendl;
f67539c2 1870 mon.clog->info() << "setting require_min_compat_client to currently "
9f95a23c 1871 << "required " << mv;
11fdf7f2
TL
1872 pending_inc.new_require_min_compat_client = mv;
1873 }
1874
9f95a23c
TL
1875 if (osdmap.require_osd_release < ceph_release_t::nautilus &&
1876 tmp.require_osd_release >= ceph_release_t::nautilus) {
11fdf7f2
TL
1877 dout(10) << __func__ << " first nautilus+ epoch" << dendl;
1878 // add creating flags?
1879 for (auto& i : tmp.get_pools()) {
1880 if (pending_creatings.still_creating_pool(i.first)) {
1881 dout(10) << __func__ << " adding CREATING flag to pool " << i.first
1882 << dendl;
1883 if (pending_inc.new_pools.count(i.first) == 0) {
1884 pending_inc.new_pools[i.first] = i.second;
224ce89b 1885 }
11fdf7f2 1886 pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING;
224ce89b 1887 }
11fdf7f2 1888 }
f67539c2
TL
1889 // adjust blocklist items to all be TYPE_ANY
1890 for (auto& i : tmp.blocklist) {
11fdf7f2
TL
1891 auto a = i.first;
1892 a.set_type(entity_addr_t::TYPE_ANY);
f67539c2
TL
1893 pending_inc.new_blocklist[a] = i.second;
1894 pending_inc.old_blocklist.push_back(i.first);
224ce89b 1895 }
7c673cae 1896 }
9f95a23c
TL
1897
1898 if (osdmap.require_osd_release < ceph_release_t::octopus &&
1899 tmp.require_osd_release >= ceph_release_t::octopus) {
1900 dout(10) << __func__ << " first octopus+ epoch" << dendl;
1901
1902 // adjust obsoleted cache modes
1903 for (auto& [poolid, pi] : tmp.pools) {
1904 if (pi.cache_mode == pg_pool_t::CACHEMODE_FORWARD) {
1905 if (pending_inc.new_pools.count(poolid) == 0) {
1906 pending_inc.new_pools[poolid] = pi;
1907 }
1908 dout(10) << __func__ << " switching pool " << poolid
1909 << " cachemode from forward -> proxy" << dendl;
1910 pending_inc.new_pools[poolid].cache_mode = pg_pool_t::CACHEMODE_PROXY;
1911 }
1912 if (pi.cache_mode == pg_pool_t::CACHEMODE_READFORWARD) {
1913 if (pending_inc.new_pools.count(poolid) == 0) {
1914 pending_inc.new_pools[poolid] = pi;
1915 }
1916 dout(10) << __func__ << " switching pool " << poolid
1917 << " cachemode from readforward -> readproxy" << dendl;
1918 pending_inc.new_pools[poolid].cache_mode =
1919 pg_pool_t::CACHEMODE_READPROXY;
1920 }
1921 }
1922
1923 // clear removed_snaps for every pool
1924 for (auto& [poolid, pi] : tmp.pools) {
1925 if (pi.removed_snaps.empty()) {
1926 continue;
1927 }
1928 if (pending_inc.new_pools.count(poolid) == 0) {
1929 pending_inc.new_pools[poolid] = pi;
1930 }
1931 dout(10) << __func__ << " clearing pool " << poolid << " removed_snaps"
1932 << dendl;
1933 pending_inc.new_pools[poolid].removed_snaps.clear();
1934 }
1935
1936 // create a combined purged snap epoch key for all purged snaps
1937 // prior to this epoch, and store it in the current epoch (i.e.,
1938 // the last pre-octopus epoch, just prior to the one we're
1939 // encoding now).
f67539c2 1940 auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
9f95a23c
TL
1941 it->lower_bound("purged_snap_");
1942 map<int64_t,snap_interval_set_t> combined;
1943 while (it->valid()) {
1944 if (it->key().find("purged_snap_") != 0) {
1945 break;
1946 }
1947 string k = it->key();
1948 long long unsigned pool;
1949 int n = sscanf(k.c_str(), "purged_snap_%llu_", &pool);
1950 if (n != 1) {
1951 derr << __func__ << " invalid purged_snaps key '" << k << "'" << dendl;
1952 } else {
1953 bufferlist v = it->value();
1954 auto p = v.cbegin();
1955 snapid_t begin, end;
1956 ceph::decode(begin, p);
1957 ceph::decode(end, p);
1958 combined[pool].insert(begin, end - begin);
1959 }
1960 it->next();
1961 }
1962 if (!combined.empty()) {
1963 string k = make_purged_snap_epoch_key(pending_inc.epoch - 1);
1964 bufferlist v;
1965 ceph::encode(combined, v);
1966 t->put(OSD_SNAP_PREFIX, k, v);
1967 dout(10) << __func__ << " recording pre-octopus purged_snaps in epoch "
1968 << (pending_inc.epoch - 1) << ", " << v.length() << " bytes"
1969 << dendl;
1970 } else {
1971 dout(10) << __func__ << " there were no pre-octopus purged snaps"
1972 << dendl;
1973 }
1974
1975 // clean out the old removed_snap_ and removed_epoch keys
1976 // ('`' is ASCII '_' + 1)
1977 t->erase_range(OSD_SNAP_PREFIX, "removed_snap_", "removed_snap`");
1978 t->erase_range(OSD_SNAP_PREFIX, "removed_epoch_", "removed_epoch`");
1979 }
7c673cae
FG
1980 }
1981
1982 // tell me about it
31f18b77 1983 for (auto i = pending_inc.new_state.begin();
7c673cae
FG
1984 i != pending_inc.new_state.end();
1985 ++i) {
1986 int s = i->second ? i->second : CEPH_OSD_UP;
f6b5b4d7 1987 if (s & CEPH_OSD_UP) {
7c673cae 1988 dout(2) << " osd." << i->first << " DOWN" << dendl;
f6b5b4d7
TL
1989 // Reset laggy parameters if failure interval exceeds a threshold.
1990 const osd_xinfo_t& xi = osdmap.get_xinfo(i->first);
1991 if ((xi.laggy_probability || xi.laggy_interval) && xi.down_stamp.sec()) {
1992 int last_failure_interval = pending_inc.modified.sec() - xi.down_stamp.sec();
1993 if (grace_interval_threshold_exceeded(last_failure_interval)) {
1994 set_default_laggy_params(i->first);
1995 }
1996 }
1997 }
7c673cae
FG
1998 if (s & CEPH_OSD_EXISTS)
1999 dout(2) << " osd." << i->first << " DNE" << dendl;
2000 }
11fdf7f2 2001 for (auto i = pending_inc.new_up_client.begin();
7c673cae
FG
2002 i != pending_inc.new_up_client.end();
2003 ++i) {
2004 //FIXME: insert cluster addresses too
2005 dout(2) << " osd." << i->first << " UP " << i->second << dendl;
2006 }
2007 for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
2008 i != pending_inc.new_weight.end();
2009 ++i) {
2010 if (i->second == CEPH_OSD_OUT) {
2011 dout(2) << " osd." << i->first << " OUT" << dendl;
2012 } else if (i->second == CEPH_OSD_IN) {
2013 dout(2) << " osd." << i->first << " IN" << dendl;
2014 } else {
2015 dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
2016 }
2017 }
2018
2019 // features for osdmap and its incremental
28e407b8 2020 uint64_t features;
7c673cae
FG
2021
2022 // encode full map and determine its crc
2023 OSDMap tmp;
2024 {
2025 tmp.deepish_copy_from(osdmap);
2026 tmp.apply_incremental(pending_inc);
2027
2028 // determine appropriate features
28e407b8
AA
2029 features = tmp.get_encoding_features();
2030 dout(10) << __func__ << " encoding full map with "
9f95a23c 2031 << tmp.require_osd_release
28e407b8
AA
2032 << " features " << features << dendl;
2033
2034 // the features should be a subset of the mon quorum's features!
f67539c2 2035 ceph_assert((features & ~mon.get_quorum_con_features()) == 0);
7c673cae
FG
2036
2037 bufferlist fullbl;
11fdf7f2 2038 encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
7c673cae
FG
2039 pending_inc.full_crc = tmp.get_crc();
2040
2041 // include full map in the txn. note that old monitors will
2042 // overwrite this. new ones will now skip the local full map
2043 // encode and reload from this.
2044 put_version_full(t, pending_inc.epoch, fullbl);
2045 }
2046
2047 // encode
11fdf7f2
TL
2048 ceph_assert(get_last_committed() + 1 == pending_inc.epoch);
2049 bufferlist bl;
2050 encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);
7c673cae
FG
2051
2052 dout(20) << " full_crc " << tmp.get_crc()
2053 << " inc_crc " << pending_inc.inc_crc << dendl;
2054
2055 /* put everything in the transaction */
2056 put_version(t, pending_inc.epoch, bl);
2057 put_last_committed(t, pending_inc.epoch);
2058
2059 // metadata, too!
2060 for (map<int,bufferlist>::iterator p = pending_metadata.begin();
2061 p != pending_metadata.end();
2062 ++p)
2063 t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
2064 for (set<int>::iterator p = pending_metadata_rm.begin();
2065 p != pending_metadata_rm.end();
2066 ++p)
2067 t->erase(OSD_METADATA_PREFIX, stringify(*p));
2068 pending_metadata.clear();
2069 pending_metadata_rm.clear();
2070
9f95a23c
TL
2071 // purged_snaps
2072 if (tmp.require_osd_release >= ceph_release_t::octopus &&
2073 !pending_inc.new_purged_snaps.empty()) {
2074 // all snaps purged this epoch (across all pools)
2075 string k = make_purged_snap_epoch_key(pending_inc.epoch);
2076 bufferlist v;
2077 encode(pending_inc.new_purged_snaps, v);
2078 t->put(OSD_SNAP_PREFIX, k, v);
2079 }
2080 for (auto& i : pending_inc.new_purged_snaps) {
2081 for (auto q = i.second.begin();
2082 q != i.second.end();
2083 ++q) {
2084 insert_purged_snap_update(i.first, q.get_start(), q.get_end(),
2085 pending_inc.epoch,
2086 t);
11fdf7f2 2087 }
9f95a23c
TL
2088 }
2089 for (auto& [pool, snaps] : pending_pseudo_purged_snaps) {
2090 for (auto snap : snaps) {
2091 insert_purged_snap_update(pool, snap, snap + 1,
2092 pending_inc.epoch,
2093 t);
7c673cae 2094 }
7c673cae 2095 }
224ce89b
WB
2096
2097 // health
2098 health_check_map_t next;
92f5a8d4 2099 tmp.check_health(cct, &next);
224ce89b 2100 encode_health(next, t);
7c673cae
FG
2101}
2102
7c673cae
FG
2103int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
2104{
2105 bufferlist bl;
f67539c2 2106 int r = mon.store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
7c673cae
FG
2107 if (r < 0)
2108 return r;
2109 try {
11fdf7f2
TL
2110 auto p = bl.cbegin();
2111 decode(m, p);
7c673cae 2112 }
f67539c2 2113 catch (ceph::buffer::error& e) {
7c673cae
FG
2114 if (err)
2115 *err << "osd." << osd << " metadata is corrupt";
2116 return -EIO;
2117 }
2118 return 0;
2119}
2120
c07f9fc5 2121void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
31f18b77 2122{
31f18b77
FG
2123 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2124 if (osdmap.is_up(osd)) {
2125 map<string,string> meta;
2126 load_metadata(osd, meta, nullptr);
2127 auto p = meta.find(field);
2128 if (p == meta.end()) {
c07f9fc5 2129 (*out)["unknown"]++;
31f18b77 2130 } else {
c07f9fc5 2131 (*out)[p->second]++;
31f18b77
FG
2132 }
2133 }
2134 }
c07f9fc5
FG
2135}
2136
2137void OSDMonitor::count_metadata(const string& field, Formatter *f)
2138{
2139 map<string,int> by_val;
2140 count_metadata(field, &by_val);
31f18b77
FG
2141 f->open_object_section(field.c_str());
2142 for (auto& p : by_val) {
2143 f->dump_int(p.first.c_str(), p.second);
2144 }
2145 f->close_section();
2146}
2147
f67539c2
TL
2148void OSDMonitor::get_versions(std::map<string, list<string>> &versions)
2149{
2150 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2151 if (osdmap.is_up(osd)) {
2152 map<string,string> meta;
2153 load_metadata(osd, meta, nullptr);
2154 auto p = meta.find("ceph_version_short");
2155 if (p == meta.end()) continue;
2156 versions[p->second].push_back(string("osd.") + stringify(osd));
2157 }
2158 }
2159}
2160
7c673cae
FG
2161int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
2162{
2163 map<string, string> metadata;
2164 int r = load_metadata(osd, metadata, nullptr);
2165 if (r < 0)
2166 return r;
2167
2168 auto it = metadata.find("osd_objectstore");
2169 if (it == metadata.end())
2170 return -ENOENT;
2171 *type = it->second;
2172 return 0;
2173}
2174
2175bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
2176 const pg_pool_t &pool,
2177 ostream *err)
2178{
2179 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2180 // since filestore osds could always join the pool later
2181 set<int> checked_osds;
11fdf7f2 2182 for (unsigned ps = 0; ps < std::min(8u, pool.get_pg_num()); ++ps) {
7c673cae 2183 vector<int> up, acting;
11fdf7f2 2184 pg_t pgid(ps, pool_id);
7c673cae
FG
2185 osdmap.pg_to_up_acting_osds(pgid, up, acting);
2186 for (int osd : up) {
2187 if (checked_osds.find(osd) != checked_osds.end())
2188 continue;
2189 string objectstore_type;
2190 int r = get_osd_objectstore_type(osd, &objectstore_type);
2191 // allow with missing metadata, e.g. due to an osd never booting yet
2192 if (r < 0 || objectstore_type == "bluestore") {
2193 checked_osds.insert(osd);
2194 continue;
2195 }
2196 *err << "osd." << osd << " uses " << objectstore_type;
2197 return false;
2198 }
2199 }
2200 return true;
2201}
2202
2203int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
2204{
2205 map<string,string> m;
2206 if (int r = load_metadata(osd, m, err))
2207 return r;
2208 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
2209 f->dump_string(p->first.c_str(), p->second);
2210 return 0;
2211}
2212
2213void OSDMonitor::print_nodes(Formatter *f)
2214{
2215 // group OSDs by their hosts
2216 map<string, list<int> > osds; // hostname => osd
2217 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
2218 map<string, string> m;
2219 if (load_metadata(osd, m, NULL)) {
2220 continue;
2221 }
2222 map<string, string>::iterator hostname = m.find("hostname");
2223 if (hostname == m.end()) {
2224 // not likely though
2225 continue;
2226 }
2227 osds[hostname->second].push_back(osd);
2228 }
2229
2230 dump_services(f, osds, "osd");
2231}
2232
2233void OSDMonitor::share_map_with_random_osd()
2234{
2235 if (osdmap.get_num_up_osds() == 0) {
2236 dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
2237 return;
2238 }
2239
f67539c2 2240 MonSession *s = mon.session_map.get_random_osd_session(&osdmap);
7c673cae
FG
2241 if (!s) {
2242 dout(10) << __func__ << " no up osd on our session map" << dendl;
2243 return;
2244 }
2245
11fdf7f2
TL
2246 dout(10) << "committed, telling random " << s->name
2247 << " all about it" << dendl;
28e407b8
AA
2248
2249 // get feature of the peer
2250 // use quorum_con_features, if it's an anonymous connection.
2251 uint64_t features = s->con_features ? s->con_features :
f67539c2 2252 mon.get_quorum_con_features();
7c673cae 2253 // whatev, they'll request more if they need it
28e407b8 2254 MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
7c673cae
FG
2255 s->con->send_message(m);
2256 // NOTE: do *not* record osd has up to this epoch (as we do
2257 // elsewhere) as they may still need to request older values.
2258}
2259
11fdf7f2 2260version_t OSDMonitor::get_trim_to() const
7c673cae 2261{
f67539c2
TL
2262 if (mon.get_quorum().empty()) {
2263 dout(10) << __func__ << " quorum not formed, trim_to = 0" << dendl;
31f18b77
FG
2264 return 0;
2265 }
7c673cae 2266
11fdf7f2
TL
2267 {
2268 std::lock_guard<std::mutex> l(creating_pgs_lock);
2269 if (!creating_pgs.pgs.empty()) {
f67539c2 2270 dout(10) << __func__ << " pgs creating, trim_to = 0" << dendl;
7c673cae
FG
2271 return 0;
2272 }
7c673cae 2273 }
11fdf7f2
TL
2274
2275 if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) {
2276 dout(0) << __func__
2277 << " blocking osdmap trim"
f67539c2
TL
2278 << " ('mon_debug_block_osdmap_trim' set to 'true')"
2279 << " trim_to = 0" << dendl;
11fdf7f2
TL
2280 return 0;
2281 }
2282
7c673cae 2283 {
11fdf7f2 2284 epoch_t floor = get_min_last_epoch_clean();
7c673cae 2285 dout(10) << " min_last_epoch_clean " << floor << dendl;
11fdf7f2
TL
2286 if (g_conf()->mon_osd_force_trim_to > 0 &&
2287 g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) {
2288 floor = g_conf()->mon_osd_force_trim_to;
f67539c2
TL
2289 dout(10) << __func__
2290 << " explicit mon_osd_force_trim_to = " << floor << dendl;
7c673cae 2291 }
11fdf7f2 2292 unsigned min = g_conf()->mon_min_osdmap_epochs;
7c673cae
FG
2293 if (floor + min > get_last_committed()) {
2294 if (min < get_last_committed())
2295 floor = get_last_committed() - min;
2296 else
2297 floor = 0;
2298 }
f67539c2
TL
2299 if (floor > get_first_committed()) {
2300 dout(10) << __func__ << " trim_to = " << floor << dendl;
7c673cae 2301 return floor;
f67539c2 2302 }
7c673cae 2303 }
f67539c2 2304 dout(10) << __func__ << " trim_to = 0" << dendl;
7c673cae
FG
2305 return 0;
2306}
2307
2308epoch_t OSDMonitor::get_min_last_epoch_clean() const
2309{
2310 auto floor = last_epoch_clean.get_lower_bound(osdmap);
2311 // also scan osd epochs
2312 // don't trim past the oldest reported osd epoch
f67539c2
TL
2313 for (auto [osd, epoch] : osd_epochs) {
2314 if (epoch < floor) {
2315 floor = epoch;
7c673cae
FG
2316 }
2317 }
2318 return floor;
2319}
2320
2321void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
2322 version_t first)
2323{
2324 dout(10) << __func__ << " including full map for e " << first << dendl;
2325 bufferlist bl;
2326 get_version_full(first, bl);
2327 put_version_full(tx, first, bl);
11fdf7f2
TL
2328
2329 if (has_osdmap_manifest &&
2330 first > osdmap_manifest.get_first_pinned()) {
2331 _prune_update_trimmed(tx, first);
2332 }
7c673cae
FG
2333}
2334
11fdf7f2
TL
2335
2336/* full osdmap prune
2337 *
2338 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2339 */
2340
2341void OSDMonitor::load_osdmap_manifest()
2342{
2343 bool store_has_manifest =
f67539c2 2344 mon.store->exists(get_service_name(), "osdmap_manifest");
11fdf7f2
TL
2345
2346 if (!store_has_manifest) {
2347 if (!has_osdmap_manifest) {
2348 return;
2349 }
2350
2351 dout(20) << __func__
2352 << " dropping osdmap manifest from memory." << dendl;
2353 osdmap_manifest = osdmap_manifest_t();
2354 has_osdmap_manifest = false;
2355 return;
2356 }
2357
2358 dout(20) << __func__
2359 << " osdmap manifest detected in store; reload." << dendl;
2360
2361 bufferlist manifest_bl;
2362 int r = get_value("osdmap_manifest", manifest_bl);
2363 if (r < 0) {
2364 derr << __func__ << " unable to read osdmap version manifest" << dendl;
2365 ceph_abort_msg("error reading manifest");
2366 }
2367 osdmap_manifest.decode(manifest_bl);
2368 has_osdmap_manifest = true;
2369
2370 dout(10) << __func__ << " store osdmap manifest pinned ("
2371 << osdmap_manifest.get_first_pinned()
2372 << " .. "
2373 << osdmap_manifest.get_last_pinned()
2374 << ")"
2375 << dendl;
2376}
2377
2378bool OSDMonitor::should_prune() const
2379{
2380 version_t first = get_first_committed();
2381 version_t last = get_last_committed();
2382 version_t min_osdmap_epochs =
2383 g_conf().get_val<int64_t>("mon_min_osdmap_epochs");
2384 version_t prune_min =
2385 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
2386 version_t prune_interval =
2387 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2388 version_t last_pinned = osdmap_manifest.get_last_pinned();
2389 version_t last_to_pin = last - min_osdmap_epochs;
2390
2391 // Make it or break it constraints.
2392 //
2393 // If any of these conditions fails, we will not prune, regardless of
2394 // whether we have an on-disk manifest with an on-going pruning state.
2395 //
2396 if ((last - first) <= min_osdmap_epochs) {
2397 // between the first and last committed epochs, we don't have
2398 // enough epochs to trim, much less to prune.
2399 dout(10) << __func__
2400 << " currently holding only " << (last - first)
2401 << " epochs (min osdmap epochs: " << min_osdmap_epochs
2402 << "); do not prune."
2403 << dendl;
2404 return false;
2405
2406 } else if ((last_to_pin - first) < prune_min) {
2407 // between the first committed epoch and the last epoch we would prune,
2408 // we simply don't have enough versions over the minimum to prune maps.
2409 dout(10) << __func__
2410 << " could only prune " << (last_to_pin - first)
2411 << " epochs (" << first << ".." << last_to_pin << "), which"
2412 " is less than the required minimum (" << prune_min << ")"
2413 << dendl;
2414 return false;
2415
2416 } else if (has_osdmap_manifest && last_pinned >= last_to_pin) {
2417 dout(10) << __func__
2418 << " we have pruned as far as we can; do not prune."
2419 << dendl;
2420 return false;
2421
2422 } else if (last_pinned + prune_interval > last_to_pin) {
2423 dout(10) << __func__
2424 << " not enough epochs to form an interval (last pinned: "
2425 << last_pinned << ", last to pin: "
2426 << last_to_pin << ", interval: " << prune_interval << ")"
2427 << dendl;
2428 return false;
2429 }
2430
2431 dout(15) << __func__
2432 << " should prune (" << last_pinned << ".." << last_to_pin << ")"
2433 << " lc (" << first << ".." << last << ")"
2434 << dendl;
2435 return true;
2436}
2437
2438void OSDMonitor::_prune_update_trimmed(
2439 MonitorDBStore::TransactionRef tx,
2440 version_t first)
2441{
2442 dout(10) << __func__
2443 << " first " << first
2444 << " last_pinned " << osdmap_manifest.get_last_pinned()
11fdf7f2
TL
2445 << dendl;
2446
2447 osdmap_manifest_t manifest = osdmap_manifest;
2448
2449 if (!manifest.is_pinned(first)) {
2450 manifest.pin(first);
2451 }
2452
2453 set<version_t>::iterator p_end = manifest.pinned.find(first);
2454 set<version_t>::iterator p = manifest.pinned.begin();
2455 manifest.pinned.erase(p, p_end);
2456 ceph_assert(manifest.get_first_pinned() == first);
2457
2458 if (manifest.get_last_pinned() == first+1 ||
2459 manifest.pinned.size() == 1) {
2460 // we reached the end of the line, as pinned maps go; clean up our
2461 // manifest, and let `should_prune()` decide whether we should prune
2462 // again.
2463 tx->erase(get_service_name(), "osdmap_manifest");
2464 return;
2465 }
2466
2467 bufferlist bl;
2468 manifest.encode(bl);
2469 tx->put(get_service_name(), "osdmap_manifest", bl);
2470}
2471
2472void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
2473{
2474 dout(1) << __func__ << dendl;
2475
2476 version_t pin_first;
2477
2478 // verify constrainsts on stable in-memory state
2479 if (!has_osdmap_manifest) {
2480 // we must have never pruned, OR if we pruned the state must no longer
2481 // be relevant (i.e., the state must have been removed alongside with
2482 // the trim that *must* have removed past the last pinned map in a
2483 // previous prune).
2484 ceph_assert(osdmap_manifest.pinned.empty());
f67539c2 2485 ceph_assert(!mon.store->exists(get_service_name(), "osdmap_manifest"));
11fdf7f2
TL
2486 pin_first = get_first_committed();
2487
2488 } else {
2489 // we must have pruned in the past AND its state is still relevant
2490 // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
2491 // and thus we still hold a manifest in the store).
2492 ceph_assert(!osdmap_manifest.pinned.empty());
2493 ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
2494 ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());
2495
2496 dout(10) << __func__
2497 << " first_pinned " << osdmap_manifest.get_first_pinned()
2498 << " last_pinned " << osdmap_manifest.get_last_pinned()
2499 << dendl;
2500
2501 pin_first = osdmap_manifest.get_last_pinned();
2502 }
2503
2504 manifest.pin(pin_first);
2505}
2506
2507bool OSDMonitor::_prune_sanitize_options() const
2508{
2509 uint64_t prune_interval =
2510 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2511 uint64_t prune_min =
2512 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
2513 uint64_t txsize =
2514 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2515
2516 bool r = true;
2517
2518 if (prune_interval == 0) {
2519 derr << __func__
2520 << " prune is enabled BUT prune interval is zero; abort."
2521 << dendl;
2522 r = false;
2523 } else if (prune_interval == 1) {
2524 derr << __func__
2525 << " prune interval is equal to one, which essentially means"
2526 " no pruning; abort."
2527 << dendl;
2528 r = false;
2529 }
2530 if (prune_min == 0) {
2531 derr << __func__
2532 << " prune is enabled BUT prune min is zero; abort."
2533 << dendl;
2534 r = false;
2535 }
2536 if (prune_interval > prune_min) {
2537 derr << __func__
2538 << " impossible to ascertain proper prune interval because"
2539 << " it is greater than the minimum prune epochs"
2540 << " (min: " << prune_min << ", interval: " << prune_interval << ")"
2541 << dendl;
2542 r = false;
2543 }
2544
2545 if (txsize < prune_interval - 1) {
2546 derr << __func__
f67539c2 2547 << " 'mon_osdmap_full_prune_txsize' (" << txsize
11fdf7f2
TL
2548 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
2549 << "); abort." << dendl;
2550 r = false;
2551 }
2552 return r;
2553}
2554
2555bool OSDMonitor::is_prune_enabled() const {
2556 return g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
2557}
2558
2559bool OSDMonitor::is_prune_supported() const {
f67539c2 2560 return mon.get_required_mon_features().contains_any(
11fdf7f2
TL
2561 ceph::features::mon::FEATURE_OSDMAP_PRUNE);
2562}
2563
2564/** do_prune
2565 *
2566 * @returns true if has side-effects; false otherwise.
2567 */
2568bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
2569{
2570 bool enabled = is_prune_enabled();
2571
2572 dout(1) << __func__ << " osdmap full prune "
2573 << ( enabled ? "enabled" : "disabled")
2574 << dendl;
2575
2576 if (!enabled || !_prune_sanitize_options() || !should_prune()) {
2577 return false;
2578 }
2579
2580 // we are beyond the minimum prune versions, we need to remove maps because
2581 // otherwise the store will grow unbounded and we may end up having issues
2582 // with available disk space or store hangs.
2583
2584 // we will not pin all versions. We will leave a buffer number of versions.
2585 // this allows us the monitor to trim maps without caring too much about
2586 // pinned maps, and then allow us to use another ceph-mon without these
2587 // capabilities, without having to repair the store.
2588
2589 osdmap_manifest_t manifest = osdmap_manifest;
2590
2591 version_t first = get_first_committed();
2592 version_t last = get_last_committed();
2593
2594 version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
2595 version_t last_pinned = manifest.get_last_pinned();
2596 uint64_t prune_interval =
2597 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2598 uint64_t txsize =
2599 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2600
2601 prune_init(manifest);
2602
2603 // we need to get rid of some osdmaps
2604
2605 dout(5) << __func__
2606 << " lc (" << first << " .. " << last << ")"
2607 << " last_pinned " << last_pinned
2608 << " interval " << prune_interval
2609 << " last_to_pin " << last_to_pin
2610 << dendl;
2611
2612 // We will be erasing maps as we go.
2613 //
2614 // We will erase all maps between `last_pinned` and the `next_to_pin`.
2615 //
2616 // If `next_to_pin` happens to be greater than `last_to_pin`, then
2617 // we stop pruning. We could prune the maps between `next_to_pin` and
2618 // `last_to_pin`, but by not doing it we end up with neater pruned
2619 // intervals, aligned with `prune_interval`. Besides, this should not be a
2620 // problem as long as `prune_interval` is set to a sane value, instead of
2621 // hundreds or thousands of maps.
2622
2623 auto map_exists = [this](version_t v) {
f67539c2
TL
2624 string k = mon.store->combine_strings("full", v);
2625 return mon.store->exists(get_service_name(), k);
11fdf7f2
TL
2626 };
2627
2628 // 'interval' represents the number of maps from the last pinned
2629 // i.e., if we pinned version 1 and have an interval of 10, we're pinning
2630 // version 11 next; all intermediate versions will be removed.
2631 //
2632 // 'txsize' represents the maximum number of versions we'll be removing in
2633 // this iteration. If 'txsize' is large enough to perform multiple passes
2634 // pinning and removing maps, we will do so; if not, we'll do at least one
2635 // pass. We are quite relaxed about honouring 'txsize', but we'll always
2636 // ensure that we never go *over* the maximum.
2637
2638 // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
2639 uint64_t removal_interval = prune_interval - 1;
2640
2641 if (txsize < removal_interval) {
2642 dout(5) << __func__
2643 << " setting txsize to removal interval size ("
2644 << removal_interval << " versions"
2645 << dendl;
2646 txsize = removal_interval;
2647 }
2648 ceph_assert(removal_interval > 0);
2649
2650 uint64_t num_pruned = 0;
2651 while (num_pruned + removal_interval <= txsize) {
2652 last_pinned = manifest.get_last_pinned();
2653
2654 if (last_pinned + prune_interval > last_to_pin) {
2655 break;
2656 }
2657 ceph_assert(last_pinned < last_to_pin);
2658
2659 version_t next_pinned = last_pinned + prune_interval;
2660 ceph_assert(next_pinned <= last_to_pin);
2661 manifest.pin(next_pinned);
2662
2663 dout(20) << __func__
2664 << " last_pinned " << last_pinned
2665 << " next_pinned " << next_pinned
2666 << " num_pruned " << num_pruned
2667 << " removal interval (" << (last_pinned+1)
2668 << ".." << (next_pinned-1) << ")"
2669 << " txsize " << txsize << dendl;
2670
2671 ceph_assert(map_exists(last_pinned));
2672 ceph_assert(map_exists(next_pinned));
2673
2674 for (version_t v = last_pinned+1; v < next_pinned; ++v) {
2675 ceph_assert(!manifest.is_pinned(v));
2676
2677 dout(20) << __func__ << " pruning full osdmap e" << v << dendl;
f67539c2 2678 string full_key = mon.store->combine_strings("full", v);
11fdf7f2
TL
2679 tx->erase(get_service_name(), full_key);
2680 ++num_pruned;
2681 }
2682 }
2683
2684 ceph_assert(num_pruned > 0);
2685
2686 bufferlist bl;
2687 manifest.encode(bl);
2688 tx->put(get_service_name(), "osdmap_manifest", bl);
2689
2690 return true;
2691}
2692
2693
7c673cae
FG
2694// -------------
2695
2696bool OSDMonitor::preprocess_query(MonOpRequestRef op)
2697{
2698 op->mark_osdmon_event(__func__);
2699 Message *m = op->get_req();
2700 dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
2701
2702 switch (m->get_type()) {
2703 // READs
2704 case MSG_MON_COMMAND:
f64942e4
AA
2705 try {
2706 return preprocess_command(op);
11fdf7f2 2707 } catch (const bad_cmd_get& e) {
f64942e4 2708 bufferlist bl;
f67539c2 2709 mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
f64942e4
AA
2710 return true;
2711 }
7c673cae
FG
2712 case CEPH_MSG_MON_GET_OSDMAP:
2713 return preprocess_get_osdmap(op);
2714
2715 // damp updates
2716 case MSG_OSD_MARK_ME_DOWN:
2717 return preprocess_mark_me_down(op);
9f95a23c
TL
2718 case MSG_OSD_MARK_ME_DEAD:
2719 return preprocess_mark_me_dead(op);
7c673cae
FG
2720 case MSG_OSD_FULL:
2721 return preprocess_full(op);
2722 case MSG_OSD_FAILURE:
2723 return preprocess_failure(op);
2724 case MSG_OSD_BOOT:
2725 return preprocess_boot(op);
2726 case MSG_OSD_ALIVE:
2727 return preprocess_alive(op);
2728 case MSG_OSD_PG_CREATED:
2729 return preprocess_pg_created(op);
11fdf7f2
TL
2730 case MSG_OSD_PG_READY_TO_MERGE:
2731 return preprocess_pg_ready_to_merge(op);
7c673cae
FG
2732 case MSG_OSD_PGTEMP:
2733 return preprocess_pgtemp(op);
2734 case MSG_OSD_BEACON:
2735 return preprocess_beacon(op);
2736
2737 case CEPH_MSG_POOLOP:
2738 return preprocess_pool_op(op);
2739
2740 case MSG_REMOVE_SNAPS:
2741 return preprocess_remove_snaps(op);
2742
9f95a23c
TL
2743 case MSG_MON_GET_PURGED_SNAPS:
2744 return preprocess_get_purged_snaps(op);
2745
7c673cae
FG
2746 default:
2747 ceph_abort();
2748 return true;
2749 }
2750}
2751
// Dispatch a write-path (update) request to the per-message prepare_*
// handler.  Returns the handler's verdict: true if the pending incremental
// was mutated (or the op was fully consumed), false otherwise.
bool OSDMonitor::prepare_update(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return prepare_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return prepare_mark_me_dead(op);
  case MSG_OSD_FULL:
    return prepare_full(op);
  case MSG_OSD_FAILURE:
    return prepare_failure(op);
  case MSG_OSD_BOOT:
    return prepare_boot(op);
  case MSG_OSD_ALIVE:
    return prepare_alive(op);
  case MSG_OSD_PG_CREATED:
    return prepare_pg_created(op);
  case MSG_OSD_PGTEMP:
    return prepare_pgtemp(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return prepare_pg_ready_to_merge(op);
  case MSG_OSD_BEACON:
    return prepare_beacon(op);

  case MSG_MON_COMMAND:
    // prepare_command may throw if the command payload is malformed;
    // reply -EINVAL to the sender rather than crashing the monitor.
    try {
      return prepare_command(op);
    } catch (const bad_cmd_get& e) {
      bufferlist bl;
      mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }

  case CEPH_MSG_POOLOP:
    return prepare_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return prepare_remove_snaps(op);


  default:
    // preprocess_query should have filtered anything else out
    ceph_abort();
  }

  return false;
}
2803
// Decide whether the pending OSDMap incremental should be proposed to
// paxos now.  May set @p delay to 0.0 to request an immediate proposal.
bool OSDMonitor::should_propose(double& delay)
{
  dout(10) << "should_propose" << dendl;

  // if full map, propose immediately! any subsequent changes will be clobbered.
  if (pending_inc.fullmap.length())
    return true;

  // adjust osd weights?  only once we have a weight for every osd in the map.
  if (!osd_weight.empty() &&
      osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
    dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
    osdmap.adjust_osd_weights(osd_weight, pending_inc);
    delay = 0.0;
    osd_weight.clear();  // weights are consumed once applied to pending_inc
    return true;
  }

  // otherwise defer to the generic paxos-service pacing policy
  return PaxosService::should_propose(delay);
}
2824
2825
2826
2827// ---------------------------
2828// READs
2829
// Serve an explicit MMonGetOSDMap request: reply with the requested full
// and incremental map epochs, bounded by both a per-message map count and
// a byte budget.  Always consumes the op (returns true).
bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MMonGetOSDMap>();

  // encode for the requester's connection features if known, otherwise
  // fall back to the quorum's common feature set
  uint64_t features = mon.get_quorum_con_features();
  if (op->get_session() && op->get_session()->con_features)
    features = op->get_session()->con_features;

  dout(10) << __func__ << " " << *m << dendl;
  MOSDMap *reply = new MOSDMap(mon.monmap->fsid, features);
  epoch_t first = get_first_committed();
  epoch_t last = osdmap.get_epoch();
  // caps on how much we stuff into a single MOSDMap reply
  int max = g_conf()->osd_map_message_max;
  ssize_t max_bytes = g_conf()->osd_map_message_max_bytes;
  // full maps first; `max` and `max_bytes` are shared across both loops
  for (epoch_t e = std::max(first, m->get_full_first());
       e <= std::min(last, m->get_full_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->maps[e];
    int r = get_version_full(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  // then incrementals, with whatever budget remains
  for (epoch_t e = std::max(first, m->get_inc_first());
       e <= std::min(last, m->get_inc_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->incremental_maps[e];
    int r = get_version(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  reply->oldest_map = first;
  reply->newest_map = last;
  mon.send_reply(op, reply);
  return true;
}
2866
2867
2868// ---------------------------
2869// UPDATEs
2870
2871// failure --
2872
11fdf7f2 2873bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
7c673cae 2874 // check permissions
11fdf7f2 2875 MonSession *session = op->get_session();
7c673cae
FG
2876 if (!session)
2877 return true;
2878 if (!session->is_capable("osd", MON_CAP_X)) {
2879 dout(0) << "got MOSDFailure from entity with insufficient caps "
2880 << session->caps << dendl;
2881 return true;
2882 }
f67539c2 2883 if (fsid != mon.monmap->fsid) {
7c673cae 2884 dout(0) << "check_source: on fsid " << fsid
f67539c2 2885 << " != " << mon.monmap->fsid << dendl;
7c673cae
FG
2886 return true;
2887 }
2888 return false;
2889}
2890
2891
// Read-side filter for MOSDFailure reports.  Weeds out reports that are
// unauthorized, stale, duplicated, or refer to the wrong address, sending
// the reporter a map update where that would help it catch up.  Returns
// true if the op was consumed here; false to pass it on to
// prepare_failure() for a map change.
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  // who is target_osd
  int badboy = m->get_target_osd();

  // check permissions
  if (check_source(op, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
	!osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
	(osdmap.is_down(from) && m->if_osd_failed())) {
      // reporter isn't in the map / has the wrong addr / is itself down
      dout(5) << "preprocess_failure from dead osd." << from
	      << ", ignoring" << dendl;
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?  target already down in our map
  if (osdmap.is_down(badboy)) {
    dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  // report is about an address that doesn't match the map's entry
  if (osdmap.get_addrs(badboy) != m->get_target_addrs()) {
    dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << " != map's " << osdmap.get_addrs(badboy)
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?  (target came back up after the reported epoch)
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // nodown flag or up-ratio floor prevents marking this osd down
  if (!can_mark_down(badboy)) {
    dout(5) << "preprocess_failure ignoring report of osd."
	    << m->get_target_osd() << " " << m->get_target_addrs()
	    << " from " << m->get_orig_source() << dendl;
    goto didit;
  }

  dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
	   << " " << m->get_target_addrs()
	   << ", from " << m->get_orig_source() << dendl;
  return false;  // hand off to prepare_failure()

 didit:
  mon.no_reply(op);
  return true;
}
2963
// Completion context that acknowledges a MOSDMarkMeDown request once the
// corresponding proposal has finished (or retries it on -EAGAIN).
class C_AckMarkedDown : public C_MonOp {
  OSDMonitor *osdmon;  // owning monitor; used to send the reply or re-dispatch
public:
  C_AckMarkedDown(
    OSDMonitor *osdmon,
    MonOpRequestRef op)
    : C_MonOp(op), osdmon(osdmon) {}

  void _finish(int r) override {
    if (r == 0) {
      // success: echo a MOSDMarkMeDown back to the requesting osd as the ack
      auto m = op->get_req<MOSDMarkMeDown>();
      osdmon->mon.send_reply(
	op,
	new MOSDMarkMeDown(
	  m->fsid,
	  m->target_osd,
	  m->target_addrs,
	  m->get_epoch(),
	  false));   // ACK itself does not request an ack
    } else if (r == -EAGAIN) {
      // proposal was interrupted; requeue the original op
      osdmon->dispatch(op);
    } else {
      ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r);
    }
  }
  ~C_AckMarkedDown() override {
  }
};
2992
// Read-side filter for MOSDMarkMeDown.  Rejects unauthorized, stale, or
// nodown-blocked requests; when rejecting, still acks the sender (via
// C_AckMarkedDown) if it asked for an ack so it doesn't block forever.
// Returns true if consumed; false to continue to prepare_mark_me_down().
bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDown>();
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid))
    goto reply;

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd())
    goto reply;

  // sender must exist, be up, and match the map's address for that osd
  if (!osdmap.exists(from) ||
      osdmap.is_down(from) ||
      osdmap.get_addrs(from) != m->target_addrs) {
    dout(5) << "preprocess_mark_me_down from dead osd."
	    << from << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    goto reply;
  }

  // no down might be set
  if (!can_mark_down(from))
    goto reply;

  dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
	   << " " << m->target_addrs << dendl;
  return false;

 reply:
  if (m->request_ack) {
    // complete the ack context immediately; no map change happened
    Context *c(new C_AckMarkedDown(this, op));
    c->complete(0);
  }
  return true;
}
3031
// Write side of MOSDMarkMeDown: stage a down state flip for the target osd
// in the pending incremental, and ack the requester once the proposal
// commits (if an ack was requested).  Preconditions were validated in
// preprocess_mark_me_down(), hence the asserts.
bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDown>();
  int target_osd = m->target_osd;

  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);

  mon.clog->info() << "osd." << target_osd << " marked itself down";
  // new_state bits are XORed into the existing state; CEPH_OSD_UP here
  // toggles the osd from up to down
  pending_inc.new_state[target_osd] = CEPH_OSD_UP;
  if (m->request_ack)
    wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
  return true;
}
3047
9f95a23c
TL
// Read-side filter for MOSDMarkMeDead.  The sender must be an authorized
// osd that exists in the map and is already DOWN (dead implies down).
// Returns true if consumed; false to continue to prepare_mark_me_dead().
bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDead>();
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid)) {
    mon.no_reply(op);
    return true;
  }

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd()) {
    mon.no_reply(op);
    return true;
  }

  if (!osdmap.exists(from) ||
      !osdmap.is_down(from)) {
    dout(5) << __func__ << " from nonexistent or up osd." << from
	    << ", ignoring" << dendl;
    // help the sender catch up with the current map before dropping the op
    send_incremental(op, m->get_epoch()+1);
    mon.no_reply(op);
    return true;
  }

  return false;
}
3077
// Write side of MOSDMarkMeDead: record the epoch at which the (already
// down) osd declared itself dead in its pending xinfo.  The op gets no
// reply on success (the committed map is the signal).
bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDead>();
  int target_osd = m->target_osd;

  // preprocess_mark_me_dead() guarantees the target is down
  ceph_assert(osdmap.is_down(target_osd));

  mon.clog->info() << "osd." << target_osd << " marked itself dead as of e"
		   << m->get_epoch();
  // seed the pending xinfo from the current map if not already staged
  if (!pending_inc.new_xinfo.count(target_osd)) {
    pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
  }
  pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
  wait_for_finished_proposal(
    op,
    new LambdaContext(
      [op, this] (int r) {
	if (r >= 0) {
	  mon.no_reply(op); // ignore on success
	}
      }
      ));
  return true;
}
3103
7c673cae
FG
3104bool OSDMonitor::can_mark_down(int i)
3105{
31f18b77
FG
3106 if (osdmap.is_nodown(i)) {
3107 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
3108 << "will not mark it down" << dendl;
7c673cae
FG
3109 return false;
3110 }
31f18b77 3111
7c673cae
FG
3112 int num_osds = osdmap.get_num_osds();
3113 if (num_osds == 0) {
31f18b77 3114 dout(5) << __func__ << " no osds" << dendl;
7c673cae
FG
3115 return false;
3116 }
3117 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
3118 float up_ratio = (float)up / (float)num_osds;
11fdf7f2 3119 if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
31f18b77 3120 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
11fdf7f2 3121 << g_conf()->mon_osd_min_up_ratio
7c673cae
FG
3122 << ", will not mark osd." << i << " down" << dendl;
3123 return false;
3124 }
3125 return true;
3126}
3127
3128bool OSDMonitor::can_mark_up(int i)
3129{
31f18b77
FG
3130 if (osdmap.is_noup(i)) {
3131 dout(5) << __func__ << " osd." << i << " is marked as noup, "
3132 << "will not mark it up" << dendl;
7c673cae
FG
3133 return false;
3134 }
31f18b77 3135
7c673cae
FG
3136 return true;
3137}
3138
3139/**
3140 * @note the parameter @p i apparently only exists here so we can output the
3141 * osd's id on messages.
3142 */
3143bool OSDMonitor::can_mark_out(int i)
3144{
31f18b77
FG
3145 if (osdmap.is_noout(i)) {
3146 dout(5) << __func__ << " osd." << i << " is marked as noout, "
3147 << "will not mark it out" << dendl;
3148 return false;
3149 }
3150
7c673cae
FG
3151 int num_osds = osdmap.get_num_osds();
3152 if (num_osds == 0) {
3153 dout(5) << __func__ << " no osds" << dendl;
3154 return false;
3155 }
3156 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
3157 float in_ratio = (float)in / (float)num_osds;
11fdf7f2 3158 if (in_ratio < g_conf()->mon_osd_min_in_ratio) {
7c673cae
FG
3159 if (i >= 0)
3160 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
11fdf7f2 3161 << g_conf()->mon_osd_min_in_ratio
7c673cae
FG
3162 << ", will not mark osd." << i << " out" << dendl;
3163 else
3164 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
11fdf7f2 3165 << g_conf()->mon_osd_min_in_ratio
7c673cae
FG
3166 << ", will not mark osds out" << dendl;
3167 return false;
3168 }
3169
3170 return true;
3171}
3172
3173bool OSDMonitor::can_mark_in(int i)
3174{
31f18b77
FG
3175 if (osdmap.is_noin(i)) {
3176 dout(5) << __func__ << " osd." << i << " is marked as noin, "
3177 << "will not mark it in" << dendl;
7c673cae
FG
3178 return false;
3179 }
31f18b77 3180
7c673cae
FG
3181 return true;
3182}
3183
// Walk all outstanding failure reports, marking down any osd whose report
// set now satisfies the failure criteria (see check_failure()), and
// pruning report sets that have gone stale.  Returns true if at least one
// failure was staged in pending_inc.
bool OSDMonitor::check_failures(utime_t now)
{
  bool found_failure = false;
  auto p = failure_info.begin();
  while (p != failure_info.end()) {
    auto& [target_osd, fi] = *p;
    if (can_mark_down(target_osd) &&
	check_failure(now, target_osd, fi)) {
      found_failure = true;
      ++p;
    } else if (is_failure_stale(now, fi)) {
      // reporters failed to cancel in time; forget this entry entirely
      dout(10) << " dropping stale failure_info for osd." << target_osd
	       << " from " << fi.reporters.size() << " reporters"
	       << dendl;
      p = failure_info.erase(p);
    } else {
      ++p;
    }
  }
  return found_failure;
}
3205
b3b6e05e
TL
3206utime_t OSDMonitor::get_grace_time(utime_t now,
3207 int target_osd,
3208 failure_info_t& fi) const
3209{
3210 utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
3211 if (!g_conf()->mon_osd_adjust_heartbeat_grace) {
3212 return orig_grace;
3213 }
3214 utime_t grace = orig_grace;
3215 double halflife = (double)g_conf()->mon_osd_laggy_halflife;
3216 double decay_k = ::log(.5) / halflife;
3217
3218 // scale grace period based on historical probability of 'lagginess'
3219 // (false positive failures due to slowness).
3220 const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
3221 const utime_t failed_for = now - fi.get_failed_since();
3222 double decay = exp((double)failed_for * decay_k);
3223 dout(20) << " halflife " << halflife << " decay_k " << decay_k
3224 << " failed_for " << failed_for << " decay " << decay << dendl;
3225 double my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
3226 grace += my_grace;
3227
3228 // consider the peers reporting a failure a proxy for a potential
3229 // 'subcluster' over the overall cluster that is similarly
3230 // laggy. this is clearly not true in all cases, but will sometimes
3231 // help us localize the grace correction to a subset of the system
3232 // (say, a rack with a bad switch) that is unhappy.
3233 double peer_grace = 0;
3234 for (auto& [reporter, report] : fi.reporters) {
3235 if (osdmap.exists(reporter)) {
3236 const osd_xinfo_t& xi = osdmap.get_xinfo(reporter);
3237 utime_t elapsed = now - xi.down_stamp;
3238 double decay = exp((double)elapsed * decay_k);
3239 peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
3240 }
3241 }
3242 peer_grace /= (double)fi.reporters.size();
3243 grace += peer_grace;
3244 dout(10) << " osd." << target_osd << " has "
3245 << fi.reporters.size() << " reporters, "
3246 << grace << " grace (" << orig_grace << " + " << my_grace
3247 << " + " << peer_grace << "), max_failed_since " << fi.get_failed_since()
3248 << dendl;
3249
3250 return grace;
3251}
3252
7c673cae
FG
3253bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
3254{
3255 // already pending failure?
3256 if (pending_inc.new_state.count(target_osd) &&
3257 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3258 dout(10) << " already pending failure" << dendl;
3259 return true;
3260 }
3261
3262 set<string> reporters_by_subtree;
11fdf7f2 3263 auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
11fdf7f2 3264 ceph_assert(fi.reporters.size());
eafe8130 3265 for (auto p = fi.reporters.begin(); p != fi.reporters.end();) {
7c673cae
FG
3266 // get the parent bucket whose type matches with "reporter_subtree_level".
3267 // fall back to OSD if the level doesn't exist.
eafe8130
TL
3268 if (osdmap.exists(p->first)) {
3269 auto reporter_loc = osdmap.crush->get_full_location(p->first);
3270 if (auto iter = reporter_loc.find(reporter_subtree_level);
3271 iter == reporter_loc.end()) {
3272 reporters_by_subtree.insert("osd." + to_string(p->first));
3273 } else {
3274 reporters_by_subtree.insert(iter->second);
3275 }
eafe8130 3276 ++p;
7c673cae 3277 } else {
eafe8130
TL
3278 fi.cancel_report(p->first);;
3279 p = fi.reporters.erase(p);
7c673cae
FG
3280 }
3281 }
b3b6e05e
TL
3282 if (reporters_by_subtree.size() < g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
3283 return false;
7c673cae 3284 }
b3b6e05e
TL
3285 const utime_t failed_for = now - fi.get_failed_since();
3286 const utime_t grace = get_grace_time(now, target_osd, fi);
3287 if (failed_for >= grace) {
7c673cae
FG
3288 dout(1) << " we have enough reporters to mark osd." << target_osd
3289 << " down" << dendl;
3290 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3291
f67539c2 3292 mon.clog->info() << "osd." << target_osd << " failed ("
31f18b77
FG
3293 << osdmap.crush->get_full_location_ordered_string(
3294 target_osd)
3295 << ") ("
3296 << (int)reporters_by_subtree.size()
3297 << " reporters from different "
7c673cae
FG
3298 << reporter_subtree_level << " after "
3299 << failed_for << " >= grace " << grace << ")";
3300 return true;
3301 }
3302 return false;
3303}
3304
b3b6e05e
TL
3305bool OSDMonitor::is_failure_stale(utime_t now, failure_info_t& fi) const
3306{
3307 // if it takes too long to either cancel the report to mark the osd down,
3308 // some reporters must have failed to cancel their reports. let's just
3309 // forget these reports.
3310 const utime_t failed_for = now - fi.get_failed_since();
3311 auto heartbeat_grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
3312 auto heartbeat_stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3313 return failed_for >= (heartbeat_grace + heartbeat_stale);
3314}
3315
// Immediately stage osd.@p target_osd as both down and dead (dead_epoch
// set to the pending epoch), bypassing the reporter/grace machinery.
// Used for "immediate" failure reports (e.g. connection refused seen by
// osd.@p by).  No-op if a down for this osd is already pending.
void OSDMonitor::force_failure(int target_osd, int by)
{
  // already pending failure?
  if (pending_inc.new_state.count(target_osd) &&
      pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
    dout(10) << " already pending failure" << dendl;
    return;
  }

  dout(1) << " we're forcing failure of osd." << target_osd << dendl;
  // XOR the UP bit: flips the osd from up to down in the next epoch
  pending_inc.new_state[target_osd] = CEPH_OSD_UP;
  if (!pending_inc.new_xinfo.count(target_osd)) {
    pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
  }
  pending_inc.new_xinfo[target_osd].dead_epoch = pending_inc.epoch;

  mon.clog->info() << "osd." << target_osd << " failed ("
		   << osdmap.crush->get_full_location_ordered_string(target_osd)
		   << ") (connection refused reported by osd." << by << ")";
  return;
}
3337
// Write side of MOSDFailure.  A positive report either forces an
// immediate failure (is_immediate) or accumulates into failure_info and
// re-checks the failure criteria; a negative report cancels the sender's
// earlier report.  Returns true when pending_inc was changed.
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  dout(1) << "prepare_failure osd." << m->get_target_osd()
	  << " " << m->get_target_addrs()
	  << " from " << m->get_orig_source()
	  << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target_osd();
  int reporter = m->get_orig_source().num();
  // preprocess_failure() already validated target state and address
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());

  mon.no_reply(op);

  if (m->if_osd_failed()) {
    // calculate failure time from the reporter's claimed failure duration
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      mon.clog->debug() << "osd." << m->get_target_osd()
			<< " reported immediately failed by "
			<< m->get_orig_source();
      force_failure(target_osd, reporter);
      return true;
    }
    mon.clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
		      << m->get_orig_source();

    failure_info_t& fi = failure_info[target_osd];
    fi.add_report(reporter, failed_since, op);
    // may stage the down right away if criteria are now satisfied
    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon.clog->debug() << "osd." << m->get_target_osd()
		      << " failure report canceled by "
		      << m->get_orig_source();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      fi.cancel_report(reporter);
      if (fi.reporters.empty()) {
	dout(10) << " removing last failure_info for osd." << target_osd
		 << dendl;
	failure_info.erase(target_osd);
      } else {
	dout(10) << " failure_info for osd." << target_osd << " now "
		 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
  }

  return false;
}
3397
// Sweep failure_info after a map change: for every tracked osd that is
// now down, drop its entry and answer each queued report op with the
// latest map so reporters learn the outcome.
void OSDMonitor::process_failures()
{
  map<int,failure_info_t>::iterator p = failure_info.begin();
  while (p != failure_info.end()) {
    if (osdmap.is_up(p->first)) {
      ++p;
    } else {
      dout(10) << "process_failures osd." << p->first << dendl;
      list<MonOpRequestRef> ls;
      p->second.take_report_messages(ls);
      // post-increment erase: advance before invalidating the iterator
      failure_info.erase(p++);

      while (!ls.empty()) {
	MonOpRequestRef o = ls.front();
	if (o) {
	  o->mark_event(__func__);
	  MOSDFailure *m = o->get_req<MOSDFailure>();
	  send_latest(o, m->get_epoch());
	  mon.no_reply(o);
	}
	ls.pop_front();
      }
    }
  }
}
3423
3424void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
3425{
3426 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
3427
3428 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3429 p != failure_info.end();
3430 ++p) {
3431 p->second.take_report_messages(ls);
3432 }
3433 failure_info.clear();
3434}
3435
f6b5b4d7
TL
3436int OSDMonitor::get_grace_interval_threshold()
3437{
3438 int halflife = g_conf()->mon_osd_laggy_halflife;
3439 // Scale the halflife period (default: 1_hr) by
3440 // a factor (48) to calculate the threshold.
3441 int grace_threshold_factor = 48;
3442 return halflife * grace_threshold_factor;
3443}
3444
3445bool OSDMonitor::grace_interval_threshold_exceeded(int last_failed_interval)
3446{
3447 int grace_interval_threshold_secs = get_grace_interval_threshold();
3448 if (last_failed_interval > grace_interval_threshold_secs) {
3449 dout(1) << " last_failed_interval " << last_failed_interval
3450 << " > grace_interval_threshold_secs " << grace_interval_threshold_secs
3451 << dendl;
3452 return true;
3453 }
3454 return false;
3455}
3456
3457void OSDMonitor::set_default_laggy_params(int target_osd)
3458{
3459 if (pending_inc.new_xinfo.count(target_osd) == 0) {
3460 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3461 }
3462 osd_xinfo_t& xi = pending_inc.new_xinfo[target_osd];
3463 xi.down_stamp = pending_inc.modified;
3464 xi.laggy_probability = 0.0;
3465 xi.laggy_interval = 0;
3466 dout(20) << __func__ << " reset laggy, now xi " << xi << dendl;
3467}
3468
7c673cae
FG
3469
3470// boot --
3471
// Read-side filter for MOSDBoot.  Admission gate for booting OSDs:
// checks caps and fsid, rejects OSDs lacking required feature bits or
// spanning too many releases, short-circuits duplicate boots, and defers
// to the noup flag.  Returns true if consumed here; false to continue to
// prepare_boot().
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
	    << session->caps << dendl;
    goto ignore;
  }

  // reject boots from a different cluster
  if (m->sb.cluster_fsid != mon.monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
	    << " != " << mon.monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  ceph_assert(m->get_orig_source_inst().name.is_osd());

  // force all osds to have gone through luminous prior to upgrade to nautilus
  {
    vector<string> missing;
    if (!HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) {
      missing.push_back("CEPH_FEATURE_SERVER_LUMINOUS");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_JEWEL)) {
      missing.push_back("CEPH_FEATURE_SERVER_JEWEL");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) {
      missing.push_back("CEPH_FEATURE_SERVER_KRAKEN");
    }
    if (!HAVE_FEATURE(m->osd_features, OSD_RECOVERY_DELETES)) {
      missing.push_back("CEPH_FEATURE_OSD_RECOVERY_DELETES");
    }

    if (!missing.empty()) {
      using std::experimental::make_ostream_joiner;

      // join the missing feature names with ';' for the clog message
      stringstream ss;
      copy(begin(missing), end(missing), make_ostream_joiner(ss, ";"));

      mon.clog->info() << "disallowing boot of OSD "
		       << m->get_orig_source_inst()
		       << " because the osd lacks " << ss.str();
      goto ignore;
    }
  }

  // make sure osd versions do not span more than 3 releases
  if (HAVE_FEATURE(m->osd_features, SERVER_OCTOPUS) &&
      osdmap.require_osd_release < ceph_release_t::mimic) {
    mon.clog->info() << "disallowing boot of octopus+ OSD "
		     << m->get_orig_source_inst()
		     << " because require_osd_release < mimic";
    goto ignore;
  }
  if (HAVE_FEATURE(m->osd_features, SERVER_PACIFIC) &&
      osdmap.require_osd_release < ceph_release_t::nautilus) {
    mon.clog->info() << "disallowing boot of pacific+ OSD "
		     << m->get_orig_source_inst()
		     << " because require_osd_release < nautilus";
    goto ignore;
  }

  // The release check here is required because for OSD_PGLOG_HARDLIMIT,
  // we are reusing a jewel feature bit that was retired in luminous.
  if (osdmap.require_osd_release >= ceph_release_t::luminous &&
      osdmap.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT) &&
      !(m->osd_features & CEPH_FEATURE_OSD_PGLOG_HARDLIMIT)) {
    mon.clog->info() << "disallowing boot of OSD "
		     << m->get_orig_source_inst()
		     << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
    goto ignore;
  }

  // stretch-mode clusters require stretch-aware OSDs
  if (osdmap.stretch_mode_enabled &&
      !(m->osd_features & CEPH_FEATUREMASK_STRETCH_MODE)) {
    mon.clog->info() << "disallowing boot of OSD "
		     << m->get_orig_source_inst()
		     << " because stretch mode is on and OSD lacks support";
    goto ignore;
  }

  // already booted?
  if (osdmap.is_up(from) &&
      osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
      osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
    // yup.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source()
	    << " " << m->get_orig_source_addrs()
	    << " =~ " << osdmap.get_addrs(from) << dendl;
    _booted(op, false);
    return true;
  }

  // same id but a different osd_fsid: refuse to clobber the existing entry
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
	    << " clashes with existing osd: different fsid"
	    << " (ours: " << osdmap.get_uuid(from)
	    << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // message predates the osd's last up interval; just send it the maps
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_addrs(from).legacy_equals(
	m->get_orig_source_addrs())) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3609
3610bool OSDMonitor::prepare_boot(MonOpRequestRef op)
3611{
3612 op->mark_osdmon_event(__func__);
9f95a23c 3613 auto m = op->get_req<MOSDBoot>();
11fdf7f2
TL
3614 dout(7) << __func__ << " from " << m->get_source()
3615 << " sb " << m->sb
3616 << " client_addrs" << m->get_connection()->get_peer_addrs()
3617 << " cluster_addrs " << m->cluster_addrs
3618 << " hb_back_addrs " << m->hb_back_addrs
3619 << " hb_front_addrs " << m->hb_front_addrs
7c673cae
FG
3620 << dendl;
3621
11fdf7f2 3622 ceph_assert(m->get_orig_source().is_osd());
7c673cae
FG
3623 int from = m->get_orig_source().num();
3624
3625 // does this osd exist?
3626 if (from >= osdmap.get_max_osd()) {
3627 dout(1) << "boot from osd." << from << " >= max_osd "
3628 << osdmap.get_max_osd() << dendl;
3629 return false;
3630 }
3631
3632 int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
3633 if (pending_inc.new_state.count(from))
3634 oldstate ^= pending_inc.new_state[from];
3635
3636 // already up? mark down first?
3637 if (osdmap.is_up(from)) {
11fdf7f2
TL
3638 dout(7) << __func__ << " was up, first marking down osd." << from << " "
3639 << osdmap.get_addrs(from) << dendl;
7c673cae 3640 // preprocess should have caught these; if not, assert.
11fdf7f2
TL
3641 ceph_assert(!osdmap.get_addrs(from).legacy_equals(
3642 m->get_orig_source_addrs()) ||
3643 !osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
3644 ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);
7c673cae
FG
3645
3646 if (pending_inc.new_state.count(from) == 0 ||
3647 (pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
3648 // mark previous guy down
3649 pending_inc.new_state[from] = CEPH_OSD_UP;
3650 }
3651 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
3652 } else if (pending_inc.new_up_client.count(from)) {
3653 // already prepared, just wait
3654 dout(7) << __func__ << " already prepared, waiting on "
3655 << m->get_orig_source_addr() << dendl;
3656 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
3657 } else {
3658 // mark new guy up.
11fdf7f2
TL
3659 pending_inc.new_up_client[from] = m->get_orig_source_addrs();
3660 pending_inc.new_up_cluster[from] = m->cluster_addrs;
3661 pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
3662 pending_inc.new_hb_front_up[from] = m->hb_front_addrs;
7c673cae
FG
3663
3664 down_pending_out.erase(from); // if any
3665
3666 if (m->sb.weight)
3667 osd_weight[from] = m->sb.weight;
3668
3669 // set uuid?
3670 dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
3671 << dendl;
3672 if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
3673 // preprocess should have caught this; if not, assert.
11fdf7f2 3674 ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
7c673cae
FG
3675 pending_inc.new_uuid[from] = m->sb.osd_fsid;
3676 }
3677
3678 // fresh osd?
3679 if (m->sb.newest_map == 0 && osdmap.exists(from)) {
3680 const osd_info_t& i = osdmap.get_info(from);
3681 if (i.up_from > i.lost_at) {
3682 dout(10) << " fresh osd; marking lost_at too" << dendl;
3683 pending_inc.new_lost[from] = osdmap.get_epoch();
3684 }
3685 }
3686
3687 // metadata
3688 bufferlist osd_metadata;
11fdf7f2 3689 encode(m->metadata, osd_metadata);
7c673cae 3690 pending_metadata[from] = osd_metadata;
31f18b77 3691 pending_metadata_rm.erase(from);
7c673cae
FG
3692
3693 // adjust last clean unmount epoch?
3694 const osd_info_t& info = osdmap.get_info(from);
3695 dout(10) << " old osd_info: " << info << dendl;
3696 if (m->sb.mounted > info.last_clean_begin ||
3697 (m->sb.mounted == info.last_clean_begin &&
3698 m->sb.clean_thru > info.last_clean_end)) {
3699 epoch_t begin = m->sb.mounted;
3700 epoch_t end = m->sb.clean_thru;
3701
3702 dout(10) << __func__ << " osd." << from << " last_clean_interval "
3703 << "[" << info.last_clean_begin << "," << info.last_clean_end
3704 << ") -> [" << begin << "-" << end << ")"
3705 << dendl;
3706 pending_inc.new_last_clean_interval[from] =
3707 pair<epoch_t,epoch_t>(begin, end);
3708 }
3709
9f95a23c
TL
3710 if (pending_inc.new_xinfo.count(from) == 0)
3711 pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
3712 osd_xinfo_t& xi = pending_inc.new_xinfo[from];
7c673cae 3713 if (m->boot_epoch == 0) {
11fdf7f2
TL
3714 xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
3715 xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
7c673cae
FG
3716 dout(10) << " not laggy, new xi " << xi << dendl;
3717 } else {
3718 if (xi.down_stamp.sec()) {
3719 int interval = ceph_clock_now().sec() -
3720 xi.down_stamp.sec();
11fdf7f2
TL
3721 if (g_conf()->mon_osd_laggy_max_interval &&
3722 (interval > g_conf()->mon_osd_laggy_max_interval)) {
3723 interval = g_conf()->mon_osd_laggy_max_interval;
7c673cae
FG
3724 }
3725 xi.laggy_interval =
11fdf7f2
TL
3726 interval * g_conf()->mon_osd_laggy_weight +
3727 xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
7c673cae
FG
3728 }
3729 xi.laggy_probability =
11fdf7f2
TL
3730 g_conf()->mon_osd_laggy_weight +
3731 xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
7c673cae
FG
3732 dout(10) << " laggy, now xi " << xi << dendl;
3733 }
3734
3735 // set features shared by the osd
3736 if (m->osd_features)
3737 xi.features = m->osd_features;
3738 else
3739 xi.features = m->get_connection()->get_features();
3740
3741 // mark in?
11fdf7f2 3742 if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
7c673cae 3743 (oldstate & CEPH_OSD_AUTOOUT)) ||
11fdf7f2
TL
3744 (g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
3745 (g_conf()->mon_osd_auto_mark_in)) {
7c673cae 3746 if (can_mark_in(from)) {
9f95a23c
TL
3747 if (xi.old_weight > 0) {
3748 pending_inc.new_weight[from] = xi.old_weight;
7c673cae
FG
3749 xi.old_weight = 0;
3750 } else {
3751 pending_inc.new_weight[from] = CEPH_OSD_IN;
3752 }
3753 } else {
3754 dout(7) << __func__ << " NOIN set, will not mark in "
3755 << m->get_orig_source_addr() << dendl;
3756 }
3757 }
3758
7c673cae
FG
3759 // wait
3760 wait_for_finished_proposal(op, new C_Booted(this, op));
3761 }
3762 return true;
3763}
3764
3765void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
3766{
3767 op->mark_osdmon_event(__func__);
9f95a23c 3768 auto m = op->get_req<MOSDBoot>();
7c673cae
FG
3769 dout(7) << "_booted " << m->get_orig_source_inst()
3770 << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
3771
3772 if (logit) {
f67539c2 3773 mon.clog->info() << m->get_source() << " " << m->get_orig_source_addrs()
11fdf7f2 3774 << " boot";
7c673cae
FG
3775 }
3776
3777 send_latest(op, m->sb.current_epoch+1);
3778}
3779
3780
3781// -------------
3782// full
3783
3784bool OSDMonitor::preprocess_full(MonOpRequestRef op)
3785{
3786 op->mark_osdmon_event(__func__);
9f95a23c 3787 auto m = op->get_req<MOSDFull>();
7c673cae
FG
3788 int from = m->get_orig_source().num();
3789 set<string> state;
3790 unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
3791
3792 // check permissions, ignore if failed
11fdf7f2 3793 MonSession *session = op->get_session();
7c673cae
FG
3794 if (!session)
3795 goto ignore;
3796 if (!session->is_capable("osd", MON_CAP_X)) {
3797 dout(0) << "MOSDFull from entity with insufficient privileges:"
3798 << session->caps << dendl;
3799 goto ignore;
3800 }
3801
3802 // ignore a full message from the osd instance that already went down
3803 if (!osdmap.exists(from)) {
3804 dout(7) << __func__ << " ignoring full message from nonexistent "
3805 << m->get_orig_source_inst() << dendl;
3806 goto ignore;
3807 }
3808 if ((!osdmap.is_up(from) &&
11fdf7f2
TL
3809 osdmap.get_most_recent_addrs(from).legacy_equals(
3810 m->get_orig_source_addrs())) ||
7c673cae 3811 (osdmap.is_up(from) &&
11fdf7f2 3812 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
7c673cae
FG
3813 dout(7) << __func__ << " ignoring full message from down "
3814 << m->get_orig_source_inst() << dendl;
3815 goto ignore;
3816 }
3817
3818 OSDMap::calc_state_set(osdmap.get_state(from), state);
3819
3820 if ((osdmap.get_state(from) & mask) == m->state) {
3821 dout(7) << __func__ << " state already " << state << " for osd." << from
3822 << " " << m->get_orig_source_inst() << dendl;
3823 _reply_map(op, m->version);
3824 goto ignore;
3825 }
3826
3827 dout(10) << __func__ << " want state " << state << " for osd." << from
3828 << " " << m->get_orig_source_inst() << dendl;
3829 return false;
3830
3831 ignore:
3832 return true;
3833}
3834
3835bool OSDMonitor::prepare_full(MonOpRequestRef op)
3836{
3837 op->mark_osdmon_event(__func__);
9f95a23c 3838 auto m = op->get_req<MOSDFull>();
7c673cae
FG
3839 const int from = m->get_orig_source().num();
3840
3841 const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
3842 const unsigned want_state = m->state & mask; // safety first
3843
3844 unsigned cur_state = osdmap.get_state(from);
3845 auto p = pending_inc.new_state.find(from);
3846 if (p != pending_inc.new_state.end()) {
3847 cur_state ^= p->second;
3848 }
3849 cur_state &= mask;
3850
3851 set<string> want_state_set, cur_state_set;
3852 OSDMap::calc_state_set(want_state, want_state_set);
3853 OSDMap::calc_state_set(cur_state, cur_state_set);
3854
3855 if (cur_state != want_state) {
3856 if (p != pending_inc.new_state.end()) {
3857 p->second &= ~mask;
3858 } else {
3859 pending_inc.new_state[from] = 0;
3860 }
3861 pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
3862 dout(7) << __func__ << " osd." << from << " " << cur_state_set
3863 << " -> " << want_state_set << dendl;
3864 } else {
3865 dout(7) << __func__ << " osd." << from << " " << cur_state_set
3866 << " = wanted " << want_state_set << ", just waiting" << dendl;
3867 }
3868
3869 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
3870 return true;
3871}
3872
3873// -------------
3874// alive
3875
3876bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
3877{
3878 op->mark_osdmon_event(__func__);
9f95a23c 3879 auto m = op->get_req<MOSDAlive>();
7c673cae
FG
3880 int from = m->get_orig_source().num();
3881
3882 // check permissions, ignore if failed
11fdf7f2 3883 MonSession *session = op->get_session();
7c673cae
FG
3884 if (!session)
3885 goto ignore;
3886 if (!session->is_capable("osd", MON_CAP_X)) {
3887 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
3888 << session->caps << dendl;
3889 goto ignore;
3890 }
3891
3892 if (!osdmap.is_up(from) ||
11fdf7f2
TL
3893 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
3894 dout(7) << "preprocess_alive ignoring alive message from down "
3895 << m->get_orig_source() << " " << m->get_orig_source_addrs()
3896 << dendl;
7c673cae
FG
3897 goto ignore;
3898 }
3899
3900 if (osdmap.get_up_thru(from) >= m->want) {
3901 // yup.
3902 dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
3903 _reply_map(op, m->version);
3904 return true;
3905 }
3906
3907 dout(10) << "preprocess_alive want up_thru " << m->want
3908 << " from " << m->get_orig_source_inst() << dendl;
3909 return false;
3910
3911 ignore:
3912 return true;
3913}
3914
3915bool OSDMonitor::prepare_alive(MonOpRequestRef op)
3916{
3917 op->mark_osdmon_event(__func__);
9f95a23c 3918 auto m = op->get_req<MOSDAlive>();
7c673cae
FG
3919 int from = m->get_orig_source().num();
3920
3921 if (0) { // we probably don't care much about these
f67539c2 3922 mon.clog->debug() << m->get_orig_source_inst() << " alive";
7c673cae
FG
3923 }
3924
3925 dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
3926 << " from " << m->get_orig_source_inst() << dendl;
3927
3928 update_up_thru(from, m->version); // set to the latest map the OSD has
3929 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
3930 return true;
3931}
3932
3933void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
3934{
3935 op->mark_osdmon_event(__func__);
3936 dout(7) << "_reply_map " << e
3937 << " from " << op->get_req()->get_orig_source_inst()
3938 << dendl;
3939 send_latest(op, e);
3940}
3941
3942// pg_created
3943bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
3944{
3945 op->mark_osdmon_event(__func__);
9f95a23c 3946 auto m = op->get_req<MOSDPGCreated>();
7c673cae 3947 dout(10) << __func__ << " " << *m << dendl;
11fdf7f2 3948 auto session = op->get_session();
f67539c2 3949 mon.no_reply(op);
7c673cae
FG
3950 if (!session) {
3951 dout(10) << __func__ << ": no monitor session!" << dendl;
3952 return true;
3953 }
3954 if (!session->is_capable("osd", MON_CAP_X)) {
3955 derr << __func__ << " received from entity "
3956 << "with insufficient privileges " << session->caps << dendl;
3957 return true;
3958 }
3959 // always forward the "created!" to the leader
3960 return false;
3961}
3962
3963bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
3964{
3965 op->mark_osdmon_event(__func__);
9f95a23c 3966 auto m = op->get_req<MOSDPGCreated>();
7c673cae
FG
3967 dout(10) << __func__ << " " << *m << dendl;
3968 auto src = m->get_orig_source();
3969 auto from = src.num();
3970 if (!src.is_osd() ||
f67539c2
TL
3971 !mon.osdmon()->osdmap.is_up(from) ||
3972 !mon.osdmon()->osdmap.get_addrs(from).legacy_equals(
11fdf7f2 3973 m->get_orig_source_addrs())) {
7c673cae
FG
3974 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
3975 return false;
3976 }
3977 pending_created_pgs.push_back(m->pgid);
3978 return true;
3979}
3980
11fdf7f2
TL
3981bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op)
3982{
3983 op->mark_osdmon_event(__func__);
9f95a23c 3984 auto m = op->get_req<MOSDPGReadyToMerge>();
11fdf7f2
TL
3985 dout(10) << __func__ << " " << *m << dendl;
3986 const pg_pool_t *pi;
3987 auto session = op->get_session();
3988 if (!session) {
3989 dout(10) << __func__ << ": no monitor session!" << dendl;
3990 goto ignore;
3991 }
3992 if (!session->is_capable("osd", MON_CAP_X)) {
3993 derr << __func__ << " received from entity "
3994 << "with insufficient privileges " << session->caps << dendl;
3995 goto ignore;
3996 }
3997 pi = osdmap.get_pg_pool(m->pgid.pool());
3998 if (!pi) {
3999 derr << __func__ << " pool for " << m->pgid << " dne" << dendl;
4000 goto ignore;
4001 }
4002 if (pi->get_pg_num() <= m->pgid.ps()) {
4003 dout(20) << " pg_num " << pi->get_pg_num() << " already < " << m->pgid << dendl;
4004 goto ignore;
4005 }
4006 if (pi->get_pg_num() != m->pgid.ps() + 1) {
4007 derr << " OSD trying to merge wrong pgid " << m->pgid << dendl;
4008 goto ignore;
4009 }
4010 if (pi->get_pg_num_pending() > m->pgid.ps()) {
4011 dout(20) << " pg_num_pending " << pi->get_pg_num_pending() << " > " << m->pgid << dendl;
4012 goto ignore;
4013 }
4014 return false;
4015
4016 ignore:
f67539c2 4017 mon.no_reply(op);
11fdf7f2
TL
4018 return true;
4019}
4020
4021bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
4022{
4023 op->mark_osdmon_event(__func__);
9f95a23c 4024 auto m = op->get_req<MOSDPGReadyToMerge>();
11fdf7f2
TL
4025 dout(10) << __func__ << " " << *m << dendl;
4026 pg_pool_t p;
4027 if (pending_inc.new_pools.count(m->pgid.pool()))
4028 p = pending_inc.new_pools[m->pgid.pool()];
4029 else
4030 p = *osdmap.get_pg_pool(m->pgid.pool());
4031 if (p.get_pg_num() != m->pgid.ps() + 1 ||
4032 p.get_pg_num_pending() > m->pgid.ps()) {
4033 dout(10) << __func__
4034 << " race with concurrent pg_num[_pending] update, will retry"
4035 << dendl;
4036 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
4037 return true;
4038 }
4039
4040 if (m->ready) {
4041 p.dec_pg_num(m->pgid,
4042 pending_inc.epoch,
4043 m->source_version,
4044 m->target_version,
4045 m->last_epoch_started,
4046 m->last_epoch_clean);
4047 p.last_change = pending_inc.epoch;
4048 } else {
4049 // back off the merge attempt!
4050 p.set_pg_num_pending(p.get_pg_num());
4051 }
4052
4053 // force pre-nautilus clients to resend their ops, since they
4054 // don't understand pg_num_pending changes form a new interval
4055 p.last_force_op_resend_prenautilus = pending_inc.epoch;
4056
4057 pending_inc.new_pools[m->pgid.pool()] = p;
4058
4059 auto prob = g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability");
4060 if (m->ready &&
4061 prob > 0 &&
4062 prob > (double)(rand() % 1000)/1000.0) {
4063 derr << __func__ << " injecting pg merge pg_num bounce" << dendl;
f67539c2 4064 auto n = new MMonCommand(mon.monmap->get_fsid());
11fdf7f2
TL
4065 n->set_connection(m->get_connection());
4066 n->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
4067 osdmap.get_pool_name(m->pgid.pool()) +
4068 "\", \"var\": \"pg_num_actual\", \"val\": \"" +
4069 stringify(m->pgid.ps() + 1) + "\"}" };
f67539c2 4070 MonOpRequestRef nop = mon.op_tracker.create_request<MonOpRequest>(n);
11fdf7f2
TL
4071 nop->set_type_service();
4072 wait_for_finished_proposal(op, new C_RetryMessage(this, nop));
4073 } else {
4074 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
4075 }
4076 return true;
4077}
4078
4079
7c673cae
FG
4080// -------------
4081// pg_temp changes
4082
4083bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
4084{
9f95a23c 4085 auto m = op->get_req<MOSDPGTemp>();
7c673cae
FG
4086 dout(10) << "preprocess_pgtemp " << *m << dendl;
4087 mempool::osdmap::vector<int> empty;
4088 int from = m->get_orig_source().num();
4089 size_t ignore_cnt = 0;
4090
4091 // check caps
11fdf7f2 4092 MonSession *session = op->get_session();
7c673cae
FG
4093 if (!session)
4094 goto ignore;
4095 if (!session->is_capable("osd", MON_CAP_X)) {
4096 dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
4097 << session->caps << dendl;
4098 goto ignore;
4099 }
4100
4101 if (!osdmap.is_up(from) ||
11fdf7f2
TL
4102 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
4103 dout(7) << "ignoring pgtemp message from down "
4104 << m->get_orig_source() << " " << m->get_orig_source_addrs()
4105 << dendl;
7c673cae
FG
4106 goto ignore;
4107 }
4108
3efd9988
FG
4109 if (m->forced) {
4110 return false;
4111 }
4112
7c673cae
FG
4113 for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
4114 dout(20) << " " << p->first
31f18b77 4115 << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
7c673cae
FG
4116 << " -> " << p->second << dendl;
4117
4118 // does the pool exist?
4119 if (!osdmap.have_pg_pool(p->first.pool())) {
4120 /*
4121 * 1. If the osdmap does not have the pool, it means the pool has been
4122 * removed in-between the osd sending this message and us handling it.
4123 * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
4124 * not exist in the pending either, as the osds would not send a
4125 * message about a pool they know nothing about (yet).
4126 * 3. However, if the pool does exist in the pending, then it must be a
4127 * new pool, and not relevant to this message (see 1).
4128 */
4129 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4130 << ": pool has been removed" << dendl;
4131 ignore_cnt++;
4132 continue;
4133 }
4134
4135 int acting_primary = -1;
4136 osdmap.pg_to_up_acting_osds(
4137 p->first, nullptr, nullptr, nullptr, &acting_primary);
4138 if (acting_primary != from) {
4139 /* If the source isn't the primary based on the current osdmap, we know
4140 * that the interval changed and that we can discard this message.
4141 * Indeed, we must do so to avoid 16127 since we can't otherwise determine
4142 * which of two pg temp mappings on the same pg is more recent.
4143 */
4144 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4145 << ": primary has changed" << dendl;
4146 ignore_cnt++;
4147 continue;
4148 }
4149
4150 // removal?
4151 if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
4152 osdmap.primary_temp->count(p->first)))
4153 return false;
4154 // change?
4155 // NOTE: we assume that this will clear pg_primary, so consider
4156 // an existing pg_primary field to imply a change
4157 if (p->second.size() &&
4158 (osdmap.pg_temp->count(p->first) == 0 ||
11fdf7f2 4159 osdmap.pg_temp->get(p->first) != p->second ||
7c673cae
FG
4160 osdmap.primary_temp->count(p->first)))
4161 return false;
4162 }
4163
4164 // should we ignore all the pgs?
4165 if (ignore_cnt == m->pg_temp.size())
4166 goto ignore;
4167
4168 dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
4169 _reply_map(op, m->map_epoch);
4170 return true;
4171
4172 ignore:
f67539c2 4173 mon.no_reply(op);
7c673cae
FG
4174 return true;
4175}
4176
4177void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
4178{
4179 epoch_t old_up_thru = osdmap.get_up_thru(from);
4180 auto ut = pending_inc.new_up_thru.find(from);
4181 if (ut != pending_inc.new_up_thru.end()) {
4182 old_up_thru = ut->second;
4183 }
4184 if (up_thru > old_up_thru) {
4185 // set up_thru too, so the osd doesn't have to ask again
4186 pending_inc.new_up_thru[from] = up_thru;
4187 }
4188}
4189
4190bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
4191{
4192 op->mark_osdmon_event(__func__);
9f95a23c 4193 auto m = op->get_req<MOSDPGTemp>();
7c673cae
FG
4194 int from = m->get_orig_source().num();
4195 dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
4196 for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
4197 uint64_t pool = p->first.pool();
4198 if (pending_inc.old_pools.count(pool)) {
4199 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4200 << ": pool pending removal" << dendl;
4201 continue;
4202 }
4203 if (!osdmap.have_pg_pool(pool)) {
4204 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4205 << ": pool has been removed" << dendl;
4206 continue;
4207 }
4208 pending_inc.new_pg_temp[p->first] =
4209 mempool::osdmap::vector<int>(p->second.begin(), p->second.end());
4210
4211 // unconditionally clear pg_primary (until this message can encode
4212 // a change for that, too.. at which point we need to also fix
4213 // preprocess_pg_temp)
4214 if (osdmap.primary_temp->count(p->first) ||
4215 pending_inc.new_primary_temp.count(p->first))
4216 pending_inc.new_primary_temp[p->first] = -1;
4217 }
4218
4219 // set up_thru too, so the osd doesn't have to ask again
4220 update_up_thru(from, m->map_epoch);
4221
4222 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
4223 return true;
4224}
4225
4226
4227// ---
4228
4229bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
4230{
4231 op->mark_osdmon_event(__func__);
9f95a23c 4232 auto m = op->get_req<MRemoveSnaps>();
7c673cae
FG
4233 dout(7) << "preprocess_remove_snaps " << *m << dendl;
4234
4235 // check privilege, ignore if failed
11fdf7f2 4236 MonSession *session = op->get_session();
f67539c2 4237 mon.no_reply(op);
7c673cae
FG
4238 if (!session)
4239 goto ignore;
4240 if (!session->caps.is_capable(
11fdf7f2 4241 cct,
7c673cae 4242 session->entity_name,
11fdf7f2
TL
4243 "osd", "osd pool rmsnap", {}, true, true, false,
4244 session->get_peer_socket_addr())) {
7c673cae
FG
4245 dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
4246 << session->caps << dendl;
4247 goto ignore;
4248 }
4249
4250 for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
4251 q != m->snaps.end();
4252 ++q) {
4253 if (!osdmap.have_pg_pool(q->first)) {
9f95a23c
TL
4254 dout(10) << " ignoring removed_snaps " << q->second
4255 << " on non-existent pool " << q->first << dendl;
7c673cae
FG
4256 continue;
4257 }
4258 const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
4259 for (vector<snapid_t>::iterator p = q->second.begin();
4260 p != q->second.end();
4261 ++p) {
4262 if (*p > pi->get_snap_seq() ||
9f95a23c 4263 !_is_removed_snap(q->first, *p)) {
7c673cae 4264 return false;
9f95a23c 4265 }
7c673cae
FG
4266 }
4267 }
4268
9f95a23c
TL
4269 if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
4270 auto reply = make_message<MRemoveSnaps>();
4271 reply->snaps = m->snaps;
f67539c2 4272 mon.send_reply(op, reply.detach());
9f95a23c
TL
4273 }
4274
7c673cae
FG
4275 ignore:
4276 return true;
4277}
4278
4279bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
4280{
4281 op->mark_osdmon_event(__func__);
9f95a23c 4282 auto m = op->get_req<MRemoveSnaps>();
7c673cae
FG
4283 dout(7) << "prepare_remove_snaps " << *m << dendl;
4284
9f95a23c
TL
4285 for (auto& [pool, snaps] : m->snaps) {
4286 if (!osdmap.have_pg_pool(pool)) {
4287 dout(10) << " ignoring removed_snaps " << snaps
4288 << " on non-existent pool " << pool << dendl;
7c673cae
FG
4289 continue;
4290 }
4291
9f95a23c
TL
4292 pg_pool_t& pi = osdmap.pools[pool];
4293 for (auto s : snaps) {
4294 if (!_is_removed_snap(pool, s) &&
4295 (!pending_inc.new_pools.count(pool) ||
4296 !pending_inc.new_pools[pool].removed_snaps.contains(s)) &&
4297 (!pending_inc.new_removed_snaps.count(pool) ||
4298 !pending_inc.new_removed_snaps[pool].contains(s))) {
4299 pg_pool_t *newpi = pending_inc.get_new_pool(pool, &pi);
4300 if (osdmap.require_osd_release < ceph_release_t::octopus) {
4301 newpi->removed_snaps.insert(s);
4302 dout(10) << " pool " << pool << " removed_snaps added " << s
4303 << " (now " << newpi->removed_snaps << ")" << dendl;
4304 }
11fdf7f2 4305 newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
9f95a23c
TL
4306 if (s > newpi->get_snap_seq()) {
4307 dout(10) << " pool " << pool << " snap_seq "
4308 << newpi->get_snap_seq() << " -> " << s << dendl;
4309 newpi->set_snap_seq(s);
7c673cae
FG
4310 }
4311 newpi->set_snap_epoch(pending_inc.epoch);
9f95a23c
TL
4312 dout(10) << " added pool " << pool << " snap " << s
4313 << " to removed_snaps queue" << dendl;
4314 pending_inc.new_removed_snaps[pool].insert(s);
7c673cae
FG
4315 }
4316 }
4317 }
9f95a23c
TL
4318
4319 if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
4320 auto reply = make_message<MRemoveSnaps>();
4321 reply->snaps = m->snaps;
4322 wait_for_finished_proposal(op, new C_ReplyOp(this, op, reply));
4323 }
4324
4325 return true;
4326}
4327
4328bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op)
4329{
4330 op->mark_osdmon_event(__func__);
4331 auto m = op->get_req<MMonGetPurgedSnaps>();
4332 dout(7) << __func__ << " " << *m << dendl;
4333
4334 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> r;
4335
4336 string k = make_purged_snap_epoch_key(m->start);
f67539c2 4337 auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
9f95a23c
TL
4338 it->upper_bound(k);
4339 unsigned long epoch = m->last;
4340 while (it->valid()) {
4341 if (it->key().find("purged_epoch_") != 0) {
4342 break;
4343 }
4344 string k = it->key();
4345 int n = sscanf(k.c_str(), "purged_epoch_%lx", &epoch);
4346 if (n != 1) {
4347 derr << __func__ << " unable to parse key '" << it->key() << "'" << dendl;
4348 } else if (epoch > m->last) {
4349 break;
4350 } else {
4351 bufferlist bl = it->value();
4352 auto p = bl.cbegin();
4353 auto &v = r[epoch];
4354 try {
4355 ceph::decode(v, p);
f67539c2 4356 } catch (ceph::buffer::error& e) {
9f95a23c
TL
4357 derr << __func__ << " unable to parse value for key '" << it->key()
4358 << "': \n";
4359 bl.hexdump(*_dout);
4360 *_dout << dendl;
4361 }
4362 n += 4 + v.size() * 16;
4363 }
4364 if (n > 1048576) {
4365 // impose a semi-arbitrary limit to message size
4366 break;
4367 }
4368 it->next();
4369 }
4370
4371 auto reply = make_message<MMonGetPurgedSnapsReply>(m->start, epoch);
4372 reply->purged_snaps.swap(r);
f67539c2 4373 mon.send_reply(op, reply.detach());
9f95a23c 4374
7c673cae
FG
4375 return true;
4376}
4377
4378// osd beacon
4379bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
4380{
4381 op->mark_osdmon_event(__func__);
7c673cae 4382 // check caps
11fdf7f2 4383 auto session = op->get_session();
f67539c2 4384 mon.no_reply(op);
7c673cae
FG
4385 if (!session) {
4386 dout(10) << __func__ << " no monitor session!" << dendl;
4387 return true;
4388 }
4389 if (!session->is_capable("osd", MON_CAP_X)) {
4390 derr << __func__ << " received from entity "
4391 << "with insufficient privileges " << session->caps << dendl;
4392 return true;
4393 }
4394 // Always forward the beacon to the leader, even if they are the same as
4395 // the old one. The leader will mark as down osds that haven't sent
4396 // beacon for a few minutes.
4397 return false;
4398}
4399
4400bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
4401{
4402 op->mark_osdmon_event(__func__);
9f95a23c 4403 const auto beacon = op->get_req<MOSDBeacon>();
7c673cae
FG
4404 const auto src = beacon->get_orig_source();
4405 dout(10) << __func__ << " " << *beacon
4406 << " from " << src << dendl;
4407 int from = src.num();
4408
4409 if (!src.is_osd() ||
4410 !osdmap.is_up(from) ||
11fdf7f2
TL
4411 !osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs())) {
4412 if (src.is_osd() && !osdmap.is_up(from)) {
4413 // share some new maps with this guy in case it may not be
4414 // aware of its own deadness...
4415 send_latest(op, beacon->version+1);
4416 }
4417 dout(1) << " ignoring beacon from non-active osd." << from << dendl;
7c673cae
FG
4418 return false;
4419 }
4420
f67539c2
TL
4421 last_osd_report[from].first = ceph_clock_now();
4422 last_osd_report[from].second = beacon->osd_beacon_report_interval;
7c673cae
FG
4423 osd_epochs[from] = beacon->version;
4424
4425 for (const auto& pg : beacon->pgs) {
522d829b
TL
4426 if (auto* pool = osdmap.get_pg_pool(pg.pool()); pool != nullptr) {
4427 unsigned pg_num = pool->get_pg_num();
4428 last_epoch_clean.report(pg_num, pg, beacon->min_last_epoch_clean);
4429 }
7c673cae 4430 }
9f95a23c
TL
4431
4432 if (osdmap.osd_xinfo[from].last_purged_snaps_scrub <
4433 beacon->last_purged_snaps_scrub) {
4434 if (pending_inc.new_xinfo.count(from) == 0) {
4435 pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
4436 }
4437 pending_inc.new_xinfo[from].last_purged_snaps_scrub =
4438 beacon->last_purged_snaps_scrub;
4439 return true;
4440 } else {
4441 return false;
4442 }
7c673cae
FG
4443}
4444
4445// ---------------
4446// map helpers
4447
4448void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
4449{
4450 op->mark_osdmon_event(__func__);
4451 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
4452 << " start " << start << dendl;
4453 if (start == 0)
4454 send_full(op);
4455 else
4456 send_incremental(op, start);
4457}
4458
4459
28e407b8 4460MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
7c673cae 4461{
f67539c2 4462 MOSDMap *r = new MOSDMap(mon.monmap->fsid, features);
28e407b8 4463 get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
7c673cae
FG
4464 r->oldest_map = get_first_committed();
4465 r->newest_map = osdmap.get_epoch();
4466 return r;
4467}
4468
28e407b8 4469MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
7c673cae 4470{
11fdf7f2
TL
4471 dout(10) << "build_incremental [" << from << ".." << to << "] with features "
4472 << std::hex << features << std::dec << dendl;
f67539c2 4473 MOSDMap *m = new MOSDMap(mon.monmap->fsid, features);
7c673cae
FG
4474 m->oldest_map = get_first_committed();
4475 m->newest_map = osdmap.get_epoch();
4476
4477 for (epoch_t e = to; e >= from && e > 0; e--) {
4478 bufferlist bl;
28e407b8 4479 int err = get_version(e, features, bl);
7c673cae 4480 if (err == 0) {
11fdf7f2 4481 ceph_assert(bl.length());
7c673cae
FG
4482 // if (get_version(e, bl) > 0) {
4483 dout(20) << "build_incremental inc " << e << " "
4484 << bl.length() << " bytes" << dendl;
4485 m->incremental_maps[e] = bl;
4486 } else {
11fdf7f2
TL
4487 ceph_assert(err == -ENOENT);
4488 ceph_assert(!bl.length());
28e407b8 4489 get_version_full(e, features, bl);
7c673cae
FG
4490 if (bl.length() > 0) {
4491 //else if (get_version("full", e, bl) > 0) {
4492 dout(20) << "build_incremental full " << e << " "
4493 << bl.length() << " bytes" << dendl;
4494 m->maps[e] = bl;
4495 } else {
4496 ceph_abort(); // we should have all maps.
4497 }
4498 }
4499 }
4500 return m;
4501}
4502
4503void OSDMonitor::send_full(MonOpRequestRef op)
4504{
4505 op->mark_osdmon_event(__func__);
4506 dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
f67539c2 4507 mon.send_reply(op, build_latest_full(op->get_session()->con_features));
7c673cae
FG
4508}
4509
4510void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
4511{
4512 op->mark_osdmon_event(__func__);
4513
4514 MonSession *s = op->get_session();
11fdf7f2 4515 ceph_assert(s);
7c673cae 4516
11fdf7f2 4517 if (s->proxy_con) {
7c673cae
FG
4518 // oh, we can tell the other mon to do it
4519 dout(10) << __func__ << " asking proxying mon to send_incremental from "
4520 << first << dendl;
4521 MRoute *r = new MRoute(s->proxy_tid, NULL);
4522 r->send_osdmap_first = first;
4523 s->proxy_con->send_message(r);
4524 op->mark_event("reply: send routed send_osdmap_first reply");
4525 } else {
4526 // do it ourselves
4527 send_incremental(first, s, false, op);
4528 }
4529}
4530
4531void OSDMonitor::send_incremental(epoch_t first,
4532 MonSession *session,
4533 bool onetime,
4534 MonOpRequestRef req)
4535{
4536 dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
11fdf7f2 4537 << " to " << session->name << dendl;
7c673cae 4538
28e407b8
AA
4539 // get feature of the peer
4540 // use quorum_con_features, if it's an anonymous connection.
4541 uint64_t features = session->con_features ? session->con_features :
f67539c2 4542 mon.get_quorum_con_features();
28e407b8 4543
7c673cae 4544 if (first <= session->osd_epoch) {
11fdf7f2 4545 dout(10) << __func__ << " " << session->name << " should already have epoch "
7c673cae
FG
4546 << session->osd_epoch << dendl;
4547 first = session->osd_epoch + 1;
4548 }
4549
4550 if (first < get_first_committed()) {
11fdf7f2
TL
4551 MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
4552 m->oldest_map = get_first_committed();
4553 m->newest_map = osdmap.get_epoch();
4554
7c673cae
FG
4555 first = get_first_committed();
4556 bufferlist bl;
28e407b8 4557 int err = get_version_full(first, features, bl);
11fdf7f2
TL
4558 ceph_assert(err == 0);
4559 ceph_assert(bl.length());
7c673cae
FG
4560 dout(20) << "send_incremental starting with base full "
4561 << first << " " << bl.length() << " bytes" << dendl;
7c673cae
FG
4562 m->maps[first] = bl;
4563
4564 if (req) {
f67539c2 4565 mon.send_reply(req, m);
7c673cae
FG
4566 session->osd_epoch = first;
4567 return;
4568 } else {
4569 session->con->send_message(m);
4570 session->osd_epoch = first;
4571 }
4572 first++;
4573 }
4574
4575 while (first <= osdmap.get_epoch()) {
11fdf7f2 4576 epoch_t last = std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1,
28e407b8
AA
4577 osdmap.get_epoch());
4578 MOSDMap *m = build_incremental(first, last, features);
7c673cae
FG
4579
4580 if (req) {
4581 // send some maps. it may not be all of them, but it will get them
4582 // started.
f67539c2 4583 mon.send_reply(req, m);
7c673cae
FG
4584 } else {
4585 session->con->send_message(m);
4586 first = last + 1;
4587 }
4588 session->osd_epoch = last;
4589 if (onetime || req)
4590 break;
4591 }
4592}
4593
4594int OSDMonitor::get_version(version_t ver, bufferlist& bl)
4595{
f67539c2 4596 return get_version(ver, mon.get_quorum_con_features(), bl);
28e407b8
AA
4597}
4598
4599void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
4600{
4601 OSDMap::Incremental inc;
11fdf7f2 4602 auto q = bl.cbegin();
28e407b8
AA
4603 inc.decode(q);
4604 // always encode with subset of osdmap's canonical features
4605 uint64_t f = features & inc.encode_features;
4606 dout(20) << __func__ << " " << inc.epoch << " with features " << f
4607 << dendl;
4608 bl.clear();
4609 if (inc.fullmap.length()) {
4610 // embedded full map?
4611 OSDMap m;
4612 m.decode(inc.fullmap);
4613 inc.fullmap.clear();
4614 m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
4615 }
4616 if (inc.crush.length()) {
4617 // embedded crush map
4618 CrushWrapper c;
11fdf7f2 4619 auto p = inc.crush.cbegin();
28e407b8
AA
4620 c.decode(p);
4621 inc.crush.clear();
4622 c.encode(inc.crush, f);
4623 }
4624 inc.encode(bl, f | CEPH_FEATURE_RESERVED);
4625}
4626
4627void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
4628{
4629 OSDMap m;
11fdf7f2 4630 auto q = bl.cbegin();
28e407b8
AA
4631 m.decode(q);
4632 // always encode with subset of osdmap's canonical features
4633 uint64_t f = features & m.get_encoding_features();
4634 dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
4635 << dendl;
4636 bl.clear();
4637 m.encode(bl, f | CEPH_FEATURE_RESERVED);
4638}
4639
4640int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
4641{
4642 uint64_t significant_features = OSDMap::get_significant_features(features);
4643 if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
4644 return 0;
4645 }
4646 int ret = PaxosService::get_version(ver, bl);
4647 if (ret < 0) {
7c673cae 4648 return ret;
28e407b8
AA
4649 }
4650 // NOTE: this check is imprecise; the OSDMap encoding features may
4651 // be a subset of the latest mon quorum features, but worst case we
4652 // reencode once and then cache the (identical) result under both
4653 // feature masks.
4654 if (significant_features !=
f67539c2 4655 OSDMap::get_significant_features(mon.get_quorum_con_features())) {
28e407b8
AA
4656 reencode_incremental_map(bl, features);
4657 }
eafe8130 4658 inc_osd_cache.add_bytes({ver, significant_features}, bl);
28e407b8 4659 return 0;
7c673cae
FG
4660}
4661
11fdf7f2
TL
4662int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc)
4663{
4664 bufferlist inc_bl;
4665 int err = get_version(ver, inc_bl);
4666 ceph_assert(err == 0);
4667 ceph_assert(inc_bl.length());
4668
4669 auto p = inc_bl.cbegin();
4670 inc.decode(p);
4671 dout(10) << __func__ << " "
4672 << " epoch " << inc.epoch
4673 << " inc_crc " << inc.inc_crc
4674 << " full_crc " << inc.full_crc
4675 << " encode_features " << inc.encode_features << dendl;
4676 return 0;
4677}
4678
4679int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl)
4680{
4681 dout(10) << __func__ << " ver " << ver << dendl;
4682
4683 version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver);
4684 if (closest_pinned == 0) {
4685 return -ENOENT;
4686 }
4687 if (closest_pinned > ver) {
4688 dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl;
4689 }
4690 ceph_assert(closest_pinned <= ver);
4691
4692 dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl;
4693
4694 // get osdmap incremental maps and apply on top of this one.
4695 bufferlist osdm_bl;
4696 bool has_cached_osdmap = false;
4697 for (version_t v = ver-1; v >= closest_pinned; --v) {
f67539c2 4698 if (full_osd_cache.lookup({v, mon.get_quorum_con_features()},
11fdf7f2
TL
4699 &osdm_bl)) {
4700 dout(10) << __func__ << " found map in cache ver " << v << dendl;
4701 closest_pinned = v;
4702 has_cached_osdmap = true;
4703 break;
4704 }
4705 }
4706
4707 if (!has_cached_osdmap) {
4708 int err = PaxosService::get_version_full(closest_pinned, osdm_bl);
4709 if (err != 0) {
4710 derr << __func__ << " closest pinned map ver " << closest_pinned
4711 << " not available! error: " << cpp_strerror(err) << dendl;
4712 }
4713 ceph_assert(err == 0);
4714 }
4715
4716 ceph_assert(osdm_bl.length());
4717
4718 OSDMap osdm;
4719 osdm.decode(osdm_bl);
4720
4721 dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned
4722 << " e" << osdm.epoch
4723 << " crc " << osdm.get_crc()
4724 << " -- applying incremental maps." << dendl;
4725
4726 uint64_t encode_features = 0;
4727 for (version_t v = closest_pinned + 1; v <= ver; ++v) {
4728 dout(20) << __func__ << " applying inc epoch " << v << dendl;
4729
4730 OSDMap::Incremental inc;
4731 int err = get_inc(v, inc);
4732 ceph_assert(err == 0);
4733
4734 encode_features = inc.encode_features;
4735
4736 err = osdm.apply_incremental(inc);
4737 ceph_assert(err == 0);
4738
4739 // this block performs paranoid checks on map retrieval
4740 if (g_conf().get_val<bool>("mon_debug_extra_checks") &&
4741 inc.full_crc != 0) {
4742
4743 uint64_t f = encode_features;
4744 if (!f) {
f67539c2 4745 f = (mon.quorum_con_features ? mon.quorum_con_features : -1);
11fdf7f2
TL
4746 }
4747
4748 // encode osdmap to force calculating crcs
4749 bufferlist tbl;
4750 osdm.encode(tbl, f | CEPH_FEATURE_RESERVED);
4751 // decode osdmap to compare crcs with what's expected by incremental
4752 OSDMap tosdm;
4753 tosdm.decode(tbl);
4754
4755 if (tosdm.get_crc() != inc.full_crc) {
4756 derr << __func__
4757 << " osdmap crc mismatch! (osdmap crc " << tosdm.get_crc()
4758 << ", expected " << inc.full_crc << ")" << dendl;
4759 ceph_abort_msg("osdmap crc mismatch");
4760 }
4761 }
4762
4763 // note: we cannot add the recently computed map to the cache, as is,
4764 // because we have not encoded the map into a bl.
4765 }
4766
4767 if (!encode_features) {
4768 dout(10) << __func__
4769 << " last incremental map didn't have features;"
4770 << " defaulting to quorum's or all" << dendl;
4771 encode_features =
f67539c2 4772 (mon.quorum_con_features ? mon.quorum_con_features : -1);
11fdf7f2
TL
4773 }
4774 osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED);
4775
4776 return 0;
4777}
4778
7c673cae
FG
4779int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
4780{
f67539c2 4781 return get_version_full(ver, mon.get_quorum_con_features(), bl);
28e407b8
AA
4782}
4783
4784int OSDMonitor::get_version_full(version_t ver, uint64_t features,
4785 bufferlist& bl)
4786{
4787 uint64_t significant_features = OSDMap::get_significant_features(features);
4788 if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
4789 return 0;
4790 }
4791 int ret = PaxosService::get_version_full(ver, bl);
11fdf7f2
TL
4792 if (ret == -ENOENT) {
4793 // build map?
4794 ret = get_full_from_pinned_map(ver, bl);
4795 }
28e407b8 4796 if (ret < 0) {
7c673cae 4797 return ret;
28e407b8
AA
4798 }
4799 // NOTE: this check is imprecise; the OSDMap encoding features may
4800 // be a subset of the latest mon quorum features, but worst case we
4801 // reencode once and then cache the (identical) result under both
4802 // feature masks.
4803 if (significant_features !=
f67539c2 4804 OSDMap::get_significant_features(mon.get_quorum_con_features())) {
28e407b8
AA
4805 reencode_full_map(bl, features);
4806 }
eafe8130 4807 full_osd_cache.add_bytes({ver, significant_features}, bl);
28e407b8 4808 return 0;
7c673cae
FG
4809}
4810
f67539c2 4811epoch_t OSDMonitor::blocklist(const entity_addrvec_t& av, utime_t until)
11fdf7f2 4812{
f67539c2 4813 dout(10) << "blocklist " << av << " until " << until << dendl;
11fdf7f2 4814 for (auto a : av.v) {
9f95a23c 4815 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
11fdf7f2
TL
4816 a.set_type(entity_addr_t::TYPE_ANY);
4817 } else {
4818 a.set_type(entity_addr_t::TYPE_LEGACY);
4819 }
f67539c2 4820 pending_inc.new_blocklist[a] = until;
11fdf7f2
TL
4821 }
4822 return pending_inc.epoch;
4823}
4824
f67539c2 4825epoch_t OSDMonitor::blocklist(entity_addr_t a, utime_t until)
7c673cae 4826{
9f95a23c 4827 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
11fdf7f2
TL
4828 a.set_type(entity_addr_t::TYPE_ANY);
4829 } else {
4830 a.set_type(entity_addr_t::TYPE_LEGACY);
4831 }
f67539c2
TL
4832 dout(10) << "blocklist " << a << " until " << until << dendl;
4833 pending_inc.new_blocklist[a] = until;
7c673cae
FG
4834 return pending_inc.epoch;
4835}
4836
4837
4838void OSDMonitor::check_osdmap_subs()
4839{
4840 dout(10) << __func__ << dendl;
4841 if (!osdmap.get_epoch()) {
4842 return;
4843 }
f67539c2
TL
4844 auto osdmap_subs = mon.session_map.subs.find("osdmap");
4845 if (osdmap_subs == mon.session_map.subs.end()) {
7c673cae
FG
4846 return;
4847 }
4848 auto p = osdmap_subs->second->begin();
4849 while (!p.end()) {
4850 auto sub = *p;
4851 ++p;
4852 check_osdmap_sub(sub);
4853 }
4854}
4855
4856void OSDMonitor::check_osdmap_sub(Subscription *sub)
4857{
4858 dout(10) << __func__ << " " << sub << " next " << sub->next
4859 << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
4860 if (sub->next <= osdmap.get_epoch()) {
4861 if (sub->next >= 1)
4862 send_incremental(sub->next, sub->session, sub->incremental_onetime);
4863 else
28e407b8 4864 sub->session->con->send_message(build_latest_full(sub->session->con_features));
7c673cae 4865 if (sub->onetime)
f67539c2 4866 mon.session_map.remove_sub(sub);
7c673cae
FG
4867 else
4868 sub->next = osdmap.get_epoch() + 1;
4869 }
4870}
4871
4872void OSDMonitor::check_pg_creates_subs()
4873{
7c673cae
FG
4874 if (!osdmap.get_num_up_osds()) {
4875 return;
4876 }
11fdf7f2 4877 ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
f67539c2 4878 mon.with_session_map([this](const MonSessionMap& session_map) {
7c673cae
FG
4879 auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
4880 if (pg_creates_subs == session_map.subs.end()) {
4881 return;
4882 }
4883 for (auto sub : *pg_creates_subs->second) {
4884 check_pg_creates_sub(sub);
4885 }
4886 });
4887}
4888
4889void OSDMonitor::check_pg_creates_sub(Subscription *sub)
4890{
11fdf7f2
TL
4891 dout(20) << __func__ << " .. " << sub->session->name << dendl;
4892 ceph_assert(sub->type == "osd_pg_creates");
7c673cae
FG
4893 // only send these if the OSD is up. we will check_subs() when they do
4894 // come up so they will get the creates then.
11fdf7f2 4895 if (sub->session->name.is_osd() &&
f67539c2 4896 mon.osdmon()->osdmap.is_up(sub->session->name.num())) {
11fdf7f2 4897 sub->next = send_pg_creates(sub->session->name.num(),
7c673cae
FG
4898 sub->session->con.get(),
4899 sub->next);
4900 }
4901}
4902
c07f9fc5 4903void OSDMonitor::do_application_enable(int64_t pool_id,
11fdf7f2
TL
4904 const std::string &app_name,
4905 const std::string &app_key,
1911f103
TL
4906 const std::string &app_value,
4907 bool force)
c07f9fc5 4908{
f67539c2 4909 ceph_assert(paxos.is_plugged() && is_writeable());
c07f9fc5
FG
4910
4911 dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
4912 << dendl;
4913
9f95a23c 4914 ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);
35e4c445 4915
c07f9fc5 4916 auto pp = osdmap.get_pg_pool(pool_id);
11fdf7f2 4917 ceph_assert(pp != nullptr);
c07f9fc5
FG
4918
4919 pg_pool_t p = *pp;
4920 if (pending_inc.new_pools.count(pool_id)) {
4921 p = pending_inc.new_pools[pool_id];
4922 }
4923
11fdf7f2
TL
4924 if (app_key.empty()) {
4925 p.application_metadata.insert({app_name, {}});
4926 } else {
1911f103
TL
4927 if (force) {
4928 p.application_metadata[app_name][app_key] = app_value;
4929 } else {
4930 p.application_metadata.insert({app_name, {{app_key, app_value}}});
4931 }
11fdf7f2 4932 }
c07f9fc5
FG
4933 p.last_change = pending_inc.epoch;
4934 pending_inc.new_pools[pool_id] = p;
4935}
4936
494da23a
TL
4937void OSDMonitor::do_set_pool_opt(int64_t pool_id,
4938 pool_opts_t::key_t opt,
4939 pool_opts_t::value_t val)
4940{
4941 auto p = pending_inc.new_pools.try_emplace(
4942 pool_id, *osdmap.get_pg_pool(pool_id));
4943 p.first->second.opts.set(opt, val);
4944}
4945
31f18b77 4946unsigned OSDMonitor::scan_for_creating_pgs(
7c673cae
FG
4947 const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
4948 const mempool::osdmap::set<int64_t>& removed_pools,
4949 utime_t modified,
4950 creating_pgs_t* creating_pgs) const
4951{
31f18b77 4952 unsigned queued = 0;
7c673cae
FG
4953 for (auto& p : pools) {
4954 int64_t poolid = p.first;
11fdf7f2
TL
4955 if (creating_pgs->created_pools.count(poolid)) {
4956 dout(10) << __func__ << " already created " << poolid << dendl;
4957 continue;
4958 }
7c673cae 4959 const pg_pool_t& pool = p.second;
31f18b77 4960 int ruleno = osdmap.crush->find_rule(pool.get_crush_rule(),
7c673cae
FG
4961 pool.get_type(), pool.get_size());
4962 if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
4963 continue;
4964
4965 const auto last_scan_epoch = creating_pgs->last_scan_epoch;
4966 const auto created = pool.get_last_change();
4967 if (last_scan_epoch && created <= last_scan_epoch) {
4968 dout(10) << __func__ << " no change in pool " << poolid
4969 << " " << pool << dendl;
4970 continue;
4971 }
4972 if (removed_pools.count(poolid)) {
4973 dout(10) << __func__ << " pool is being removed: " << poolid
4974 << " " << pool << dendl;
4975 continue;
4976 }
31f18b77 4977 dout(10) << __func__ << " queueing pool create for " << poolid
7c673cae 4978 << " " << pool << dendl;
11fdf7f2
TL
4979 creating_pgs->create_pool(poolid, pool.get_pg_num(),
4980 created, modified);
4981 queued++;
7c673cae 4982 }
31f18b77 4983 return queued;
7c673cae
FG
4984}
4985
4986void OSDMonitor::update_creating_pgs()
4987{
31f18b77
FG
4988 dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
4989 << creating_pgs.queue.size() << " pools in queue" << dendl;
7c673cae
FG
4990 decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
4991 std::lock_guard<std::mutex> l(creating_pgs_lock);
c07f9fc5 4992 for (const auto& pg : creating_pgs.pgs) {
7c673cae
FG
4993 int acting_primary = -1;
4994 auto pgid = pg.first;
94b18763
FG
4995 if (!osdmap.pg_exists(pgid)) {
4996 dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
4997 << dendl;
4998 continue;
4999 }
9f95a23c 5000 auto mapped = pg.second.create_epoch;
c07f9fc5 5001 dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
11fdf7f2
TL
5002 spg_t spgid(pgid);
5003 mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
7c673cae
FG
5004 // check the previous creating_pgs, look for the target to whom the pg was
5005 // previously mapped
5006 for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
5007 const auto last_acting_primary = pgs_by_epoch.first;
5008 for (auto& pgs: pgs_by_epoch.second) {
11fdf7f2 5009 if (pgs.second.count(spgid)) {
7c673cae
FG
5010 if (last_acting_primary == acting_primary) {
5011 mapped = pgs.first;
5012 } else {
5013 dout(20) << __func__ << " " << pgid << " "
5014 << " acting_primary:" << last_acting_primary
5015 << " -> " << acting_primary << dendl;
5016 // note epoch if the target of the create message changed.
5017 mapped = mapping.get_epoch();
5018 }
5019 break;
31f18b77
FG
5020 } else {
5021 // newly creating
5022 mapped = mapping.get_epoch();
5023 }
7c673cae
FG
5024 }
5025 }
5026 dout(10) << __func__ << " will instruct osd." << acting_primary
c07f9fc5 5027 << " to create " << pgid << "@" << mapped << dendl;
11fdf7f2 5028 new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid);
7c673cae
FG
5029 }
5030 creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
5031 creating_pgs_epoch = mapping.get_epoch();
5032}
5033
c07f9fc5 5034epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
7c673cae
FG
5035{
5036 dout(30) << __func__ << " osd." << osd << " next=" << next
5037 << " " << creating_pgs_by_osd_epoch << dendl;
5038 std::lock_guard<std::mutex> l(creating_pgs_lock);
b5b8bbf5
FG
5039 if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
5040 dout(20) << __func__
5041 << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
5042 // the subscribers will be updated when the mapping is completed anyway
5043 return next;
5044 }
7c673cae
FG
5045 auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
5046 if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
5047 return next;
11fdf7f2
TL
5048 ceph_assert(!creating_pgs_by_epoch->second.empty());
5049
5050 MOSDPGCreate *oldm = nullptr; // for pre-mimic OSD compat
5051 MOSDPGCreate2 *m = nullptr;
5052
9f95a23c 5053 bool old = osdmap.require_osd_release < ceph_release_t::nautilus;
7c673cae 5054
7c673cae
FG
5055 epoch_t last = 0;
5056 for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
5057 epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
5058 auto epoch = epoch_pgs->first;
5059 auto& pgs = epoch_pgs->second;
5060 dout(20) << __func__ << " osd." << osd << " from " << next
5061 << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
5062 last = epoch;
5063 for (auto& pg : pgs) {
7c673cae
FG
5064 // Need the create time from the monitor using its clock to set
5065 // last_scrub_stamp upon pg creation.
11fdf7f2
TL
5066 auto create = creating_pgs.pgs.find(pg.pgid);
5067 ceph_assert(create != creating_pgs.pgs.end());
5068 if (old) {
5069 if (!oldm) {
5070 oldm = new MOSDPGCreate(creating_pgs_epoch);
5071 }
5072 oldm->mkpg.emplace(pg.pgid,
9f95a23c
TL
5073 pg_create_t{create->second.create_epoch, pg.pgid, 0});
5074 oldm->ctimes.emplace(pg.pgid, create->second.create_stamp);
11fdf7f2
TL
5075 } else {
5076 if (!m) {
5077 m = new MOSDPGCreate2(creating_pgs_epoch);
5078 }
9f95a23c
TL
5079 m->pgs.emplace(pg, make_pair(create->second.create_epoch,
5080 create->second.create_stamp));
5081 if (create->second.history.epoch_created) {
5082 dout(20) << __func__ << " " << pg << " " << create->second.history
5083 << " " << create->second.past_intervals << dendl;
5084 m->pg_extra.emplace(pg, make_pair(create->second.history,
5085 create->second.past_intervals));
5086 }
11fdf7f2 5087 }
7c673cae 5088 dout(20) << __func__ << " will create " << pg
9f95a23c 5089 << " at " << create->second.create_epoch << dendl;
7c673cae
FG
5090 }
5091 }
11fdf7f2
TL
5092 if (m) {
5093 con->send_message(m);
5094 } else if (oldm) {
5095 con->send_message(oldm);
5096 } else {
7c673cae
FG
5097 dout(20) << __func__ << " osd." << osd << " from " << next
5098 << " has nothing to send" << dendl;
5099 return next;
5100 }
11fdf7f2 5101
7c673cae
FG
5102 // sub is current through last + 1
5103 return last + 1;
5104}
5105
5106// TICK
5107
5108
5109void OSDMonitor::tick()
5110{
5111 if (!is_active()) return;
5112
5113 dout(10) << osdmap << dendl;
5114
11fdf7f2
TL
5115 // always update osdmap manifest, regardless of being the leader.
5116 load_osdmap_manifest();
5117
1911f103
TL
5118 // always tune priority cache manager memory on leader and peons
5119 if (ceph_using_tcmalloc() && mon_memory_autotune) {
5120 std::lock_guard l(balancer_lock);
5121 if (pcm != nullptr) {
5122 pcm->tune_memory();
5123 pcm->balance();
5124 _set_new_cache_sizes();
5125 dout(10) << "tick balancer "
5126 << " inc cache_bytes: " << inc_cache->get_cache_bytes()
5127 << " inc comtd_bytes: " << inc_cache->get_committed_size()
5128 << " inc used_bytes: " << inc_cache->_get_used_bytes()
5129 << " inc num_osdmaps: " << inc_cache->_get_num_osdmaps()
5130 << dendl;
5131 dout(10) << "tick balancer "
5132 << " full cache_bytes: " << full_cache->get_cache_bytes()
5133 << " full comtd_bytes: " << full_cache->get_committed_size()
5134 << " full used_bytes: " << full_cache->_get_used_bytes()
5135 << " full num_osdmaps: " << full_cache->_get_num_osdmaps()
5136 << dendl;
5137 }
5138 }
5139
f67539c2 5140 if (!mon.is_leader()) return;
7c673cae
FG
5141
5142 bool do_propose = false;
5143 utime_t now = ceph_clock_now();
5144
11fdf7f2 5145 if (handle_osd_timeouts(now, last_osd_report)) {
181888fb
FG
5146 do_propose = true;
5147 }
7c673cae
FG
5148
5149 // mark osds down?
11fdf7f2 5150 if (check_failures(now)) {
7c673cae 5151 do_propose = true;
11fdf7f2
TL
5152 }
5153
5154 // Force a proposal if we need to prune; pruning is performed on
5155 // ``encode_pending()``, hence why we need to regularly trigger a proposal
5156 // even if there's nothing going on.
5157 if (is_prune_enabled() && should_prune()) {
5158 do_propose = true;
5159 }
7c673cae
FG
5160
5161 // mark down osds out?
5162
5163 /* can_mark_out() checks if we can mark osds as being out. The -1 has no
5164 * influence at all. The decision is made based on the ratio of "in" osds,
5165 * and the function returns false if this ratio is lower that the minimum
11fdf7f2 5166 * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
7c673cae
FG
5167 */
5168 if (can_mark_out(-1)) {
11fdf7f2
TL
5169 string down_out_subtree_limit = g_conf().get_val<string>(
5170 "mon_osd_down_out_subtree_limit");
7c673cae
FG
5171 set<int> down_cache; // quick cache of down subtrees
5172
5173 map<int,utime_t>::iterator i = down_pending_out.begin();
5174 while (i != down_pending_out.end()) {
5175 int o = i->first;
5176 utime_t down = now;
5177 down -= i->second;
5178 ++i;
5179
5180 if (osdmap.is_down(o) &&
5181 osdmap.is_in(o) &&
5182 can_mark_out(o)) {
11fdf7f2 5183 utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
7c673cae
FG
5184 utime_t grace = orig_grace;
5185 double my_grace = 0.0;
5186
11fdf7f2 5187 if (g_conf()->mon_osd_adjust_down_out_interval) {
7c673cae
FG
5188 // scale grace period the same way we do the heartbeat grace.
5189 const osd_xinfo_t& xi = osdmap.get_xinfo(o);
11fdf7f2 5190 double halflife = (double)g_conf()->mon_osd_laggy_halflife;
7c673cae
FG
5191 double decay_k = ::log(.5) / halflife;
5192 double decay = exp((double)down * decay_k);
5193 dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
5194 << " down for " << down << " decay " << decay << dendl;
5195 my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
5196 grace += my_grace;
5197 }
5198
5199 // is this an entire large subtree down?
11fdf7f2
TL
5200 if (down_out_subtree_limit.length()) {
5201 int type = osdmap.crush->get_type_id(down_out_subtree_limit);
7c673cae 5202 if (type > 0) {
11fdf7f2
TL
5203 if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
5204 dout(10) << "tick entire containing " << down_out_subtree_limit
5205 << " subtree for osd." << o
5206 << " is down; resetting timer" << dendl;
7c673cae
FG
5207 // reset timer, too.
5208 down_pending_out[o] = now;
5209 continue;
5210 }
5211 }
5212 }
5213
c07f9fc5 5214 bool down_out = !osdmap.is_destroyed(o) &&
11fdf7f2 5215 g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace;
c07f9fc5 5216 bool destroyed_out = osdmap.is_destroyed(o) &&
11fdf7f2 5217 g_conf()->mon_osd_destroyed_out_interval > 0 &&
c07f9fc5
FG
5218 // this is not precise enough as we did not make a note when this osd
5219 // was marked as destroyed, but let's not bother with that
5220 // complexity for now.
11fdf7f2 5221 down.sec() >= g_conf()->mon_osd_destroyed_out_interval;
c07f9fc5 5222 if (down_out || destroyed_out) {
7c673cae
FG
5223 dout(10) << "tick marking osd." << o << " OUT after " << down
5224 << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
5225 pending_inc.new_weight[o] = CEPH_OSD_OUT;
5226
5227 // set the AUTOOUT bit.
5228 if (pending_inc.new_state.count(o) == 0)
5229 pending_inc.new_state[o] = 0;
5230 pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;
5231
5232 // remember previous weight
5233 if (pending_inc.new_xinfo.count(o) == 0)
5234 pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
5235 pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];
5236
5237 do_propose = true;
5238
f67539c2 5239 mon.clog->info() << "Marking osd." << o << " out (has been down for "
224ce89b 5240 << int(down.sec()) << " seconds)";
7c673cae
FG
5241 } else
5242 continue;
5243 }
5244
5245 down_pending_out.erase(o);
5246 }
5247 } else {
5248 dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
5249 }
5250
f67539c2
TL
5251 // expire blocklisted items?
5252 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin();
5253 p != osdmap.blocklist.end();
7c673cae
FG
5254 ++p) {
5255 if (p->second < now) {
f67539c2
TL
5256 dout(10) << "expiring blocklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
5257 pending_inc.old_blocklist.push_back(p->first);
7c673cae
FG
5258 do_propose = true;
5259 }
5260 }
5261
11fdf7f2
TL
5262 if (try_prune_purged_snaps()) {
5263 do_propose = true;
7c673cae
FG
5264 }
5265
5266 if (update_pools_status())
5267 do_propose = true;
5268
5269 if (do_propose ||
5270 !pending_inc.new_pg_temp.empty()) // also propose if we adjusted pg_temp
5271 propose_pending();
eafe8130
TL
5272}
5273
5274void OSDMonitor::_set_new_cache_sizes()
5275{
5276 uint64_t cache_size = 0;
5277 int64_t inc_alloc = 0;
5278 int64_t full_alloc = 0;
5279 int64_t kv_alloc = 0;
5280
5281 if (pcm != nullptr && rocksdb_binned_kv_cache != nullptr) {
5282 cache_size = pcm->get_tuned_mem();
5283 inc_alloc = inc_cache->get_committed_size();
5284 full_alloc = full_cache->get_committed_size();
5285 kv_alloc = rocksdb_binned_kv_cache->get_committed_size();
5286 }
5287
5288 inc_osd_cache.set_bytes(inc_alloc);
5289 full_osd_cache.set_bytes(full_alloc);
5290
92f5a8d4 5291 dout(1) << __func__ << " cache_size:" << cache_size
eafe8130
TL
5292 << " inc_alloc: " << inc_alloc
5293 << " full_alloc: " << full_alloc
5294 << " kv_alloc: " << kv_alloc
5295 << dendl;
7c673cae
FG
5296}
5297
5298bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
f67539c2 5299 std::map<int, std::pair<utime_t, int>> &last_osd_report)
7c673cae 5300{
11fdf7f2 5301 utime_t timeo(g_conf()->mon_osd_report_timeout, 0);
f67539c2 5302 if (now - mon.get_leader_since() < timeo) {
7c673cae
FG
5303 // We haven't been the leader for long enough to consider OSD timeouts
5304 return false;
5305 }
5306
5307 int max_osd = osdmap.get_max_osd();
5308 bool new_down = false;
5309
5310 for (int i=0; i < max_osd; ++i) {
5311 dout(30) << __func__ << ": checking up on osd " << i << dendl;
c07f9fc5
FG
5312 if (!osdmap.exists(i)) {
5313 last_osd_report.erase(i); // if any
5314 continue;
5315 }
7c673cae
FG
5316 if (!osdmap.is_up(i))
5317 continue;
f67539c2 5318 const std::map<int, std::pair<utime_t, int>>::const_iterator t = last_osd_report.find(i);
7c673cae
FG
5319 if (t == last_osd_report.end()) {
5320 // it wasn't in the map; start the timer.
f67539c2
TL
5321 last_osd_report[i].first = now;
5322 last_osd_report[i].second = 0;
7c673cae 5323 } else if (can_mark_down(i)) {
f67539c2
TL
5324 utime_t diff = now - t->second.first;
5325 // we use the max(mon_osd_report_timeout, 2*osd_beacon_report_interval) as timeout
5326 // to allow for the osd to miss a beacon.
5327 int mon_osd_report_timeout = g_conf()->mon_osd_report_timeout;
5328 utime_t max_timeout(std::max(mon_osd_report_timeout, 2 * t->second.second), 0);
5329 if (diff > max_timeout) {
5330 mon.clog->info() << "osd." << i << " marked down after no beacon for "
5331 << diff << " seconds";
5332 derr << "no beacon from osd." << i << " since " << t->second.first
5333 << ", " << diff << " seconds ago. marking down" << dendl;
5334 pending_inc.new_state[i] = CEPH_OSD_UP;
5335 new_down = true;
7c673cae
FG
5336 }
5337 }
5338 }
5339 return new_down;
5340}
5341
11fdf7f2
TL
5342static void dump_cpu_list(Formatter *f, const char *name,
5343 const string& strlist)
7c673cae 5344{
11fdf7f2
TL
5345 cpu_set_t cpu_set;
5346 size_t cpu_set_size;
5347 if (parse_cpu_set_list(strlist.c_str(), &cpu_set_size, &cpu_set) < 0) {
5348 return;
5349 }
5350 set<int> cpus = cpu_set_to_set(cpu_set_size, &cpu_set);
5351 f->open_array_section(name);
5352 for (auto cpu : cpus) {
5353 f->dump_int("cpu", cpu);
7c673cae 5354 }
11fdf7f2 5355 f->close_section();
7c673cae
FG
5356}
5357
5358void OSDMonitor::dump_info(Formatter *f)
5359{
5360 f->open_object_section("osdmap");
5361 osdmap.dump(f);
5362 f->close_section();
5363
5364 f->open_array_section("osd_metadata");
5365 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5366 if (osdmap.exists(i)) {
5367 f->open_object_section("osd");
5368 f->dump_unsigned("id", i);
5369 dump_osd_metadata(i, f, NULL);
5370 f->close_section();
5371 }
5372 }
5373 f->close_section();
5374
1911f103
TL
5375 f->open_object_section("osdmap_clean_epochs");
5376 f->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());
5377
5378 f->open_object_section("last_epoch_clean");
5379 last_epoch_clean.dump(f);
5380 f->close_section();
5381
5382 f->open_array_section("osd_epochs");
5383 for (auto& osd_epoch : osd_epochs) {
5384 f->open_object_section("osd");
5385 f->dump_unsigned("id", osd_epoch.first);
5386 f->dump_unsigned("epoch", osd_epoch.second);
5387 f->close_section();
5388 }
5389 f->close_section(); // osd_epochs
5390
5391 f->close_section(); // osd_clean_epochs
5392
7c673cae
FG
5393 f->dump_unsigned("osdmap_first_committed", get_first_committed());
5394 f->dump_unsigned("osdmap_last_committed", get_last_committed());
5395
5396 f->open_object_section("crushmap");
5397 osdmap.crush->dump(f);
5398 f->close_section();
11fdf7f2
TL
5399
5400 if (has_osdmap_manifest) {
5401 f->open_object_section("osdmap_manifest");
5402 osdmap_manifest.dump(f);
5403 f->close_section();
5404 }
7c673cae
FG
5405}
5406
namespace {
  // Keys accepted by "osd pool get"; enumerator order is part of the
  // command-dispatch logic below, so do not reorder.
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
    PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
    PG_AUTOSCALE_BIAS, DEDUP_TIER, DEDUP_CHUNK_ALGORITHM,
    DEDUP_CDC_CHUNK_SIZE };

  // Return the elements of `first` that are not present in `second`
  // (ordered set difference).
  std::set<osd_pool_get_choices>
  subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			     const std::set<osd_pool_get_choices>& second)
  {
    std::set<osd_pool_get_choices> result;
    for (const auto choice : first) {
      if (second.count(choice) == 0) {
	result.insert(result.end(), choice);
      }
    }
    return result;
  }
}
5441
5442
5443bool OSDMonitor::preprocess_command(MonOpRequestRef op)
5444{
5445 op->mark_osdmon_event(__func__);
9f95a23c 5446 auto m = op->get_req<MMonCommand>();
7c673cae
FG
5447 int r = 0;
5448 bufferlist rdata;
5449 stringstream ss, ds;
5450
11fdf7f2 5451 cmdmap_t cmdmap;
7c673cae
FG
5452 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
5453 string rs = ss.str();
f67539c2 5454 mon.reply_command(op, -EINVAL, rs, get_last_committed());
7c673cae
FG
5455 return true;
5456 }
5457
11fdf7f2 5458 MonSession *session = op->get_session();
7c673cae 5459 if (!session) {
11fdf7f2 5460 derr << __func__ << " no session" << dendl;
f67539c2 5461 mon.reply_command(op, -EACCES, "access denied", get_last_committed());
7c673cae
FG
5462 return true;
5463 }
5464
5465 string prefix;
9f95a23c 5466 cmd_getval(cmdmap, "prefix", prefix);
7c673cae
FG
5467
5468 string format;
9f95a23c 5469 cmd_getval(cmdmap, "format", format, string("plain"));
7c673cae
FG
5470 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5471
5472 if (prefix == "osd stat") {
92f5a8d4
TL
5473 if (f) {
5474 f->open_object_section("osdmap");
5475 osdmap.print_summary(f.get(), ds, "", true);
5476 f->close_section();
7c673cae 5477 f->flush(rdata);
92f5a8d4
TL
5478 } else {
5479 osdmap.print_summary(nullptr, ds, "", true);
7c673cae 5480 rdata.append(ds);
92f5a8d4 5481 }
7c673cae 5482 }
7c673cae
FG
5483 else if (prefix == "osd dump" ||
5484 prefix == "osd tree" ||
11fdf7f2 5485 prefix == "osd tree-from" ||
7c673cae
FG
5486 prefix == "osd ls" ||
5487 prefix == "osd getmap" ||
31f18b77 5488 prefix == "osd getcrushmap" ||
9f95a23c
TL
5489 prefix == "osd ls-tree" ||
5490 prefix == "osd info") {
7c673cae
FG
5491
5492 epoch_t epoch = 0;
5493 int64_t epochnum;
9f95a23c 5494 cmd_getval(cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
7c673cae
FG
5495 epoch = epochnum;
5496
5497 bufferlist osdmap_bl;
5498 int err = get_version_full(epoch, osdmap_bl);
5499 if (err == -ENOENT) {
5500 r = -ENOENT;
5501 ss << "there is no map for epoch " << epoch;
5502 goto reply;
5503 }
11fdf7f2
TL
5504 ceph_assert(err == 0);
5505 ceph_assert(osdmap_bl.length());
7c673cae
FG
5506
5507 OSDMap *p;
5508 if (epoch == osdmap.get_epoch()) {
5509 p = &osdmap;
5510 } else {
5511 p = new OSDMap;
5512 p->decode(osdmap_bl);
5513 }
5514
224ce89b
WB
5515 auto sg = make_scope_guard([&] {
5516 if (p != &osdmap) {
5517 delete p;
5518 }
5519 });
5520
7c673cae
FG
5521 if (prefix == "osd dump") {
5522 stringstream ds;
5523 if (f) {
5524 f->open_object_section("osdmap");
5525 p->dump(f.get());
5526 f->close_section();
5527 f->flush(ds);
5528 } else {
5529 p->print(ds);
5530 }
5531 rdata.append(ds);
5532 if (!f)
5533 ds << " ";
5534 } else if (prefix == "osd ls") {
5535 if (f) {
5536 f->open_array_section("osds");
5537 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5538 if (osdmap.exists(i)) {
5539 f->dump_int("osd", i);
5540 }
5541 }
5542 f->close_section();
5543 f->flush(ds);
5544 } else {
5545 bool first = true;
5546 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5547 if (osdmap.exists(i)) {
5548 if (!first)
5549 ds << "\n";
5550 first = false;
5551 ds << i;
5552 }
5553 }
5554 }
5555 rdata.append(ds);
9f95a23c
TL
5556 } else if (prefix == "osd info") {
5557 int64_t osd_id;
5558 bool do_single_osd = true;
5559 if (!cmd_getval(cmdmap, "id", osd_id)) {
5560 do_single_osd = false;
5561 }
5562
5563 if (do_single_osd && !osdmap.exists(osd_id)) {
5564 ss << "osd." << osd_id << " does not exist";
5565 r = -EINVAL;
5566 goto reply;
5567 }
5568
5569 if (f) {
5570 if (do_single_osd) {
5571 osdmap.dump_osd(osd_id, f.get());
5572 } else {
5573 osdmap.dump_osds(f.get());
5574 }
5575 f->flush(ds);
5576 } else {
5577 if (do_single_osd) {
5578 osdmap.print_osd(osd_id, ds);
5579 } else {
5580 osdmap.print_osds(ds);
5581 }
5582 }
5583 rdata.append(ds);
11fdf7f2
TL
5584 } else if (prefix == "osd tree" || prefix == "osd tree-from") {
5585 string bucket;
5586 if (prefix == "osd tree-from") {
9f95a23c 5587 cmd_getval(cmdmap, "bucket", bucket);
11fdf7f2
TL
5588 if (!osdmap.crush->name_exists(bucket)) {
5589 ss << "bucket '" << bucket << "' does not exist";
5590 r = -ENOENT;
5591 goto reply;
5592 }
5593 int id = osdmap.crush->get_item_id(bucket);
5594 if (id >= 0) {
5595 ss << "\"" << bucket << "\" is not a bucket";
5596 r = -EINVAL;
5597 goto reply;
5598 }
5599 }
5600
31f18b77 5601 vector<string> states;
9f95a23c 5602 cmd_getval(cmdmap, "states", states);
31f18b77
FG
5603 unsigned filter = 0;
5604 for (auto& s : states) {
5605 if (s == "up") {
5606 filter |= OSDMap::DUMP_UP;
5607 } else if (s == "down") {
5608 filter |= OSDMap::DUMP_DOWN;
5609 } else if (s == "in") {
5610 filter |= OSDMap::DUMP_IN;
5611 } else if (s == "out") {
5612 filter |= OSDMap::DUMP_OUT;
c07f9fc5
FG
5613 } else if (s == "destroyed") {
5614 filter |= OSDMap::DUMP_DESTROYED;
31f18b77
FG
5615 } else {
5616 ss << "unrecognized state '" << s << "'";
5617 r = -EINVAL;
5618 goto reply;
5619 }
5620 }
5621 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
c07f9fc5
FG
5622 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
5623 ss << "cannot specify both 'in' and 'out'";
5624 r = -EINVAL;
5625 goto reply;
5626 }
5627 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
5628 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
5629 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
5630 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
5631 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
5632 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
5633 ss << "can specify only one of 'up', 'down' and 'destroyed'";
31f18b77
FG
5634 r = -EINVAL;
5635 goto reply;
5636 }
7c673cae
FG
5637 if (f) {
5638 f->open_object_section("tree");
11fdf7f2 5639 p->print_tree(f.get(), NULL, filter, bucket);
7c673cae
FG
5640 f->close_section();
5641 f->flush(ds);
5642 } else {
11fdf7f2 5643 p->print_tree(NULL, &ds, filter, bucket);
7c673cae
FG
5644 }
5645 rdata.append(ds);
5646 } else if (prefix == "osd getmap") {
5647 rdata.append(osdmap_bl);
5648 ss << "got osdmap epoch " << p->get_epoch();
5649 } else if (prefix == "osd getcrushmap") {
f67539c2 5650 p->crush->encode(rdata, mon.get_quorum_con_features());
31f18b77
FG
5651 ss << p->get_crush_version();
5652 } else if (prefix == "osd ls-tree") {
5653 string bucket_name;
9f95a23c 5654 cmd_getval(cmdmap, "name", bucket_name);
31f18b77
FG
5655 set<int> osds;
5656 r = p->get_osds_by_bucket_name(bucket_name, &osds);
5657 if (r == -ENOENT) {
5658 ss << "\"" << bucket_name << "\" does not exist";
5659 goto reply;
5660 } else if (r < 0) {
5661 ss << "can not parse bucket name:\"" << bucket_name << "\"";
5662 goto reply;
5663 }
5664
5665 if (f) {
5666 f->open_array_section("osds");
5667 for (auto &i : osds) {
5668 if (osdmap.exists(i)) {
5669 f->dump_int("osd", i);
5670 }
5671 }
5672 f->close_section();
5673 f->flush(ds);
5674 } else {
5675 bool first = true;
5676 for (auto &i : osds) {
5677 if (osdmap.exists(i)) {
5678 if (!first)
5679 ds << "\n";
5680 first = false;
5681 ds << i;
5682 }
5683 }
5684 }
5685
5686 rdata.append(ds);
7c673cae 5687 }
7c673cae
FG
5688 } else if (prefix == "osd getmaxosd") {
5689 if (f) {
5690 f->open_object_section("getmaxosd");
5691 f->dump_unsigned("epoch", osdmap.get_epoch());
5692 f->dump_int("max_osd", osdmap.get_max_osd());
5693 f->close_section();
5694 f->flush(rdata);
5695 } else {
5696 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
5697 rdata.append(ds);
5698 }
5699 } else if (prefix == "osd utilization") {
5700 string out;
5701 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
5702 if (f)
5703 f->flush(rdata);
5704 else
5705 rdata.append(out);
5706 r = 0;
5707 goto reply;
5708 } else if (prefix == "osd find") {
5709 int64_t osd;
9f95a23c 5710 if (!cmd_getval(cmdmap, "id", osd)) {
7c673cae
FG
5711 ss << "unable to parse osd id value '"
5712 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5713 r = -EINVAL;
5714 goto reply;
5715 }
5716 if (!osdmap.exists(osd)) {
5717 ss << "osd." << osd << " does not exist";
5718 r = -ENOENT;
5719 goto reply;
5720 }
5721 string format;
9f95a23c 5722 cmd_getval(cmdmap, "format", format);
7c673cae
FG
5723 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5724 f->open_object_section("osd_location");
5725 f->dump_int("osd", osd);
11fdf7f2 5726 f->dump_object("addrs", osdmap.get_addrs(osd));
f64942e4 5727 f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
11fdf7f2
TL
5728
5729 // try to identify host, pod/container name, etc.
5730 map<string,string> m;
5731 load_metadata(osd, m, nullptr);
5732 if (auto p = m.find("hostname"); p != m.end()) {
5733 f->dump_string("host", p->second);
5734 }
5735 for (auto& k : {
5736 "pod_name", "pod_namespace", // set by rook
9f95a23c 5737 "container_name" // set by cephadm, ceph-ansible
11fdf7f2
TL
5738 }) {
5739 if (auto p = m.find(k); p != m.end()) {
5740 f->dump_string(k, p->second);
5741 }
5742 }
5743
5744 // crush is helpful too
7c673cae
FG
5745 f->open_object_section("crush_location");
5746 map<string,string> loc = osdmap.crush->get_full_location(osd);
5747 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
5748 f->dump_string(p->first.c_str(), p->second);
5749 f->close_section();
5750 f->close_section();
5751 f->flush(rdata);
5752 } else if (prefix == "osd metadata") {
5753 int64_t osd = -1;
5754 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
9f95a23c 5755 !cmd_getval(cmdmap, "id", osd)) {
7c673cae
FG
5756 ss << "unable to parse osd id value '"
5757 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5758 r = -EINVAL;
5759 goto reply;
5760 }
5761 if (osd >= 0 && !osdmap.exists(osd)) {
5762 ss << "osd." << osd << " does not exist";
5763 r = -ENOENT;
5764 goto reply;
5765 }
5766 string format;
9f95a23c 5767 cmd_getval(cmdmap, "format", format);
7c673cae
FG
5768 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5769 if (osd >= 0) {
5770 f->open_object_section("osd_metadata");
5771 f->dump_unsigned("id", osd);
5772 r = dump_osd_metadata(osd, f.get(), &ss);
5773 if (r < 0)
5774 goto reply;
5775 f->close_section();
5776 } else {
5777 r = 0;
5778 f->open_array_section("osd_metadata");
5779 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5780 if (osdmap.exists(i)) {
5781 f->open_object_section("osd");
5782 f->dump_unsigned("id", i);
5783 r = dump_osd_metadata(i, f.get(), NULL);
5784 if (r == -EINVAL || r == -ENOENT) {
5785 // Drop error, continue to get other daemons' metadata
5786 dout(4) << "No metadata for osd." << i << dendl;
5787 r = 0;
5788 } else if (r < 0) {
5789 // Unexpected error
5790 goto reply;
5791 }
5792 f->close_section();
5793 }
5794 }
5795 f->close_section();
5796 }
5797 f->flush(rdata);
31f18b77
FG
5798 } else if (prefix == "osd versions") {
5799 if (!f)
5800 f.reset(Formatter::create("json-pretty"));
5801 count_metadata("ceph_version", f.get());
5802 f->flush(rdata);
5803 r = 0;
5804 } else if (prefix == "osd count-metadata") {
5805 if (!f)
5806 f.reset(Formatter::create("json-pretty"));
5807 string field;
9f95a23c 5808 cmd_getval(cmdmap, "property", field);
31f18b77
FG
5809 count_metadata(field, f.get());
5810 f->flush(rdata);
5811 r = 0;
11fdf7f2
TL
5812 } else if (prefix == "osd numa-status") {
5813 TextTable tbl;
5814 if (f) {
5815 f->open_array_section("osds");
5816 } else {
5817 tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT);
5818 tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT);
5819 tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT);
5820 tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT);
5821 tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT);
5822 tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT);
5823 }
5824 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5825 if (osdmap.exists(i)) {
5826 map<string,string> m;
5827 ostringstream err;
5828 if (load_metadata(i, m, &err) < 0) {
5829 continue;
5830 }
5831 string host;
5832 auto p = m.find("hostname");
5833 if (p != m.end()) {
5834 host = p->second;
5835 }
5836 if (f) {
5837 f->open_object_section("osd");
5838 f->dump_int("osd", i);
5839 f->dump_string("host", host);
5840 for (auto n : { "network_numa_node", "objectstore_numa_node",
5841 "numa_node" }) {
5842 p = m.find(n);
5843 if (p != m.end()) {
5844 f->dump_int(n, atoi(p->second.c_str()));
5845 }
5846 }
5847 for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) {
5848 p = m.find(n);
5849 if (p != m.end()) {
5850 list<string> ls = get_str_list(p->second, ",");
5851 f->open_array_section(n);
5852 for (auto node : ls) {
5853 f->dump_int("node", atoi(node.c_str()));
5854 }
5855 f->close_section();
5856 }
5857 }
5858 for (auto n : { "numa_node_cpus" }) {
5859 p = m.find(n);
5860 if (p != m.end()) {
5861 dump_cpu_list(f.get(), n, p->second);
5862 }
5863 }
5864 f->close_section();
5865 } else {
5866 tbl << i;
5867 tbl << host;
5868 p = m.find("network_numa_nodes");
5869 if (p != m.end()) {
5870 tbl << p->second;
5871 } else {
5872 tbl << "-";
5873 }
5874 p = m.find("objectstore_numa_nodes");
5875 if (p != m.end()) {
5876 tbl << p->second;
5877 } else {
5878 tbl << "-";
5879 }
5880 p = m.find("numa_node");
5881 auto q = m.find("numa_node_cpus");
5882 if (p != m.end() && q != m.end()) {
5883 tbl << p->second;
5884 tbl << q->second;
5885 } else {
5886 tbl << "-";
5887 tbl << "-";
5888 }
5889 tbl << TextTable::endrow;
5890 }
5891 }
5892 }
5893 if (f) {
5894 f->close_section();
5895 f->flush(rdata);
5896 } else {
5897 rdata.append(stringify(tbl));
5898 }
7c673cae
FG
5899 } else if (prefix == "osd map") {
5900 string poolstr, objstr, namespacestr;
9f95a23c
TL
5901 cmd_getval(cmdmap, "pool", poolstr);
5902 cmd_getval(cmdmap, "object", objstr);
5903 cmd_getval(cmdmap, "nspace", namespacestr);
7c673cae
FG
5904
5905 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5906 if (pool < 0) {
5907 ss << "pool " << poolstr << " does not exist";
5908 r = -ENOENT;
5909 goto reply;
5910 }
5911 object_locator_t oloc(pool, namespacestr);
5912 object_t oid(objstr);
5913 pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
5914 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5915 vector<int> up, acting;
5916 int up_p, acting_p;
5917 osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
5918
5919 string fullobjname;
5920 if (!namespacestr.empty())
5921 fullobjname = namespacestr + string("/") + oid.name;
5922 else
5923 fullobjname = oid.name;
5924 if (f) {
5925 f->open_object_section("osd_map");
5926 f->dump_unsigned("epoch", osdmap.get_epoch());
5927 f->dump_string("pool", poolstr);
5928 f->dump_int("pool_id", pool);
5929 f->dump_stream("objname") << fullobjname;
5930 f->dump_stream("raw_pgid") << pgid;
5931 f->dump_stream("pgid") << mpgid;
5932 f->open_array_section("up");
5933 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
5934 f->dump_int("osd", *p);
5935 f->close_section();
5936 f->dump_int("up_primary", up_p);
5937 f->open_array_section("acting");
5938 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
5939 f->dump_int("osd", *p);
5940 f->close_section();
5941 f->dump_int("acting_primary", acting_p);
5942 f->close_section(); // osd_map
5943 f->flush(rdata);
5944 } else {
5945 ds << "osdmap e" << osdmap.get_epoch()
5946 << " pool '" << poolstr << "' (" << pool << ")"
5947 << " object '" << fullobjname << "' ->"
5948 << " pg " << pgid << " (" << mpgid << ")"
5949 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
5950 << pg_vector_string(acting) << ", p" << acting_p << ")";
5951 rdata.append(ds);
5952 }
5953
5954 } else if (prefix == "pg map") {
5955 pg_t pgid;
5956 string pgidstr;
9f95a23c 5957 cmd_getval(cmdmap, "pgid", pgidstr);
7c673cae
FG
5958 if (!pgid.parse(pgidstr.c_str())) {
5959 ss << "invalid pgid '" << pgidstr << "'";
5960 r = -EINVAL;
5961 goto reply;
5962 }
5963 vector<int> up, acting;
5964 if (!osdmap.have_pg_pool(pgid.pool())) {
5965 ss << "pg '" << pgidstr << "' does not exist";
5966 r = -ENOENT;
5967 goto reply;
5968 }
5969 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5970 osdmap.pg_to_up_acting_osds(pgid, up, acting);
5971 if (f) {
5972 f->open_object_section("pg_map");
5973 f->dump_unsigned("epoch", osdmap.get_epoch());
5974 f->dump_stream("raw_pgid") << pgid;
5975 f->dump_stream("pgid") << mpgid;
5976 f->open_array_section("up");
5977 for (auto osd : up) {
5978 f->dump_int("up_osd", osd);
5979 }
5980 f->close_section();
5981 f->open_array_section("acting");
5982 for (auto osd : acting) {
5983 f->dump_int("acting_osd", osd);
5984 }
5985 f->close_section();
5986 f->close_section();
5987 f->flush(rdata);
5988 } else {
5989 ds << "osdmap e" << osdmap.get_epoch()
5990 << " pg " << pgid << " (" << mpgid << ")"
5991 << " -> up " << up << " acting " << acting;
5992 rdata.append(ds);
5993 }
5994 goto reply;
5995
7c673cae 5996 } else if (prefix == "osd lspools") {
7c673cae
FG
5997 if (f)
5998 f->open_array_section("pools");
5999 for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
6000 p != osdmap.pools.end();
6001 ++p) {
11fdf7f2
TL
6002 if (f) {
6003 f->open_object_section("pool");
6004 f->dump_int("poolnum", p->first);
6005 f->dump_string("poolname", osdmap.pool_name[p->first]);
6006 f->close_section();
6007 } else {
6008 ds << p->first << ' ' << osdmap.pool_name[p->first];
6009 if (next(p) != osdmap.pools.end()) {
6010 ds << '\n';
7c673cae
FG
6011 }
6012 }
6013 }
6014 if (f) {
6015 f->close_section();
6016 f->flush(ds);
6017 }
6018 rdata.append(ds);
f67539c2
TL
6019 } else if (prefix == "osd blocklist ls" ||
6020 prefix == "osd blacklist ls") {
7c673cae 6021 if (f)
f67539c2 6022 f->open_array_section("blocklist");
7c673cae 6023
f67539c2
TL
6024 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin();
6025 p != osdmap.blocklist.end();
7c673cae
FG
6026 ++p) {
6027 if (f) {
6028 f->open_object_section("entry");
11fdf7f2 6029 f->dump_string("addr", p->first.get_legacy_str());
7c673cae
FG
6030 f->dump_stream("until") << p->second;
6031 f->close_section();
6032 } else {
6033 stringstream ss;
6034 string s;
6035 ss << p->first << " " << p->second;
6036 getline(ss, s);
6037 s += "\n";
6038 rdata.append(s);
6039 }
6040 }
6041 if (f) {
6042 f->close_section();
6043 f->flush(rdata);
6044 }
f67539c2 6045 ss << "listed " << osdmap.blocklist.size() << " entries";
7c673cae
FG
6046
6047 } else if (prefix == "osd pool ls") {
6048 string detail;
9f95a23c 6049 cmd_getval(cmdmap, "detail", detail);
7c673cae
FG
6050 if (!f && detail == "detail") {
6051 ostringstream ss;
6052 osdmap.print_pools(ss);
6053 rdata.append(ss.str());
6054 } else {
6055 if (f)
6056 f->open_array_section("pools");
6057 for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
6058 it != osdmap.get_pools().end();
6059 ++it) {
6060 if (f) {
6061 if (detail == "detail") {
6062 f->open_object_section("pool");
eafe8130 6063 f->dump_int("pool_id", it->first);
7c673cae
FG
6064 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
6065 it->second.dump(f.get());
6066 f->close_section();
6067 } else {
6068 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
6069 }
6070 } else {
6071 rdata.append(osdmap.get_pool_name(it->first) + "\n");
6072 }
6073 }
6074 if (f) {
6075 f->close_section();
6076 f->flush(rdata);
6077 }
6078 }
6079
6080 } else if (prefix == "osd crush get-tunable") {
6081 string tunable;
9f95a23c 6082 cmd_getval(cmdmap, "tunable", tunable);
7c673cae
FG
6083 ostringstream rss;
6084 if (f)
6085 f->open_object_section("tunable");
6086 if (tunable == "straw_calc_version") {
6087 if (f)
6088 f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
6089 else
6090 rss << osdmap.crush->get_straw_calc_version() << "\n";
6091 } else {
6092 r = -EINVAL;
6093 goto reply;
6094 }
6095 if (f) {
6096 f->close_section();
6097 f->flush(rdata);
6098 } else {
6099 rdata.append(rss.str());
6100 }
6101 r = 0;
6102
6103 } else if (prefix == "osd pool get") {
6104 string poolstr;
9f95a23c 6105 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
6106 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
6107 if (pool < 0) {
6108 ss << "unrecognized pool '" << poolstr << "'";
6109 r = -ENOENT;
6110 goto reply;
6111 }
6112
6113 const pg_pool_t *p = osdmap.get_pg_pool(pool);
6114 string var;
9f95a23c 6115 cmd_getval(cmdmap, "var", var);
7c673cae
FG
6116
6117 typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
6118 const choices_map_t ALL_CHOICES = {
6119 {"size", SIZE},
6120 {"min_size", MIN_SIZE},
7c673cae 6121 {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
28e407b8
AA
6122 {"crush_rule", CRUSH_RULE}, {"hashpspool", HASHPSPOOL},
6123 {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
7c673cae
FG
6124 {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
6125 {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
6126 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
6127 {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
6128 {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
6129 {"use_gmt_hitset", USE_GMT_HITSET},
11fdf7f2 6130 {"target_max_objects", TARGET_MAX_OBJECTS},
7c673cae
FG
6131 {"target_max_bytes", TARGET_MAX_BYTES},
6132 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
6133 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
6134 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
6135 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
6136 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
6137 {"erasure_code_profile", ERASURE_CODE_PROFILE},
6138 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
6139 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
6140 {"fast_read", FAST_READ},
6141 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
6142 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
6143 {"scrub_min_interval", SCRUB_MIN_INTERVAL},
6144 {"scrub_max_interval", SCRUB_MAX_INTERVAL},
6145 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
6146 {"recovery_priority", RECOVERY_PRIORITY},
6147 {"recovery_op_priority", RECOVERY_OP_PRIORITY},
6148 {"scrub_priority", SCRUB_PRIORITY},
6149 {"compression_mode", COMPRESSION_MODE},
6150 {"compression_algorithm", COMPRESSION_ALGORITHM},
6151 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
6152 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
6153 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
6154 {"csum_type", CSUM_TYPE},
6155 {"csum_max_block", CSUM_MAX_BLOCK},
6156 {"csum_min_block", CSUM_MIN_BLOCK},
11fdf7f2
TL
6157 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM},
6158 {"pg_autoscale_mode", PG_AUTOSCALE_MODE},
6159 {"pg_num_min", PG_NUM_MIN},
6160 {"target_size_bytes", TARGET_SIZE_BYTES},
6161 {"target_size_ratio", TARGET_SIZE_RATIO},
6162 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS},
f67539c2
TL
6163 {"dedup_tier", DEDUP_TIER},
6164 {"dedup_chunk_algorithm", DEDUP_CHUNK_ALGORITHM},
6165 {"dedup_cdc_chunk_size", DEDUP_CDC_CHUNK_SIZE},
7c673cae
FG
6166 };
6167
6168 typedef std::set<osd_pool_get_choices> choices_set_t;
6169
6170 const choices_set_t ONLY_TIER_CHOICES = {
6171 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
6172 TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
6173 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
6174 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
6175 MIN_READ_RECENCY_FOR_PROMOTE,
c07f9fc5 6176 MIN_WRITE_RECENCY_FOR_PROMOTE,
7c673cae
FG
6177 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
6178 };
6179 const choices_set_t ONLY_ERASURE_CHOICES = {
28e407b8 6180 EC_OVERWRITES, ERASURE_CODE_PROFILE
7c673cae
FG
6181 };
6182
6183 choices_set_t selected_choices;
6184 if (var == "all") {
6185 for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
6186 it != ALL_CHOICES.end(); ++it) {
6187 selected_choices.insert(it->second);
6188 }
6189
6190 if(!p->is_tier()) {
6191 selected_choices = subtract_second_from_first(selected_choices,
6192 ONLY_TIER_CHOICES);
6193 }
6194
6195 if(!p->is_erasure()) {
6196 selected_choices = subtract_second_from_first(selected_choices,
6197 ONLY_ERASURE_CHOICES);
6198 }
6199 } else /* var != "all" */ {
6200 choices_map_t::const_iterator found = ALL_CHOICES.find(var);
522d829b
TL
6201 if (found == ALL_CHOICES.end()) {
6202 ss << "pool '" << poolstr
6203 << "': invalid variable: '" << var << "'";
6204 r = -EINVAL;
6205 goto reply;
6206 }
6207
7c673cae
FG
6208 osd_pool_get_choices selected = found->second;
6209
6210 if (!p->is_tier() &&
6211 ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
6212 ss << "pool '" << poolstr
6213 << "' is not a tier pool: variable not applicable";
6214 r = -EACCES;
6215 goto reply;
6216 }
6217
6218 if (!p->is_erasure() &&
6219 ONLY_ERASURE_CHOICES.find(selected)
6220 != ONLY_ERASURE_CHOICES.end()) {
6221 ss << "pool '" << poolstr
6222 << "' is not a erasure pool: variable not applicable";
6223 r = -EACCES;
6224 goto reply;
6225 }
6226
94b18763
FG
6227 if (pool_opts_t::is_opt_name(var) &&
6228 !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
6229 ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
6230 r = -ENOENT;
6231 goto reply;
6232 }
6233
7c673cae
FG
6234 selected_choices.insert(selected);
6235 }
6236
6237 if (f) {
94b18763
FG
6238 f->open_object_section("pool");
6239 f->dump_string("pool", poolstr);
6240 f->dump_int("pool_id", pool);
7c673cae
FG
6241 for(choices_set_t::const_iterator it = selected_choices.begin();
6242 it != selected_choices.end(); ++it) {
6243 choices_map_t::const_iterator i;
c07f9fc5
FG
6244 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6245 if (i->second == *it) {
6246 break;
6247 }
6248 }
11fdf7f2 6249 ceph_assert(i != ALL_CHOICES.end());
7c673cae
FG
6250 switch(*it) {
6251 case PG_NUM:
6252 f->dump_int("pg_num", p->get_pg_num());
6253 break;
6254 case PGP_NUM:
6255 f->dump_int("pgp_num", p->get_pgp_num());
6256 break;
7c673cae
FG
6257 case SIZE:
6258 f->dump_int("size", p->get_size());
6259 break;
6260 case MIN_SIZE:
6261 f->dump_int("min_size", p->get_min_size());
6262 break;
7c673cae 6263 case CRUSH_RULE:
31f18b77 6264 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
7c673cae 6265 f->dump_string("crush_rule", osdmap.crush->get_rule_name(
31f18b77 6266 p->get_crush_rule()));
7c673cae 6267 } else {
31f18b77 6268 f->dump_string("crush_rule", stringify(p->get_crush_rule()));
7c673cae
FG
6269 }
6270 break;
28e407b8
AA
6271 case EC_OVERWRITES:
6272 f->dump_bool("allow_ec_overwrites",
6273 p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
6274 break;
11fdf7f2
TL
6275 case PG_AUTOSCALE_MODE:
6276 f->dump_string("pg_autoscale_mode",
6277 pg_pool_t::get_pg_autoscale_mode_name(
6278 p->pg_autoscale_mode));
6279 break;
7c673cae
FG
6280 case HASHPSPOOL:
6281 case NODELETE:
6282 case NOPGCHANGE:
6283 case NOSIZECHANGE:
6284 case WRITE_FADVISE_DONTNEED:
6285 case NOSCRUB:
6286 case NODEEP_SCRUB:
94b18763
FG
6287 f->dump_bool(i->first.c_str(),
6288 p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
7c673cae
FG
6289 break;
6290 case HIT_SET_PERIOD:
6291 f->dump_int("hit_set_period", p->hit_set_period);
6292 break;
6293 case HIT_SET_COUNT:
6294 f->dump_int("hit_set_count", p->hit_set_count);
6295 break;
6296 case HIT_SET_TYPE:
6297 f->dump_string("hit_set_type",
6298 HitSet::get_type_name(p->hit_set_params.get_type()));
6299 break;
6300 case HIT_SET_FPP:
6301 {
6302 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6303 BloomHitSet::Params *bloomp =
6304 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6305 f->dump_float("hit_set_fpp", bloomp->get_fpp());
6306 } else if(var != "all") {
6307 f->close_section();
6308 ss << "hit set is not of type Bloom; " <<
6309 "invalid to get a false positive rate!";
6310 r = -EINVAL;
6311 goto reply;
6312 }
6313 }
6314 break;
6315 case USE_GMT_HITSET:
6316 f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
6317 break;
6318 case TARGET_MAX_OBJECTS:
6319 f->dump_unsigned("target_max_objects", p->target_max_objects);
6320 break;
6321 case TARGET_MAX_BYTES:
6322 f->dump_unsigned("target_max_bytes", p->target_max_bytes);
6323 break;
6324 case CACHE_TARGET_DIRTY_RATIO:
6325 f->dump_unsigned("cache_target_dirty_ratio_micro",
6326 p->cache_target_dirty_ratio_micro);
6327 f->dump_float("cache_target_dirty_ratio",
6328 ((float)p->cache_target_dirty_ratio_micro/1000000));
6329 break;
6330 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6331 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
6332 p->cache_target_dirty_high_ratio_micro);
6333 f->dump_float("cache_target_dirty_high_ratio",
6334 ((float)p->cache_target_dirty_high_ratio_micro/1000000));
6335 break;
6336 case CACHE_TARGET_FULL_RATIO:
6337 f->dump_unsigned("cache_target_full_ratio_micro",
6338 p->cache_target_full_ratio_micro);
6339 f->dump_float("cache_target_full_ratio",
6340 ((float)p->cache_target_full_ratio_micro/1000000));
6341 break;
6342 case CACHE_MIN_FLUSH_AGE:
6343 f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
6344 break;
6345 case CACHE_MIN_EVICT_AGE:
6346 f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
6347 break;
6348 case ERASURE_CODE_PROFILE:
6349 f->dump_string("erasure_code_profile", p->erasure_code_profile);
6350 break;
6351 case MIN_READ_RECENCY_FOR_PROMOTE:
6352 f->dump_int("min_read_recency_for_promote",
6353 p->min_read_recency_for_promote);
6354 break;
6355 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6356 f->dump_int("min_write_recency_for_promote",
6357 p->min_write_recency_for_promote);
6358 break;
6359 case FAST_READ:
6360 f->dump_int("fast_read", p->fast_read);
6361 break;
6362 case HIT_SET_GRADE_DECAY_RATE:
6363 f->dump_int("hit_set_grade_decay_rate",
6364 p->hit_set_grade_decay_rate);
6365 break;
6366 case HIT_SET_SEARCH_LAST_N:
6367 f->dump_int("hit_set_search_last_n",
6368 p->hit_set_search_last_n);
6369 break;
6370 case SCRUB_MIN_INTERVAL:
6371 case SCRUB_MAX_INTERVAL:
6372 case DEEP_SCRUB_INTERVAL:
6373 case RECOVERY_PRIORITY:
6374 case RECOVERY_OP_PRIORITY:
6375 case SCRUB_PRIORITY:
6376 case COMPRESSION_MODE:
6377 case COMPRESSION_ALGORITHM:
6378 case COMPRESSION_REQUIRED_RATIO:
6379 case COMPRESSION_MAX_BLOB_SIZE:
6380 case COMPRESSION_MIN_BLOB_SIZE:
6381 case CSUM_TYPE:
6382 case CSUM_MAX_BLOCK:
6383 case CSUM_MIN_BLOCK:
11fdf7f2
TL
6384 case FINGERPRINT_ALGORITHM:
6385 case PG_NUM_MIN:
6386 case TARGET_SIZE_BYTES:
6387 case TARGET_SIZE_RATIO:
6388 case PG_AUTOSCALE_BIAS:
f67539c2
TL
6389 case DEDUP_TIER:
6390 case DEDUP_CHUNK_ALGORITHM:
6391 case DEDUP_CDC_CHUNK_SIZE:
c07f9fc5
FG
6392 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6393 if (p->opts.is_set(key)) {
c07f9fc5 6394 if(*it == CSUM_TYPE) {
11fdf7f2 6395 int64_t val;
c07f9fc5
FG
6396 p->opts.get(pool_opts_t::CSUM_TYPE, &val);
6397 f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
6398 } else {
6399 p->opts.dump(i->first, f.get());
6400 }
94b18763 6401 }
7c673cae
FG
6402 break;
6403 }
7c673cae 6404 }
94b18763
FG
6405 f->close_section();
6406 f->flush(rdata);
7c673cae
FG
6407 } else /* !f */ {
6408 for(choices_set_t::const_iterator it = selected_choices.begin();
6409 it != selected_choices.end(); ++it) {
6410 choices_map_t::const_iterator i;
6411 switch(*it) {
6412 case PG_NUM:
6413 ss << "pg_num: " << p->get_pg_num() << "\n";
6414 break;
6415 case PGP_NUM:
6416 ss << "pgp_num: " << p->get_pgp_num() << "\n";
6417 break;
7c673cae
FG
6418 case SIZE:
6419 ss << "size: " << p->get_size() << "\n";
6420 break;
6421 case MIN_SIZE:
6422 ss << "min_size: " << p->get_min_size() << "\n";
6423 break;
7c673cae 6424 case CRUSH_RULE:
31f18b77 6425 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
7c673cae 6426 ss << "crush_rule: " << osdmap.crush->get_rule_name(
31f18b77 6427 p->get_crush_rule()) << "\n";
7c673cae 6428 } else {
31f18b77 6429 ss << "crush_rule: " << p->get_crush_rule() << "\n";
7c673cae
FG
6430 }
6431 break;
11fdf7f2
TL
6432 case PG_AUTOSCALE_MODE:
6433 ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
6434 p->pg_autoscale_mode) <<"\n";
6435 break;
7c673cae
FG
6436 case HIT_SET_PERIOD:
6437 ss << "hit_set_period: " << p->hit_set_period << "\n";
6438 break;
6439 case HIT_SET_COUNT:
6440 ss << "hit_set_count: " << p->hit_set_count << "\n";
6441 break;
6442 case HIT_SET_TYPE:
6443 ss << "hit_set_type: " <<
6444 HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
6445 break;
6446 case HIT_SET_FPP:
6447 {
6448 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6449 BloomHitSet::Params *bloomp =
6450 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6451 ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
6452 } else if(var != "all") {
6453 ss << "hit set is not of type Bloom; " <<
6454 "invalid to get a false positive rate!";
6455 r = -EINVAL;
6456 goto reply;
6457 }
6458 }
6459 break;
6460 case USE_GMT_HITSET:
6461 ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
6462 break;
6463 case TARGET_MAX_OBJECTS:
6464 ss << "target_max_objects: " << p->target_max_objects << "\n";
6465 break;
6466 case TARGET_MAX_BYTES:
6467 ss << "target_max_bytes: " << p->target_max_bytes << "\n";
6468 break;
6469 case CACHE_TARGET_DIRTY_RATIO:
6470 ss << "cache_target_dirty_ratio: "
6471 << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
6472 break;
6473 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6474 ss << "cache_target_dirty_high_ratio: "
6475 << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
6476 break;
6477 case CACHE_TARGET_FULL_RATIO:
6478 ss << "cache_target_full_ratio: "
6479 << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
6480 break;
6481 case CACHE_MIN_FLUSH_AGE:
6482 ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
6483 break;
6484 case CACHE_MIN_EVICT_AGE:
6485 ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
6486 break;
6487 case ERASURE_CODE_PROFILE:
6488 ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
6489 break;
6490 case MIN_READ_RECENCY_FOR_PROMOTE:
6491 ss << "min_read_recency_for_promote: " <<
6492 p->min_read_recency_for_promote << "\n";
6493 break;
6494 case HIT_SET_GRADE_DECAY_RATE:
6495 ss << "hit_set_grade_decay_rate: " <<
6496 p->hit_set_grade_decay_rate << "\n";
6497 break;
6498 case HIT_SET_SEARCH_LAST_N:
6499 ss << "hit_set_search_last_n: " <<
6500 p->hit_set_search_last_n << "\n";
6501 break;
28e407b8
AA
6502 case EC_OVERWRITES:
6503 ss << "allow_ec_overwrites: " <<
6504 (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
6505 "\n";
6506 break;
7c673cae
FG
6507 case HASHPSPOOL:
6508 case NODELETE:
6509 case NOPGCHANGE:
6510 case NOSIZECHANGE:
6511 case WRITE_FADVISE_DONTNEED:
6512 case NOSCRUB:
6513 case NODEEP_SCRUB:
6514 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6515 if (i->second == *it)
6516 break;
6517 }
11fdf7f2 6518 ceph_assert(i != ALL_CHOICES.end());
7c673cae
FG
6519 ss << i->first << ": " <<
6520 (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
6521 "true" : "false") << "\n";
6522 break;
6523 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6524 ss << "min_write_recency_for_promote: " <<
6525 p->min_write_recency_for_promote << "\n";
6526 break;
6527 case FAST_READ:
6528 ss << "fast_read: " << p->fast_read << "\n";
6529 break;
6530 case SCRUB_MIN_INTERVAL:
6531 case SCRUB_MAX_INTERVAL:
6532 case DEEP_SCRUB_INTERVAL:
6533 case RECOVERY_PRIORITY:
6534 case RECOVERY_OP_PRIORITY:
6535 case SCRUB_PRIORITY:
6536 case COMPRESSION_MODE:
6537 case COMPRESSION_ALGORITHM:
6538 case COMPRESSION_REQUIRED_RATIO:
6539 case COMPRESSION_MAX_BLOB_SIZE:
6540 case COMPRESSION_MIN_BLOB_SIZE:
6541 case CSUM_TYPE:
6542 case CSUM_MAX_BLOCK:
6543 case CSUM_MIN_BLOCK:
11fdf7f2
TL
6544 case FINGERPRINT_ALGORITHM:
6545 case PG_NUM_MIN:
6546 case TARGET_SIZE_BYTES:
6547 case TARGET_SIZE_RATIO:
6548 case PG_AUTOSCALE_BIAS:
f67539c2
TL
6549 case DEDUP_TIER:
6550 case DEDUP_CHUNK_ALGORITHM:
6551 case DEDUP_CDC_CHUNK_SIZE:
7c673cae
FG
6552 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6553 if (i->second == *it)
6554 break;
6555 }
11fdf7f2 6556 ceph_assert(i != ALL_CHOICES.end());
7c673cae
FG
6557 {
6558 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6559 if (p->opts.is_set(key)) {
6560 if(key == pool_opts_t::CSUM_TYPE) {
11fdf7f2 6561 int64_t val;
7c673cae
FG
6562 p->opts.get(key, &val);
6563 ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
6564 } else {
6565 ss << i->first << ": " << p->opts.get(key) << "\n";
6566 }
6567 }
6568 }
6569 break;
6570 }
6571 rdata.append(ss.str());
6572 ss.str("");
6573 }
6574 }
6575 r = 0;
7c673cae
FG
6576 } else if (prefix == "osd pool get-quota") {
6577 string pool_name;
9f95a23c 6578 cmd_getval(cmdmap, "pool", pool_name);
7c673cae
FG
6579
6580 int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
6581 if (poolid < 0) {
11fdf7f2 6582 ceph_assert(poolid == -ENOENT);
7c673cae
FG
6583 ss << "unrecognized pool '" << pool_name << "'";
6584 r = -ENOENT;
6585 goto reply;
6586 }
6587 const pg_pool_t *p = osdmap.get_pg_pool(poolid);
f67539c2 6588 const pool_stat_t* pstat = mon.mgrstatmon()->get_pool_stat(poolid);
9f95a23c 6589 const object_stat_sum_t& sum = pstat->stats.sum;
7c673cae
FG
6590 if (f) {
6591 f->open_object_section("pool_quotas");
6592 f->dump_string("pool_name", pool_name);
6593 f->dump_unsigned("pool_id", poolid);
6594 f->dump_unsigned("quota_max_objects", p->quota_max_objects);
9f95a23c 6595 f->dump_int("current_num_objects", sum.num_objects);
7c673cae 6596 f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
9f95a23c 6597 f->dump_int("current_num_bytes", sum.num_bytes);
7c673cae
FG
6598 f->close_section();
6599 f->flush(rdata);
6600 } else {
6601 stringstream rs;
6602 rs << "quotas for pool '" << pool_name << "':\n"
6603 << " max objects: ";
6604 if (p->quota_max_objects == 0)
6605 rs << "N/A";
9f95a23c 6606 else {
1adf2230 6607 rs << si_u_t(p->quota_max_objects) << " objects";
9f95a23c
TL
6608 rs << " (current num objects: " << sum.num_objects << " objects)";
6609 }
7c673cae
FG
6610 rs << "\n"
6611 << " max bytes : ";
6612 if (p->quota_max_bytes == 0)
6613 rs << "N/A";
9f95a23c 6614 else {
1adf2230 6615 rs << byte_u_t(p->quota_max_bytes);
9f95a23c
TL
6616 rs << " (current num bytes: " << sum.num_bytes << " bytes)";
6617 }
7c673cae
FG
6618 rdata.append(rs.str());
6619 }
6620 rdata.append("\n");
6621 r = 0;
6622 } else if (prefix == "osd crush rule list" ||
6623 prefix == "osd crush rule ls") {
c07f9fc5
FG
6624 if (f) {
6625 f->open_array_section("rules");
6626 osdmap.crush->list_rules(f.get());
6627 f->close_section();
6628 f->flush(rdata);
6629 } else {
6630 ostringstream ss;
6631 osdmap.crush->list_rules(&ss);
6632 rdata.append(ss.str());
6633 }
b5b8bbf5
FG
6634 } else if (prefix == "osd crush rule ls-by-class") {
6635 string class_name;
9f95a23c 6636 cmd_getval(cmdmap, "class", class_name);
b5b8bbf5
FG
6637 if (class_name.empty()) {
6638 ss << "no class specified";
6639 r = -EINVAL;
6640 goto reply;
6641 }
6642 set<int> rules;
6643 r = osdmap.crush->get_rules_by_class(class_name, &rules);
6644 if (r < 0) {
6645 ss << "failed to get rules by class '" << class_name << "'";
6646 goto reply;
6647 }
6648 if (f) {
6649 f->open_array_section("rules");
6650 for (auto &rule: rules) {
6651 f->dump_string("name", osdmap.crush->get_rule_name(rule));
6652 }
6653 f->close_section();
6654 f->flush(rdata);
6655 } else {
6656 ostringstream rs;
6657 for (auto &rule: rules) {
6658 rs << osdmap.crush->get_rule_name(rule) << "\n";
6659 }
6660 rdata.append(rs.str());
6661 }
7c673cae
FG
6662 } else if (prefix == "osd crush rule dump") {
6663 string name;
9f95a23c 6664 cmd_getval(cmdmap, "name", name);
7c673cae 6665 string format;
9f95a23c 6666 cmd_getval(cmdmap, "format", format);
7c673cae
FG
6667 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6668 if (name == "") {
6669 f->open_array_section("rules");
6670 osdmap.crush->dump_rules(f.get());
6671 f->close_section();
6672 } else {
6673 int ruleno = osdmap.crush->get_rule_id(name);
6674 if (ruleno < 0) {
31f18b77 6675 ss << "unknown crush rule '" << name << "'";
7c673cae
FG
6676 r = ruleno;
6677 goto reply;
6678 }
6679 osdmap.crush->dump_rule(ruleno, f.get());
6680 }
6681 ostringstream rs;
6682 f->flush(rs);
6683 rs << "\n";
6684 rdata.append(rs.str());
6685 } else if (prefix == "osd crush dump") {
6686 string format;
9f95a23c 6687 cmd_getval(cmdmap, "format", format);
7c673cae
FG
6688 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6689 f->open_object_section("crush_map");
6690 osdmap.crush->dump(f.get());
6691 f->close_section();
6692 ostringstream rs;
6693 f->flush(rs);
6694 rs << "\n";
6695 rdata.append(rs.str());
6696 } else if (prefix == "osd crush show-tunables") {
6697 string format;
9f95a23c 6698 cmd_getval(cmdmap, "format", format);
7c673cae
FG
6699 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6700 f->open_object_section("crush_map_tunables");
6701 osdmap.crush->dump_tunables(f.get());
6702 f->close_section();
6703 ostringstream rs;
6704 f->flush(rs);
6705 rs << "\n";
6706 rdata.append(rs.str());
6707 } else if (prefix == "osd crush tree") {
c07f9fc5 6708 string shadow;
9f95a23c 6709 cmd_getval(cmdmap, "shadow", shadow);
c07f9fc5
FG
6710 bool show_shadow = shadow == "--show-shadow";
6711 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6712 if (f) {
91327a77 6713 f->open_object_section("crush_tree");
c07f9fc5
FG
6714 osdmap.crush->dump_tree(nullptr,
6715 f.get(),
6716 osdmap.get_pool_names(),
6717 show_shadow);
91327a77 6718 f->close_section();
c07f9fc5
FG
6719 f->flush(rdata);
6720 } else {
6721 ostringstream ss;
6722 osdmap.crush->dump_tree(&ss,
6723 nullptr,
6724 osdmap.get_pool_names(),
6725 show_shadow);
6726 rdata.append(ss.str());
6727 }
d2e6a577
FG
6728 } else if (prefix == "osd crush ls") {
6729 string name;
9f95a23c 6730 if (!cmd_getval(cmdmap, "node", name)) {
d2e6a577
FG
6731 ss << "no node specified";
6732 r = -EINVAL;
6733 goto reply;
6734 }
6735 if (!osdmap.crush->name_exists(name)) {
6736 ss << "node '" << name << "' does not exist";
6737 r = -ENOENT;
6738 goto reply;
6739 }
6740 int id = osdmap.crush->get_item_id(name);
6741 list<int> result;
6742 if (id >= 0) {
6743 result.push_back(id);
6744 } else {
6745 int num = osdmap.crush->get_bucket_size(id);
6746 for (int i = 0; i < num; ++i) {
6747 result.push_back(osdmap.crush->get_bucket_item(id, i));
6748 }
6749 }
6750 if (f) {
6751 f->open_array_section("items");
6752 for (auto i : result) {
6753 f->dump_string("item", osdmap.crush->get_item_name(i));
6754 }
6755 f->close_section();
6756 f->flush(rdata);
6757 } else {
6758 ostringstream ss;
6759 for (auto i : result) {
6760 ss << osdmap.crush->get_item_name(i) << "\n";
6761 }
6762 rdata.append(ss.str());
6763 }
6764 r = 0;
7c673cae
FG
6765 } else if (prefix == "osd crush class ls") {
6766 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6767 f->open_array_section("crush_classes");
6768 for (auto i : osdmap.crush->class_name)
6769 f->dump_string("class", i.second);
6770 f->close_section();
6771 f->flush(rdata);
224ce89b
WB
6772 } else if (prefix == "osd crush class ls-osd") {
6773 string name;
9f95a23c 6774 cmd_getval(cmdmap, "class", name);
224ce89b
WB
6775 set<int> osds;
6776 osdmap.crush->get_devices_by_class(name, &osds);
b5b8bbf5
FG
6777 if (f) {
6778 f->open_array_section("osds");
6779 for (auto &osd: osds)
6780 f->dump_int("osd", osd);
6781 f->close_section();
6782 f->flush(rdata);
6783 } else {
6784 bool first = true;
6785 for (auto &osd : osds) {
6786 if (!first)
6787 ds << "\n";
6788 first = false;
6789 ds << osd;
6790 }
6791 rdata.append(ds);
6792 }
11fdf7f2
TL
6793 } else if (prefix == "osd crush get-device-class") {
6794 vector<string> idvec;
9f95a23c 6795 cmd_getval(cmdmap, "ids", idvec);
11fdf7f2
TL
6796 map<int, string> class_by_osd;
6797 for (auto& id : idvec) {
6798 ostringstream ts;
6799 long osd = parse_osd_id(id.c_str(), &ts);
6800 if (osd < 0) {
6801 ss << "unable to parse osd id:'" << id << "'";
6802 r = -EINVAL;
6803 goto reply;
6804 }
6805 auto device_class = osdmap.crush->get_item_class(osd);
6806 if (device_class)
6807 class_by_osd[osd] = device_class;
6808 else
6809 class_by_osd[osd] = ""; // no class
6810 }
6811 if (f) {
6812 f->open_array_section("osd_device_classes");
6813 for (auto& i : class_by_osd) {
6814 f->open_object_section("osd_device_class");
6815 f->dump_int("osd", i.first);
6816 f->dump_string("device_class", i.second);
6817 f->close_section();
6818 }
6819 f->close_section();
6820 f->flush(rdata);
6821 } else {
6822 if (class_by_osd.size() == 1) {
6823 // for single input, make a clean output
6824 ds << class_by_osd.begin()->second;
6825 } else {
6826 // note that we do not group osds by class here
6827 for (auto it = class_by_osd.begin();
6828 it != class_by_osd.end();
6829 it++) {
6830 ds << "osd." << it->first << ' ' << it->second;
6831 if (next(it) != class_by_osd.end())
6832 ds << '\n';
6833 }
6834 }
6835 rdata.append(ds);
6836 }
7c673cae
FG
6837 } else if (prefix == "osd erasure-code-profile ls") {
6838 const auto &profiles = osdmap.get_erasure_code_profiles();
6839 if (f)
6840 f->open_array_section("erasure-code-profiles");
6841 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
6842 if (f)
6843 f->dump_string("profile", i->first.c_str());
6844 else
6845 rdata.append(i->first + "\n");
6846 }
6847 if (f) {
6848 f->close_section();
6849 ostringstream rs;
6850 f->flush(rs);
6851 rs << "\n";
6852 rdata.append(rs.str());
6853 }
c07f9fc5
FG
6854 } else if (prefix == "osd crush weight-set ls") {
6855 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6856 if (f) {
6857 f->open_array_section("weight_sets");
6858 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6859 f->dump_string("pool", "(compat)");
6860 }
6861 for (auto& i : osdmap.crush->choose_args) {
6862 if (i.first >= 0) {
6863 f->dump_string("pool", osdmap.get_pool_name(i.first));
6864 }
6865 }
6866 f->close_section();
6867 f->flush(rdata);
6868 } else {
6869 ostringstream rs;
6870 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6871 rs << "(compat)\n";
6872 }
6873 for (auto& i : osdmap.crush->choose_args) {
6874 if (i.first >= 0) {
6875 rs << osdmap.get_pool_name(i.first) << "\n";
6876 }
6877 }
6878 rdata.append(rs.str());
6879 }
6880 } else if (prefix == "osd crush weight-set dump") {
6881 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6882 "json-pretty"));
6883 osdmap.crush->dump_choose_args(f.get());
6884 f->flush(rdata);
7c673cae
FG
6885 } else if (prefix == "osd erasure-code-profile get") {
6886 string name;
9f95a23c 6887 cmd_getval(cmdmap, "name", name);
7c673cae
FG
6888 if (!osdmap.has_erasure_code_profile(name)) {
6889 ss << "unknown erasure code profile '" << name << "'";
6890 r = -ENOENT;
6891 goto reply;
6892 }
6893 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
6894 if (f)
6895 f->open_object_section("profile");
6896 for (map<string,string>::const_iterator i = profile.begin();
6897 i != profile.end();
6898 ++i) {
6899 if (f)
6900 f->dump_string(i->first.c_str(), i->second.c_str());
6901 else
6902 rdata.append(i->first + "=" + i->second + "\n");
6903 }
6904 if (f) {
6905 f->close_section();
6906 ostringstream rs;
6907 f->flush(rs);
6908 rs << "\n";
6909 rdata.append(rs.str());
6910 }
181888fb
FG
6911 } else if (prefix == "osd pool application get") {
6912 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6913 "json-pretty"));
6914 string pool_name;
9f95a23c 6915 cmd_getval(cmdmap, "pool", pool_name);
181888fb 6916 string app;
9f95a23c 6917 cmd_getval(cmdmap, "app", app);
181888fb 6918 string key;
9f95a23c 6919 cmd_getval(cmdmap, "key", key);
181888fb
FG
6920
6921 if (pool_name.empty()) {
6922 // all
6923 f->open_object_section("pools");
6924 for (const auto &pool : osdmap.pools) {
6925 std::string name("<unknown>");
6926 const auto &pni = osdmap.pool_name.find(pool.first);
6927 if (pni != osdmap.pool_name.end())
6928 name = pni->second;
6929 f->open_object_section(name.c_str());
6930 for (auto &app_pair : pool.second.application_metadata) {
6931 f->open_object_section(app_pair.first.c_str());
6932 for (auto &kv_pair : app_pair.second) {
6933 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6934 }
6935 f->close_section();
6936 }
6937 f->close_section(); // name
6938 }
6939 f->close_section(); // pools
6940 f->flush(rdata);
6941 } else {
6942 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6943 if (pool < 0) {
6944 ss << "unrecognized pool '" << pool_name << "'";
6945 r = -ENOENT;
6946 goto reply;
6947 }
6948 auto p = osdmap.get_pg_pool(pool);
6949 // filter by pool
6950 if (app.empty()) {
6951 f->open_object_section(pool_name.c_str());
6952 for (auto &app_pair : p->application_metadata) {
6953 f->open_object_section(app_pair.first.c_str());
6954 for (auto &kv_pair : app_pair.second) {
6955 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6956 }
6957 f->close_section(); // application
6958 }
6959 f->close_section(); // pool_name
6960 f->flush(rdata);
6961 goto reply;
6962 }
6963
6964 auto app_it = p->application_metadata.find(app);
6965 if (app_it == p->application_metadata.end()) {
6966 ss << "pool '" << pool_name << "' has no application '" << app << "'";
6967 r = -ENOENT;
6968 goto reply;
6969 }
6970 // filter by pool + app
6971 if (key.empty()) {
6972 f->open_object_section(app_it->first.c_str());
6973 for (auto &kv_pair : app_it->second) {
6974 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6975 }
6976 f->close_section(); // application
6977 f->flush(rdata);
6978 goto reply;
6979 }
6980 // filter by pool + app + key
6981 auto key_it = app_it->second.find(key);
6982 if (key_it == app_it->second.end()) {
6983 ss << "application '" << app << "' on pool '" << pool_name
6984 << "' does not have key '" << key << "'";
6985 r = -ENOENT;
6986 goto reply;
6987 }
6988 ss << key_it->second << "\n";
6989 rdata.append(ss.str());
6990 ss.str("");
6991 }
11fdf7f2 6992 } else if (prefix == "osd get-require-min-compat-client") {
9f95a23c 6993 ss << osdmap.require_min_compat_client << std::endl;
11fdf7f2
TL
6994 rdata.append(ss.str());
6995 ss.str("");
6996 goto reply;
6997 } else if (prefix == "osd pool application enable" ||
6998 prefix == "osd pool application disable" ||
6999 prefix == "osd pool application set" ||
7000 prefix == "osd pool application rm") {
7001 bool changed = false;
7002 r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed);
7003 if (r != 0) {
7004 // Error, reply.
7005 goto reply;
7006 } else if (changed) {
7007 // Valid mutation, proceed to prepare phase
7008 return false;
7009 } else {
7010 // Idempotent case, reply
7011 goto reply;
7012 }
7c673cae
FG
7013 } else {
7014 // try prepare update
7015 return false;
7016 }
7017
7018 reply:
7019 string rs;
7020 getline(ss, rs);
f67539c2 7021 mon.reply_command(op, r, rs, rdata, get_last_committed());
7c673cae
FG
7022 return true;
7023}
7024
3efd9988
FG
7025void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
7026{
7027 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
7028 osdmap.get_pg_pool(pool_id));
11fdf7f2 7029 ceph_assert(pool);
3efd9988
FG
7030 pool->set_flag(flags);
7031}
7032
7033void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
7c673cae 7034{
3efd9988
FG
7035 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
7036 osdmap.get_pg_pool(pool_id));
11fdf7f2 7037 ceph_assert(pool);
3efd9988 7038 pool->unset_flag(flags);
7c673cae
FG
7039}
7040
9f95a23c 7041string OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch)
11fdf7f2
TL
7042{
7043 char k[80];
9f95a23c 7044 snprintf(k, sizeof(k), "purged_epoch_%08lx", (unsigned long)epoch);
11fdf7f2
TL
7045 return k;
7046}
7047
9f95a23c 7048string OSDMonitor::make_purged_snap_key(int64_t pool, snapid_t snap)
11fdf7f2
TL
7049{
7050 char k[80];
9f95a23c 7051 snprintf(k, sizeof(k), "purged_snap_%llu_%016llx",
11fdf7f2
TL
7052 (unsigned long long)pool, (unsigned long long)snap);
7053 return k;
7054}
7055
// Build both the key and the value for a purged-snap record covering the
// run [snap, snap+num).  The value encodes (begin, end, epoch); the key
// is derived from the *last* snapid in the run so that a forward
// (lower_bound) scan starting from any snapid inside the interval lands
// on this record.
string OSDMonitor::make_purged_snap_key_value(
  int64_t pool, snapid_t snap, snapid_t num,
  epoch_t epoch, bufferlist *v)
{
  // encode the interval bounds and epoch into the value; forward
  // iteration only is needed to search for an epoch in an interval.
  encode(snap, *v);        // begin (inclusive)
  encode(snap + num, *v);  // end (exclusive)
  encode(epoch, *v);
  // key on the last snapid of the run so lower_bound() finds the interval
  return make_purged_snap_key(pool, snap + num - 1);
}
7067
11fdf7f2 7068
9f95a23c
TL
7069int OSDMonitor::lookup_purged_snap(
7070 int64_t pool, snapid_t snap,
7071 snapid_t *begin, snapid_t *end)
11fdf7f2 7072{
9f95a23c 7073 string k = make_purged_snap_key(pool, snap);
f67539c2 7074 auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
11fdf7f2
TL
7075 it->lower_bound(k);
7076 if (!it->valid()) {
9f95a23c
TL
7077 dout(20) << __func__
7078 << " pool " << pool << " snap " << snap
7079 << " - key '" << k << "' not found" << dendl;
7080 return -ENOENT;
7081 }
7082 if (it->key().find("purged_snap_") != 0) {
7083 dout(20) << __func__
7084 << " pool " << pool << " snap " << snap
7085 << " - key '" << k << "' got '" << it->key()
7086 << "', wrong prefix" << dendl;
11fdf7f2
TL
7087 return -ENOENT;
7088 }
9f95a23c
TL
7089 string gotk = it->key();
7090 const char *format = "purged_snap_%llu_";
7091 long long int keypool;
7092 int n = sscanf(gotk.c_str(), format, &keypool);
7093 if (n != 1) {
7094 derr << __func__ << " invalid k '" << gotk << "'" << dendl;
7095 return -ENOENT;
7096 }
7097 if (pool != keypool) {
7098 dout(20) << __func__
7099 << " pool " << pool << " snap " << snap
7100 << " - key '" << k << "' got '" << gotk
7101 << "', wrong pool " << keypool
7102 << dendl;
11fdf7f2
TL
7103 return -ENOENT;
7104 }
7105 bufferlist v = it->value();
7106 auto p = v.cbegin();
7107 decode(*begin, p);
7108 decode(*end, p);
7109 if (snap < *begin || snap >= *end) {
9f95a23c
TL
7110 dout(20) << __func__
7111 << " pool " << pool << " snap " << snap
7112 << " - found [" << *begin << "," << *end << "), no overlap"
7113 << dendl;
11fdf7f2
TL
7114 return -ENOENT;
7115 }
7116 return 0;
7117}
7118
9f95a23c
TL
// Record that snapids [start,end) of @pool have been purged, staging the
// mon-store writes in transaction @t.  Adjacent on-disk intervals are
// coalesced: we probe for an interval ending just before @start and one
// beginning at @end, and merge with whichever exists.
//
// NOTE(review): the @epoch parameter is never used here — every record is
// written with pending_inc.epoch instead.  Confirm whether callers rely
// on passing a different epoch.
void OSDMonitor::insert_purged_snap_update(
  int64_t pool,
  snapid_t start, snapid_t end,
  epoch_t epoch,
  MonitorDBStore::TransactionRef t)
{
  snapid_t before_begin, before_end;
  snapid_t after_begin, after_end;
  // b == 0: an existing interval contains start-1 (i.e. abuts/overlaps on
  // the left).  a == 0: an existing interval contains end (right side).
  int b = lookup_purged_snap(pool, start - 1,
			     &before_begin, &before_end);
  int a = lookup_purged_snap(pool, end,
			     &after_begin, &after_end);
  if (!b && !a) {
    // bridges two existing intervals: [before_begin,after_end)
    dout(10) << __func__
	     << " [" << start << "," << end << ") - joins ["
	     << before_begin << "," << before_end << ") and ["
	     << after_begin << "," << after_end << ")" << dendl;
    // erase only the begin record; we'll overwrite the end one.
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, after_end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!b) {
    // extends an earlier interval to the right: [before_begin,end)
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with earlier ["
	     << before_begin << "," << before_end << ")" << dendl;
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!a) {
    // extends a later interval to the left: [start,after_end)
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with later ["
	     << after_begin << "," << after_end << ")" << dendl;
    // overwrite after record (keys are by last snapid, which is unchanged)
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, after_end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else {
    // no neighbors: write a fresh standalone interval [start,end)
    dout(10) << __func__
	     << " [" << start << "," << end << ") - new"
	     << dendl;
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  }
}
7174
11fdf7f2
TL
// Try to prune snapids from the osdmap's removed_snaps_queue that the
// OSDs (via the mgr digest) report as fully purged.  Work is capped per
// epoch by mon_max_snap_prune_per_epoch.
//
// Returns true if anything was staged into pending_inc.new_purged_snaps
// (i.e. an osdmap change is warranted), false otherwise.
bool OSDMonitor::try_prune_purged_snaps()
{
  // need a readable mgr stat digest to know what the OSDs have purged
  if (!mon.mgrstatmon()->is_readable()) {
    return false;
  }
  if (!pending_inc.new_purged_snaps.empty()) {
    return false; // we already pruned for this epoch
  }

  unsigned max_prune = cct->_conf.get_val<uint64_t>(
    "mon_max_snap_prune_per_epoch");
  if (!max_prune) {
    max_prune = 100000;  // 0 means "no limit"; use a large default cap
  }
  dout(10) << __func__ << " max_prune " << max_prune << dendl;

  unsigned actually_pruned = 0;
  auto& purged_snaps = mon.mgrstatmon()->get_digest().purged_snaps;
  for (auto& p : osdmap.get_pools()) {
    auto q = purged_snaps.find(p.first);
    if (q == purged_snaps.end()) {
      continue;
    }
    auto& purged = q->second;
    if (purged.empty()) {
      dout(20) << __func__ << " " << p.first << " nothing purged" << dendl;
      continue;
    }
    dout(20) << __func__ << " pool " << p.first << " purged " << purged << dendl;
    snap_interval_set_t to_prune;
    // maybe_pruned tracks the would-be total so we can stop mid-pool
    unsigned maybe_pruned = actually_pruned;
    for (auto i = purged.begin(); i != purged.end(); ++i) {
      snapid_t begin = i.get_start();
      auto end = i.get_start() + i.get_len();
      snapid_t pbegin = 0, pend = 0;
      int r = lookup_purged_snap(p.first, begin, &pbegin, &pend);
      if (r == 0) {
	// already purged.
	// be a bit aggressive about backing off here, because the mon may
	// do a lot of work going through this set, and if we know the
	// purged set from the OSDs is at least *partly* stale we may as
	// well wait for it to be fresh.
	dout(20) << __func__ << " we've already purged " << pbegin
		 << "~" << (pend - pbegin) << dendl;
	break; // next pool
      }
      if (pbegin && pbegin > begin && pbegin < end) {
	// the tail of [begin,end) is purged; shorten the range
	end = pbegin;
      }
      to_prune.insert(begin, end - begin);
      maybe_pruned += end - begin;
      if (maybe_pruned >= max_prune) {
	break;
      }
    }
    if (!to_prune.empty()) {
      // PGs may still be reporting things as purged that we have already
      // pruned from removed_snaps_queue.
      snap_interval_set_t actual;
      auto r = osdmap.removed_snaps_queue.find(p.first);
      if (r != osdmap.removed_snaps_queue.end()) {
	actual.intersection_of(to_prune, r->second);
      }
      actually_pruned += actual.size();
      dout(10) << __func__ << " pool " << p.first << " reports pruned " << to_prune
	       << ", actual pruned " << actual << dendl;
      if (!actual.empty()) {
	pending_inc.new_purged_snaps[p.first].swap(actual);
      }
    }
    if (actually_pruned >= max_prune) {
      break;
    }
  }
  dout(10) << __func__ << " actually pruned " << actually_pruned << dendl;
  return !!actually_pruned;
}
7253
7c673cae
FG
// Reconcile each pool's quota-full flags against current usage from the
// mgr stat digest: set FLAG_FULL_QUOTA|FLAG_FULL when a pool reaches its
// byte/object quota, clear them when it drops back under.
//
// Returns true if any pool's flags were changed in pending_inc.
bool OSDMonitor::update_pools_status()
{
  // usage numbers come from the mgr stat digest; bail if not readable yet
  if (!mon.mgrstatmon()->is_readable())
    return false;

  bool ret = false;

  auto& pools = osdmap.get_pools();
  for (auto it = pools.begin(); it != pools.end(); ++it) {
    const pool_stat_t *pstat = mon.mgrstatmon()->get_pool_stat(it->first);
    if (!pstat)
      continue;
    const object_stat_sum_t& sum = pstat->stats.sum;
    const pg_pool_t &pool = it->second;
    const string& pool_name = osdmap.get_pool_name(it->first);

    // quota of 0 means "no quota" for that dimension
    bool pool_is_full =
      (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
      (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);

    if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
      // currently flagged full-by-quota; clear if usage has dropped
      if (pool_is_full)
	continue;

      mon.clog->info() << "pool '" << pool_name
                       << "' no longer out of quota; removing NO_QUOTA flag";
      // below we cancel FLAG_FULL too, we'll set it again in
      // OSDMonitor::encode_pending if it still fails the osd-full checking.
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      ret = true;
    } else {
      // not flagged yet; set flags if a quota has been reached
      if (!pool_is_full)
	continue;

      if (pool.quota_max_bytes > 0 &&
          (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
        mon.clog->warn() << "pool '" << pool_name << "' is full"
                         << " (reached quota's max_bytes: "
                         << byte_u_t(pool.quota_max_bytes) << ")";
      }
      if (pool.quota_max_objects > 0 &&
	  (uint64_t)sum.num_objects >= pool.quota_max_objects) {
        mon.clog->warn() << "pool '" << pool_name << "' is full"
                         << " (reached quota's max_objects: "
                         << pool.quota_max_objects << ")";
      }
      // set both FLAG_FULL_QUOTA and FLAG_FULL
      // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
      // since FLAG_FULL should always take precedence
      set_pool_flags(it->first,
		     pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_NEARFULL |
		       pg_pool_t::FLAG_BACKFILLFULL);
      ret = true;
    }
  }
  return ret;
}
7314
7c673cae
FG
7315int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
7316{
7317 op->mark_osdmon_event(__func__);
9f95a23c 7318 auto m = op->get_req<MPoolOp>();
7c673cae 7319 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
11fdf7f2 7320 MonSession *session = op->get_session();
7c673cae
FG
7321 if (!session)
7322 return -EPERM;
7323 string erasure_code_profile;
7324 stringstream ss;
31f18b77 7325 string rule_name;
94b18763 7326 int ret = 0;
11fdf7f2
TL
7327 ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
7328 0, 0, 0, 0, 0, 0.0,
7329 erasure_code_profile,
9f95a23c
TL
7330 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, {},
7331 &ss);
94b18763
FG
7332
7333 if (ret < 0) {
7334 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
7335 }
7336 return ret;
7c673cae
FG
7337}
7338
7339int OSDMonitor::crush_rename_bucket(const string& srcname,
7340 const string& dstname,
7341 ostream *ss)
7342{
7343 int ret;
7344 //
7345 // Avoid creating a pending crush if it does not already exists and
7346 // the rename would fail.
7347 //
7348 if (!_have_pending_crush()) {
7349 ret = _get_stable_crush().can_rename_bucket(srcname,
7350 dstname,
7351 ss);
7352 if (ret)
7353 return ret;
7354 }
7355
7356 CrushWrapper newcrush;
7357 _get_pending_crush(newcrush);
7358
7359 ret = newcrush.rename_bucket(srcname,
7360 dstname,
7361 ss);
7362 if (ret)
7363 return ret;
7364
7365 pending_inc.crush.clear();
f67539c2 7366 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7c673cae
FG
7367 *ss << "renamed bucket " << srcname << " into " << dstname;
7368 return 0;
7369}
7370
7371void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
7372{
7373 string replacement = "";
7374
7375 if (plugin == "jerasure_generic" ||
7376 plugin == "jerasure_sse3" ||
7377 plugin == "jerasure_sse4" ||
7378 plugin == "jerasure_neon") {
7379 replacement = "jerasure";
7380 } else if (plugin == "shec_generic" ||
7381 plugin == "shec_sse3" ||
7382 plugin == "shec_sse4" ||
7383 plugin == "shec_neon") {
7384 replacement = "shec";
7385 }
7386
7387 if (replacement != "") {
7388 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
7389 << plugin << " that has been deprecated. Please use "
7390 << replacement << " instead." << dendl;
7391 }
7392}
7393
7394int OSDMonitor::normalize_profile(const string& profilename,
7395 ErasureCodeProfile &profile,
7396 bool force,
7397 ostream *ss)
7398{
7399 ErasureCodeInterfaceRef erasure_code;
7400 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
7401 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
7402 check_legacy_ec_plugin(plugin->second, profilename);
7403 int err = instance.factory(plugin->second,
11fdf7f2 7404 g_conf().get_val<std::string>("erasure_code_dir"),
7c673cae
FG
7405 profile, &erasure_code, ss);
7406 if (err) {
7407 return err;
7408 }
7409
7410 err = erasure_code->init(profile, ss);
7411 if (err) {
7412 return err;
7413 }
7414
7415 auto it = profile.find("stripe_unit");
7416 if (it != profile.end()) {
7417 string err_str;
1adf2230 7418 uint32_t stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
7c673cae
FG
7419 if (!err_str.empty()) {
7420 *ss << "could not parse stripe_unit '" << it->second
7421 << "': " << err_str << std::endl;
7422 return -EINVAL;
7423 }
7424 uint32_t data_chunks = erasure_code->get_data_chunk_count();
7425 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
7426 if (chunk_size != stripe_unit) {
7427 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
7428 << "alignment. Would be padded to " << chunk_size
7429 << std::endl;
7430 return -EINVAL;
7431 }
7432 if ((stripe_unit % 4096) != 0 && !force) {
7433 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
7434 << "use --force to override this check" << std::endl;
7435 return -EINVAL;
7436 }
7437 }
7438 return 0;
7439}
7440
// Create (or find) the CRUSH rule for an erasure-coded pool.
//
// Return-code protocol (callers depend on it — see prepare_pool_crush_rule):
//   -EEXIST  : rule already exists in the committed map; *rule is set
//   -EALREADY: rule exists only in the pending map; caller must wait for
//              the proposal to commit and retry
//   0        : rule was created in the pending map; *rule is set
//   <0       : other error
int OSDMonitor::crush_rule_create_erasure(const string &name,
					  const string &profile,
					  int *rule,
					  ostream *ss)
{
  // Already in the committed crush map?
  int ruleid = osdmap.crush->get_rule_id(name);
  if (ruleid != -ENOENT) {
    *rule = osdmap.crush->get_rule_mask_ruleset(ruleid);
    return -EEXIST;
  }

  CrushWrapper newcrush;
  _get_pending_crush(newcrush);

  // Already created by a pending (not yet committed) change?
  ruleid = newcrush.get_rule_id(name);
  if (ruleid != -ENOENT) {
    *rule = newcrush.get_rule_mask_ruleset(ruleid);
    return -EALREADY;
  } else {
    // Instantiate the plugin for the profile; it knows how to build a
    // rule matching its failure-domain requirements.
    ErasureCodeInterfaceRef erasure_code;
    int err = get_erasure_code(profile, &erasure_code, ss);
    if (err) {
      *ss << "failed to load plugin using profile " << profile << std::endl;
      return err;
    }

    err = erasure_code->create_rule(name, newcrush, ss);
    erasure_code.reset();
    if (err < 0)
      return err;
    // create_rule() returns the new rule id on success.
    *rule = err;
    // Stage the modified crush map in the pending incremental.
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
    return 0;
  }
}
7477
7478int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
7479 ErasureCodeInterfaceRef *erasure_code,
7480 ostream *ss) const
7481{
7482 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
7483 return -EAGAIN;
7484 ErasureCodeProfile profile =
7485 osdmap.get_erasure_code_profile(erasure_code_profile);
7486 ErasureCodeProfile::const_iterator plugin =
7487 profile.find("plugin");
7488 if (plugin == profile.end()) {
7489 *ss << "cannot determine the erasure code plugin"
7490 << " because there is no 'plugin' entry in the erasure_code_profile "
7491 << profile << std::endl;
7492 return -EINVAL;
7493 }
7494 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
f67539c2 7495 auto& instance = ErasureCodePluginRegistry::instance();
7c673cae 7496 return instance.factory(plugin->second,
11fdf7f2 7497 g_conf().get_val<std::string>("erasure_code_dir"),
7c673cae
FG
7498 profile, erasure_code, ss);
7499}
7500
7501int OSDMonitor::check_cluster_features(uint64_t features,
7502 stringstream &ss)
7503{
7504 stringstream unsupported_ss;
7505 int unsupported_count = 0;
f67539c2 7506 if ((mon.get_quorum_con_features() & features) != features) {
7c673cae
FG
7507 unsupported_ss << "the monitor cluster";
7508 ++unsupported_count;
7509 }
7510
7511 set<int32_t> up_osds;
7512 osdmap.get_up_osds(up_osds);
7513 for (set<int32_t>::iterator it = up_osds.begin();
7514 it != up_osds.end(); ++it) {
7515 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
7516 if ((xi.features & features) != features) {
7517 if (unsupported_count > 0)
7518 unsupported_ss << ", ";
7519 unsupported_ss << "osd." << *it;
7520 unsupported_count ++;
7521 }
7522 }
7523
7524 if (unsupported_count > 0) {
7525 ss << "features " << features << " unsupported by: "
7526 << unsupported_ss.str();
7527 return -ENOTSUP;
7528 }
7529
7530 // check pending osd state, too!
7531 for (map<int32_t,osd_xinfo_t>::const_iterator p =
7532 pending_inc.new_xinfo.begin();
7533 p != pending_inc.new_xinfo.end(); ++p) {
7534 const osd_xinfo_t &xi = p->second;
7535 if ((xi.features & features) != features) {
7536 dout(10) << __func__ << " pending osd." << p->first
7537 << " features are insufficient; retry" << dendl;
7538 return -EAGAIN;
7539 }
7540 }
7541
7542 return 0;
7543}
7544
// Check whether applying `newcrush` would violate compatibility constraints:
// either the pool's configured require_min_compat_client, or the feature bits
// supported by the current mon/OSD population.  Returns true when the new
// crush map is acceptable; on false, ss explains why.
bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
						 stringstream& ss)
{
  // Build a throwaway map with the candidate crush map applied, so we can
  // query the resulting feature requirements without touching pending_inc.
  OSDMap::Incremental new_pending = pending_inc;
  encode(*newcrush, new_pending.crush, mon.get_quorum_con_features());
  OSDMap newmap;
  newmap.deepish_copy_from(osdmap);
  newmap.apply_incremental(new_pending);

  // client compat: the new crush map must not demand a newer client than
  // the operator has committed to supporting.
  if (newmap.require_min_compat_client != ceph_release_t::unknown) {
    auto mv = newmap.get_min_compat_client();
    if (mv > newmap.require_min_compat_client) {
      ss << "new crush map requires client version " << mv
	 << " but require_min_compat_client is "
	 << newmap.require_min_compat_client;
      return false;
    }
  }

  // osd compat: every mon and up OSD must support the features the new
  // map would require.
  uint64_t features =
    newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
    newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
  stringstream features_ss;
  int r = check_cluster_features(features, features_ss);
  if (r) {
    ss << "Could not change CRUSH: " << features_ss.str();
    return false;
  }

  return true;
}
7578
7579bool OSDMonitor::erasure_code_profile_in_use(
7580 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
7581 const string &profile,
7582 ostream *ss)
7583{
7584 bool found = false;
7585 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
7586 p != pools.end();
7587 ++p) {
11fdf7f2 7588 if (p->second.erasure_code_profile == profile && p->second.is_erasure()) {
7c673cae
FG
7589 *ss << osdmap.pool_name[p->first] << " ";
7590 found = true;
7591 }
7592 }
7593 if (found) {
7594 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
7595 }
7596 return found;
7597}
7598
7599int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
7600 map<string,string> *erasure_code_profile_map,
7601 ostream *ss)
7602{
11fdf7f2
TL
7603 int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile",
7604 get_json_str_map,
7605 *ss,
7606 erasure_code_profile_map,
7607 true);
7c673cae
FG
7608 if (r)
7609 return r;
11fdf7f2 7610 ceph_assert((*erasure_code_profile_map).count("plugin"));
7c673cae
FG
7611 string default_plugin = (*erasure_code_profile_map)["plugin"];
7612 map<string,string> user_map;
7613 for (vector<string>::const_iterator i = erasure_code_profile.begin();
7614 i != erasure_code_profile.end();
7615 ++i) {
7616 size_t equal = i->find('=');
7617 if (equal == string::npos) {
7618 user_map[*i] = string();
7619 (*erasure_code_profile_map)[*i] = string();
7620 } else {
11fdf7f2 7621 const string key = i->substr(0, equal);
7c673cae
FG
7622 equal++;
7623 const string value = i->substr(equal);
11fdf7f2
TL
7624 if (key.find("ruleset-") == 0) {
7625 *ss << "property '" << key << "' is no longer supported; try "
7626 << "'crush-" << key.substr(8) << "' instead";
7627 return -EINVAL;
3efd9988 7628 }
7c673cae
FG
7629 user_map[key] = value;
7630 (*erasure_code_profile_map)[key] = value;
7631 }
7632 }
7633
7634 if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
7635 (*erasure_code_profile_map) = user_map;
7636
7637 return 0;
7638}
7639
7640int OSDMonitor::prepare_pool_size(const unsigned pool_type,
7641 const string &erasure_code_profile,
11fdf7f2 7642 uint8_t repl_size,
7c673cae
FG
7643 unsigned *size, unsigned *min_size,
7644 ostream *ss)
7645{
7646 int err = 0;
f67539c2 7647 bool set_min_size = false;
7c673cae
FG
7648 switch (pool_type) {
7649 case pg_pool_t::TYPE_REPLICATED:
f67539c2
TL
7650 if (osdmap.stretch_mode_enabled) {
7651 if (repl_size == 0)
7652 repl_size = g_conf().get_val<uint64_t>("mon_stretch_pool_size");
7653 if (repl_size != g_conf().get_val<uint64_t>("mon_stretch_pool_size")) {
7654 *ss << "prepare_pool_size: we are in stretch mode but size "
7655 << repl_size << " does not match!";
7656 return -EINVAL;
7657 }
7658 *min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
7659 set_min_size = true;
7660 }
11fdf7f2
TL
7661 if (repl_size == 0) {
7662 repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
7663 }
7664 *size = repl_size;
f67539c2
TL
7665 if (!set_min_size)
7666 *min_size = g_conf().get_osd_pool_default_min_size(repl_size);
7c673cae
FG
7667 break;
7668 case pg_pool_t::TYPE_ERASURE:
7669 {
f67539c2
TL
7670 if (osdmap.stretch_mode_enabled) {
7671 *ss << "prepare_pool_size: we are in stretch mode; cannot create EC pools!";
7672 return -EINVAL;
7673 }
7c673cae
FG
7674 ErasureCodeInterfaceRef erasure_code;
7675 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
7676 if (err == 0) {
7677 *size = erasure_code->get_chunk_count();
11fdf7f2
TL
7678 *min_size =
7679 erasure_code->get_data_chunk_count() +
7680 std::min<int>(1, erasure_code->get_coding_chunk_count() - 1);
7681 assert(*min_size <= *size);
7682 assert(*min_size >= erasure_code->get_data_chunk_count());
7c673cae
FG
7683 }
7684 }
7685 break;
7686 default:
7687 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
7688 err = -EINVAL;
7689 break;
7690 }
7691 return err;
7692}
7693
// Compute the stripe_width for a new pool.  Replicated pools do not stripe
// (output left untouched); erasure pools stripe across the profile's data
// chunks, using the profile's stripe_unit or the configured default.
int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
					  const string &erasure_code_profile,
					  uint32_t *stripe_width,
					  ostream *ss)
{
  int err = 0;
  switch (pool_type) {
  case pg_pool_t::TYPE_REPLICATED:
    // ignored
    break;
  case pg_pool_t::TYPE_ERASURE:
    {
      ErasureCodeProfile profile =
	osdmap.get_erasure_code_profile(erasure_code_profile);
      ErasureCodeInterfaceRef erasure_code;
      err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
      if (err)
	break;
      uint32_t data_chunks = erasure_code->get_data_chunk_count();
      // Default stripe_unit from config; the profile may override it.
      uint32_t stripe_unit = g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit");
      auto it = profile.find("stripe_unit");
      if (it != profile.end()) {
	string err_str;
	stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
	// normalize_profile() already validated this value at profile
	// creation time, so a parse failure here is a bug.
	ceph_assert(err_str.empty());
      }
      // get_chunk_size() may round up for plugin alignment; multiply back
      // by data_chunks for the full stripe width.
      *stripe_width = data_chunks *
	erasure_code->get_chunk_size(stripe_unit * data_chunks);
    }
    break;
  default:
    *ss << "prepare_pool_stripe_width: "
	<< pool_type << " is not a known pool type";
    err = -EINVAL;
    break;
  }
  return err;
}
7732
522d829b
TL
7733int OSDMonitor::get_replicated_stretch_crush_rule()
7734{
7735 /* we don't write down the stretch rule anywhere, so
7736 * we have to guess it. How? Look at all the pools
7737 * and count up how many times a given rule is used
7738 * on stretch pools and then return the one with
7739 * the most users!
7740 */
7741 map<int,int> rule_counts;
7742 for (const auto& pooli : osdmap.pools) {
7743 const pg_pool_t& p = pooli.second;
7744 if (p.is_replicated() && p.is_stretch_pool()) {
7745 if (!rule_counts.count(p.crush_rule)) {
7746 rule_counts[p.crush_rule] = 1;
7747 } else {
7748 ++rule_counts[p.crush_rule];
7749 }
7750 }
7751 }
7752
7753 if (rule_counts.empty()) {
7754 return -ENOENT;
7755 }
7756
7757 int most_used_count = 0;
7758 int most_used_rule = -1;
7759 for (auto i : rule_counts) {
7760 if (i.second > most_used_count) {
7761 most_used_rule = i.first;
7762 most_used_count = i.second;
7763 }
7764 }
7765 ceph_assert(most_used_count > 0);
7766 ceph_assert(most_used_rule >= 0);
7767 return most_used_rule;
7768}
7769
// Resolve the crush rule for a new pool.
//
// If *crush_rule >= 0 it is taken as an explicit rule id and only validated.
// Otherwise the rule is looked up (or, for erasure pools, created) by name;
// an empty name selects the configured/stretch default.
//
// Returns 0 with *crush_rule set, -EAGAIN when the caller must wait for a
// pending proposal and retry, or a negative errno.
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  if (osdmap.stretch_mode_enabled) {
	    // stretch rule is not recorded; infer it from existing pools
	    *crush_rule = get_replicated_stretch_crush_rule();
	  } else {
	    // Use default rule
	    *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(cct);
	  }
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
		<< "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	// Remap crush_rule_create_erasure()'s return codes:
	//  -EALREADY (pending) and 0 (just created) both become -EAGAIN —
	//  the rule must be committed before the pool can use it;
	//  -EEXIST (already committed) is success.
	switch (err) {
	case -EALREADY:
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
    }
  } else {
    // Explicit rule id: just verify it exists in the committed map.
    if (!osdmap.crush->ruleset_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
7834
31f18b77 7835int OSDMonitor::get_crush_rule(const string &rule_name,
224ce89b
WB
7836 int *crush_rule,
7837 ostream *ss)
7c673cae
FG
7838{
7839 int ret;
31f18b77 7840 ret = osdmap.crush->get_rule_id(rule_name);
7c673cae
FG
7841 if (ret != -ENOENT) {
7842 // found it, use it
31f18b77 7843 *crush_rule = ret;
7c673cae
FG
7844 } else {
7845 CrushWrapper newcrush;
7846 _get_pending_crush(newcrush);
7847
31f18b77 7848 ret = newcrush.get_rule_id(rule_name);
7c673cae
FG
7849 if (ret != -ENOENT) {
7850 // found it, wait for it to be proposed
31f18b77 7851 dout(20) << __func__ << ": rule " << rule_name
7c673cae
FG
7852 << " try again" << dendl;
7853 return -EAGAIN;
7854 } else {
224ce89b 7855 // Cannot find it , return error
31f18b77 7856 *ss << "specified rule " << rule_name << " doesn't exist";
7c673cae
FG
7857 return ret;
7858 }
7859 }
7860 return 0;
7861}
7862
3efd9988
FG
7863int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, ostream *ss)
7864{
11fdf7f2 7865 auto max_pgs_per_osd = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
3efd9988
FG
7866 auto num_osds = std::max(osdmap.get_num_in_osds(), 3u); // assume min cluster size 3
7867 auto max_pgs = max_pgs_per_osd * num_osds;
7868 uint64_t projected = 0;
7869 if (pool < 0) {
7870 projected += pg_num * size;
7871 }
7872 for (const auto& i : osdmap.get_pools()) {
7873 if (i.first == pool) {
7874 projected += pg_num * size;
7875 } else {
11fdf7f2 7876 projected += i.second.get_pg_num_target() * i.second.get_size();
3efd9988
FG
7877 }
7878 }
7879 if (projected > max_pgs) {
7880 if (pool >= 0) {
7881 *ss << "pool id " << pool;
7882 }
7883 *ss << " pg_num " << pg_num << " size " << size
7884 << " would mean " << projected
7885 << " total pgs, which exceeds max " << max_pgs
7886 << " (mon_max_pg_per_osd " << max_pgs_per_osd
7887 << " * num_in_osds " << num_osds << ")";
7888 return -ERANGE;
7889 }
7890 return 0;
7891}
7892
7c673cae
FG
7893/**
7894 * @param name The name of the new pool
31f18b77
FG
7895 * @param crush_rule The crush rule to use. If <0, will use the system default
 * @param crush_rule_name The crush rule to use, if crush_rule <0
7c673cae
FG
7897 * @param pg_num The pg_num to use. If set to 0, will use the system default
7898 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
11fdf7f2 7899 * @param repl_size Replication factor, or 0 for default
7c673cae
FG
7900 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
7901 * @param pool_type TYPE_ERASURE, or TYPE_REP
7902 * @param expected_num_objects expected number of objects on the pool
7903 * @param fast_read fast read type.
7904 * @param ss human readable error message, if any.
7905 *
7906 * @return 0 on success, negative errno on failure.
7907 */
int OSDMonitor::prepare_new_pool(string& name,
				 int crush_rule,
				 const string &crush_rule_name,
				 unsigned pg_num, unsigned pgp_num,
				 unsigned pg_num_min,
				 const uint64_t repl_size,
				 const uint64_t target_size_bytes,
				 const float target_size_ratio,
				 const string &erasure_code_profile,
				 const unsigned pool_type,
				 const uint64_t expected_num_objects,
				 FastReadType fast_read,
				 const string& pg_autoscale_mode,
				 ostream *ss)
{
  // --- argument validation / defaulting ---
  if (name.length() == 0)
    return -EINVAL;
  if (pg_num == 0)
    pg_num = g_conf().get_val<uint64_t>("osd_pool_default_pg_num");
  if (pgp_num == 0)
    pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num");
  if (!pgp_num)
    // osd_pool_default_pgp_num may itself be 0: fall back to pg_num
    pgp_num = pg_num;
  if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
    *ss << "'pg_num' must be greater than 0 and less than or equal to "
        << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
        << " (you may adjust 'mon max pool pg num' for higher values)";
    return -ERANGE;
  }
  if (pgp_num > pg_num) {
    *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
        << ", which in this case is " << pg_num;
    return -ERANGE;
  }
  if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
    *ss << "'fast_read' can only apply to erasure coding pool";
    return -EINVAL;
  }

  // --- resolve (or create) the crush rule ---
  int r;
  r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
			      crush_rule_name, &crush_rule, ss);
  if (r) {
    dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
    return r;
  }
  if (g_conf()->mon_osd_crush_smoke_test) {
    // Exercise the rule on the pending crush map (in a forked child so a
    // crash in the tester cannot take down the monitor).
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    ostringstream err;
    CrushTester tester(newcrush, err);
    tester.set_min_x(0);
    tester.set_max_x(50);
    tester.set_rule(crush_rule);
    auto start = ceph::coarse_mono_clock::now();
    r = tester.test_with_fork(g_conf()->mon_lease);
    auto duration = ceph::coarse_mono_clock::now() - start;
    if (r < 0) {
      dout(10) << "tester.test_with_fork returns " << r
	       << ": " << err.str() << dendl;
      *ss << "crush test failed with " << r << ": " << err.str();
      return r;
    }
    dout(10) << __func__ << " crush smoke test duration: "
             << duration << dendl;
  }

  // --- size / pg-count / stripe-width sanity checks ---
  unsigned size, min_size;
  r = prepare_pool_size(pool_type, erasure_code_profile, repl_size,
			&size, &min_size, ss);
  if (r) {
    dout(10) << "prepare_pool_size returns " << r << dendl;
    return r;
  }
  // pool id -1: a new pool's PGs are counted on top of all existing pools
  r = check_pg_num(-1, pg_num, size, ss);
  if (r) {
    dout(10) << "check_pg_num returns " << r << dendl;
    return r;
  }

  if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) {
    return -EINVAL;
  }

  uint32_t stripe_width = 0;
  r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
  if (r) {
    dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
    return r;
  }

  // Resolve the effective fast_read setting (erasure pools only).
  bool fread = false;
  if (pool_type == pg_pool_t::TYPE_ERASURE) {
    switch (fast_read) {
      case FAST_READ_OFF:
	fread = false;
	break;
      case FAST_READ_ON:
	fread = true;
	break;
      case FAST_READ_DEFAULT:
	fread = g_conf()->osd_pool_default_ec_fast_read;
	break;
      default:
	*ss << "invalid fast_read setting: " << fast_read;
	return -EINVAL;
    }
  }

  // Idempotence: if this pool name is already staged in the pending
  // incremental, report success without creating a duplicate.
  for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
       p != pending_inc.new_pool_names.end();
       ++p) {
    if (p->second == name)
      return 0;
  }

  // --- allocate a pool id and fill in the new pg_pool_t ---
  if (-1 == pending_inc.new_pool_max)
    pending_inc.new_pool_max = osdmap.pool_max;
  int64_t pool = ++pending_inc.new_pool_max;
  pg_pool_t empty;
  pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
  pi->create_time = ceph_clock_now();
  pi->type = pool_type;
  pi->fast_read = fread;
  // default flags from config, plus the individually-toggled defaults
  pi->flags = g_conf()->osd_pool_default_flags;
  if (g_conf()->osd_pool_default_flag_hashpspool)
    pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
  if (g_conf()->osd_pool_default_flag_nodelete)
    pi->set_flag(pg_pool_t::FLAG_NODELETE);
  if (g_conf()->osd_pool_default_flag_nopgchange)
    pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
  if (g_conf()->osd_pool_default_flag_nosizechange)
    pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
  // mark the pool as still creating its initial PGs
  pi->set_flag(pg_pool_t::FLAG_CREATING);
  if (g_conf()->osd_pool_use_gmt_hitset)
    pi->use_gmt_hitset = true;
  else
    pi->use_gmt_hitset = false;

  pi->size = size;
  pi->min_size = min_size;
  pi->crush_rule = crush_rule;
  pi->expected_num_objects = expected_num_objects;
  pi->object_hash = CEPH_STR_HASH_RJENKINS;
  if (osdmap.stretch_mode_enabled) {
    // new pools inherit the cluster's stretch peering constraints
    pi->peering_crush_bucket_count = osdmap.stretch_bucket_count;
    pi->peering_crush_bucket_target = osdmap.stretch_bucket_count;
    pi->peering_crush_bucket_barrier = osdmap.stretch_mode_bucket;
    pi->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
    if (osdmap.degraded_stretch_mode) {
      pi->peering_crush_bucket_count = osdmap.degraded_stretch_mode;
      pi->peering_crush_bucket_target = osdmap.degraded_stretch_mode;
      // pi->peering_crush_bucket_mandatory_member = CRUSH_ITEM_NONE;
      // TODO: drat, we don't record this ^ anywhere, though given that it
      // necessarily won't exist elsewhere it likely doesn't matter
      pi->min_size = pi->min_size / 2;
      pi->size = pi->size / 2; // only support 2 zones now
    }
  }

  // autoscale mode: config default first, possibly overridden below by the
  // per-pool pg_autoscale_mode argument
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
        g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode"));
      m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  } else {
    pi->pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
  }
  // start with at most mon_osd_max_initial_pgs actual PGs; the autoscaler /
  // split machinery grows toward the pg_num target afterwards
  auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs");
  pi->set_pg_num(
    max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max))
    : pg_num);
  pi->set_pg_num_pending(pi->get_pg_num());
  pi->set_pg_num_target(pg_num);
  pi->set_pgp_num(pi->get_pg_num());
  pi->set_pgp_num_target(pgp_num);
  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      pg_num_min) {
    pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min));
  }
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
        pg_autoscale_mode); m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  }

  pi->last_change = pending_inc.epoch;
  pi->auid = 0;

  if (pool_type == pg_pool_t::TYPE_ERASURE) {
      pi->erasure_code_profile = erasure_code_profile;
  } else {
      pi->erasure_code_profile = "";
  }
  pi->stripe_width = stripe_width;

  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      target_size_bytes) {
    // only store for nautilus+ because TARGET_SIZE_BYTES may be
    // larger than int32_t max.
    pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes));
  }
  if (target_size_ratio > 0.0 &&
      osdmap.require_osd_release >= ceph_release_t::nautilus) {
    // only store for nautilus+, just to be consistent and tidy.
    pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio);
  }

  // cache-tier defaults (ratios stored in micro units)
  pi->cache_target_dirty_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000;
  pi->cache_target_dirty_high_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
  pi->cache_target_full_ratio_micro =
    g_conf()->osd_pool_default_cache_target_full_ratio * 1000000;
  pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
  pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;

  pending_inc.new_pool_names[pool] = name;
  return 0;
}
8124
8125bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
8126{
8127 op->mark_osdmon_event(__func__);
8128 ostringstream ss;
8129 if (pending_inc.new_flags < 0)
8130 pending_inc.new_flags = osdmap.get_flags();
8131 pending_inc.new_flags |= flag;
8132 ss << OSDMap::get_flag_string(flag) << " is set";
8133 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
8134 get_last_committed() + 1));
8135 return true;
8136}
8137
8138bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
8139{
8140 op->mark_osdmon_event(__func__);
8141 ostringstream ss;
8142 if (pending_inc.new_flags < 0)
8143 pending_inc.new_flags = osdmap.get_flags();
8144 pending_inc.new_flags &= ~flag;
8145 ss << OSDMap::get_flag_string(flag) << " is unset";
8146 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
8147 get_last_committed() + 1));
8148 return true;
8149}
8150
11fdf7f2 8151int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
7c673cae
FG
8152 stringstream& ss)
8153{
8154 string poolstr;
9f95a23c 8155 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
8156 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
8157 if (pool < 0) {
8158 ss << "unrecognized pool '" << poolstr << "'";
8159 return -ENOENT;
8160 }
8161 string var;
9f95a23c 8162 cmd_getval(cmdmap, "var", var);
7c673cae
FG
8163
8164 pg_pool_t p = *osdmap.get_pg_pool(pool);
8165 if (pending_inc.new_pools.count(pool))
8166 p = pending_inc.new_pools[pool];
8167
8168 // accept val as a json string in the normal case (current
8169 // generation monitor). parse out int or float values from the
8170 // string as needed. however, if it is not a string, try to pull
8171 // out an int, in case an older monitor with an older json schema is
8172 // forwarding a request.
8173 string val;
8174 string interr, floaterr;
8175 int64_t n = 0;
8176 double f = 0;
8177 int64_t uf = 0; // micro-f
9f95a23c 8178 cmd_getval(cmdmap, "val", val);
f64942e4 8179
9f95a23c
TL
8180 auto si_options = {
8181 "target_max_objects"
8182 };
8183 auto iec_options = {
8184 "target_max_bytes",
8185 "target_size_bytes",
8186 "compression_max_blob_size",
8187 "compression_min_blob_size",
8188 "csum_max_block",
8189 "csum_min_block",
8190 };
8191 if (count(begin(si_options), end(si_options), var)) {
92f5a8d4 8192 n = strict_si_cast<int64_t>(val.c_str(), &interr);
9f95a23c 8193 } else if (count(begin(iec_options), end(iec_options), var)) {
92f5a8d4
TL
8194 n = strict_iec_cast<int64_t>(val.c_str(), &interr);
8195 } else {
8196 // parse string as both int and float; different fields use different types.
8197 n = strict_strtoll(val.c_str(), 10, &interr);
8198 f = strict_strtod(val.c_str(), &floaterr);
8199 uf = llrintl(f * (double)1000000.0);
8200 }
7c673cae
FG
8201
8202 if (!p.is_tier() &&
8203 (var == "hit_set_type" || var == "hit_set_period" ||
8204 var == "hit_set_count" || var == "hit_set_fpp" ||
8205 var == "target_max_objects" || var == "target_max_bytes" ||
8206 var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
8207 var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
8208 var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
8209 var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
8210 var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
8211 return -EACCES;
8212 }
8213
8214 if (var == "size") {
8215 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
8216 ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
8217 return -EPERM;
8218 }
8219 if (p.type == pg_pool_t::TYPE_ERASURE) {
8220 ss << "can not change the size of an erasure-coded pool";
8221 return -ENOTSUP;
8222 }
8223 if (interr.length()) {
8224 ss << "error parsing integer value '" << val << "': " << interr;
8225 return -EINVAL;
8226 }
8227 if (n <= 0 || n > 10) {
8228 ss << "pool size must be between 1 and 10";
8229 return -EINVAL;
8230 }
f67539c2
TL
8231 if (n == 1) {
8232 if (!g_conf().get_val<bool>("mon_allow_pool_size_one")) {
8233 ss << "configuring pool size as 1 is disabled by default.";
8234 return -EPERM;
8235 }
8236 bool sure = false;
8237 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
8238 if (!sure) { ss << "WARNING: setting pool size 1 could lead to data loss "
8239 "without recovery. If you are *ABSOLUTELY CERTAIN* that is what you want, "
8240 "pass the flag --yes-i-really-mean-it.";
8241 return -EPERM;
8242 }
8243 }
eafe8130
TL
8244 if (!osdmap.crush->check_crush_rule(p.get_crush_rule(), p.type, n, ss)) {
8245 return -EINVAL;
8246 }
3efd9988
FG
8247 int r = check_pg_num(pool, p.get_pg_num(), n, &ss);
8248 if (r < 0) {
8249 return r;
8250 }
7c673cae 8251 p.size = n;
1911f103 8252 p.min_size = g_conf().get_osd_pool_default_min_size(p.size);
7c673cae
FG
8253 } else if (var == "min_size") {
8254 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
8255 ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
8256 return -EPERM;
8257 }
8258 if (interr.length()) {
8259 ss << "error parsing integer value '" << val << "': " << interr;
8260 return -EINVAL;
8261 }
8262
8263 if (p.type != pg_pool_t::TYPE_ERASURE) {
8264 if (n < 1 || n > p.size) {
494da23a 8265 ss << "pool min_size must be between 1 and size, which is set to " << (int)p.size;
7c673cae
FG
8266 return -EINVAL;
8267 }
8268 } else {
8269 ErasureCodeInterfaceRef erasure_code;
8270 int k;
8271 stringstream tmp;
8272 int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
8273 if (err == 0) {
8274 k = erasure_code->get_data_chunk_count();
8275 } else {
b32b8144 8276 ss << __func__ << " get_erasure_code failed: " << tmp.str();
7c673cae
FG
8277 return err;
8278 }
8279
8280 if (n < k || n > p.size) {
494da23a 8281 ss << "pool min_size must be between " << k << " and size, which is set to " << (int)p.size;
7c673cae
FG
8282 return -EINVAL;
8283 }
8284 }
8285 p.min_size = n;
11fdf7f2 8286 } else if (var == "pg_num_actual") {
7c673cae
FG
8287 if (interr.length()) {
8288 ss << "error parsing integer value '" << val << "': " << interr;
8289 return -EINVAL;
8290 }
11fdf7f2
TL
8291 if (n == (int)p.get_pg_num()) {
8292 return 0;
8293 }
8294 if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
8295 ss << "'pg_num' must be greater than 0 and less than or equal to "
8296 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
8297 << " (you may adjust 'mon max pool pg num' for higher values)";
8298 return -ERANGE;
8299 }
8300 if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
8301 ss << "cannot adjust pg_num while initial PGs are being created";
8302 return -EBUSY;
8303 }
8304 if (n > (int)p.get_pg_num()) {
8305 if (p.get_pg_num() != p.get_pg_num_pending()) {
8306 // force pre-nautilus clients to resend their ops, since they
8307 // don't understand pg_num_pending changes form a new interval
8308 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8309 }
8310 p.set_pg_num(n);
8311 } else {
9f95a23c 8312 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
11fdf7f2
TL
8313 ss << "nautilus OSDs are required to adjust pg_num_pending";
8314 return -EPERM;
8315 }
8316 if (n < (int)p.get_pgp_num()) {
8317 ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
8318 return -EINVAL;
8319 }
8320 if (n < (int)p.get_pg_num() - 1) {
8321 ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
8322 << ") - 1; only single pg decrease is currently supported";
8323 return -EINVAL;
8324 }
8325 p.set_pg_num_pending(n);
8326 // force pre-nautilus clients to resend their ops, since they
8327 // don't understand pg_num_pending changes form a new interval
8328 p.last_force_op_resend_prenautilus = pending_inc.epoch;
7c673cae 8329 }
11fdf7f2
TL
8330 // force pre-luminous clients to resend their ops, since they
8331 // don't understand that split PGs now form a new interval.
8332 p.last_force_op_resend_preluminous = pending_inc.epoch;
7c673cae
FG
8333 } else if (var == "pg_num") {
8334 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8335 ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8336 return -EPERM;
8337 }
8338 if (interr.length()) {
8339 ss << "error parsing integer value '" << val << "': " << interr;
8340 return -EINVAL;
8341 }
11fdf7f2 8342 if (n == (int)p.get_pg_num_target()) {
7c673cae
FG
8343 return 0;
8344 }
11fdf7f2
TL
8345 if (n <= 0 || static_cast<uint64_t>(n) >
8346 g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
c07f9fc5 8347 ss << "'pg_num' must be greater than 0 and less than or equal to "
11fdf7f2 8348 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
c07f9fc5
FG
8349 << " (you may adjust 'mon max pool pg num' for higher values)";
8350 return -ERANGE;
8351 }
11fdf7f2
TL
8352 if (n > (int)p.get_pg_num_target()) {
8353 int r = check_pg_num(pool, n, p.get_size(), &ss);
8354 if (r) {
8355 return r;
8356 }
8357 bool force = false;
9f95a23c 8358 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
11fdf7f2
TL
8359 if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
8360 ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
8361 return -EPERM;
8362 }
8363 } else {
9f95a23c 8364 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
11fdf7f2
TL
8365 ss << "nautilus OSDs are required to decrease pg_num";
8366 return -EPERM;
8367 }
7c673cae 8368 }
9f95a23c 8369 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
494da23a
TL
8370 // pre-nautilus osdmap format; increase pg_num directly
8371 assert(n > (int)p.get_pg_num());
8372 // force pre-nautilus clients to resend their ops, since they
8373 // don't understand pg_num_target changes form a new interval
8374 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8375 // force pre-luminous clients to resend their ops, since they
8376 // don't understand that split PGs now form a new interval.
8377 p.last_force_op_resend_preluminous = pending_inc.epoch;
8378 p.set_pg_num(n);
8379 } else {
8380 // set targets; mgr will adjust pg_num_actual and pgp_num later.
8381 // make pgp_num track pg_num if it already matches. if it is set
8382 // differently, leave it different and let the user control it
8383 // manually.
8384 if (p.get_pg_num_target() == p.get_pgp_num_target()) {
8385 p.set_pgp_num_target(n);
8386 }
8387 p.set_pg_num_target(n);
7c673cae 8388 }
11fdf7f2 8389 } else if (var == "pgp_num_actual") {
7c673cae
FG
8390 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8391 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8392 return -EPERM;
8393 }
8394 if (interr.length()) {
8395 ss << "error parsing integer value '" << val << "': " << interr;
8396 return -EINVAL;
8397 }
8398 if (n <= 0) {
8399 ss << "specified pgp_num must > 0, but you set to " << n;
8400 return -EINVAL;
8401 }
8402 if (n > (int)p.get_pg_num()) {
8403 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
8404 return -EINVAL;
8405 }
11fdf7f2
TL
8406 if (n > (int)p.get_pg_num_pending()) {
8407 ss << "specified pgp_num " << n
8408 << " > pg_num_pending " << p.get_pg_num_pending();
8409 return -EINVAL;
8410 }
7c673cae 8411 p.set_pgp_num(n);
11fdf7f2
TL
8412 } else if (var == "pgp_num") {
8413 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8414 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8415 return -EPERM;
8416 }
8417 if (interr.length()) {
8418 ss << "error parsing integer value '" << val << "': " << interr;
8419 return -EINVAL;
8420 }
8421 if (n <= 0) {
8422 ss << "specified pgp_num must > 0, but you set to " << n;
8423 return -EINVAL;
8424 }
8425 if (n > (int)p.get_pg_num_target()) {
8426 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target();
8427 return -EINVAL;
8428 }
9f95a23c 8429 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
494da23a
TL
8430 // pre-nautilus osdmap format; increase pgp_num directly
8431 p.set_pgp_num(n);
8432 } else {
8433 p.set_pgp_num_target(n);
8434 }
11fdf7f2 8435 } else if (var == "pg_autoscale_mode") {
9f95a23c
TL
8436 auto m = pg_pool_t::get_pg_autoscale_mode_by_name(val);
8437 if (m == pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
11fdf7f2
TL
8438 ss << "specified invalid mode " << val;
8439 return -EINVAL;
8440 }
9f95a23c 8441 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
494da23a
TL
8442 ss << "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
8443 return -EINVAL;
8444 }
9f95a23c 8445 p.pg_autoscale_mode = m;
7c673cae
FG
8446 } else if (var == "crush_rule") {
8447 int id = osdmap.crush->get_rule_id(val);
8448 if (id == -ENOENT) {
8449 ss << "crush rule " << val << " does not exist";
8450 return -ENOENT;
8451 }
8452 if (id < 0) {
8453 ss << cpp_strerror(id);
8454 return -ENOENT;
8455 }
8456 if (!osdmap.crush->check_crush_rule(id, p.get_type(), p.get_size(), ss)) {
8457 return -EINVAL;
8458 }
31f18b77 8459 p.crush_rule = id;
7c673cae
FG
8460 } else if (var == "nodelete" || var == "nopgchange" ||
8461 var == "nosizechange" || var == "write_fadvise_dontneed" ||
8462 var == "noscrub" || var == "nodeep-scrub") {
8463 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8464 // make sure we only compare against 'n' if we didn't receive a string
8465 if (val == "true" || (interr.empty() && n == 1)) {
8466 p.set_flag(flag);
8467 } else if (val == "false" || (interr.empty() && n == 0)) {
8468 p.unset_flag(flag);
8469 } else {
8470 ss << "expecting value 'true', 'false', '0', or '1'";
8471 return -EINVAL;
8472 }
8473 } else if (var == "hashpspool") {
8474 uint64_t flag = pg_pool_t::get_flag_by_name(var);
11fdf7f2 8475 bool force = false;
9f95a23c 8476 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
11fdf7f2
TL
8477
8478 if (!force) {
7c673cae
FG
8479 ss << "are you SURE? this will remap all placement groups in this pool,"
8480 " this triggers large data movement,"
8481 " pass --yes-i-really-mean-it if you really do.";
8482 return -EPERM;
8483 }
8484 // make sure we only compare against 'n' if we didn't receive a string
8485 if (val == "true" || (interr.empty() && n == 1)) {
8486 p.set_flag(flag);
8487 } else if (val == "false" || (interr.empty() && n == 0)) {
8488 p.unset_flag(flag);
8489 } else {
8490 ss << "expecting value 'true', 'false', '0', or '1'";
8491 return -EINVAL;
8492 }
8493 } else if (var == "hit_set_type") {
8494 if (val == "none")
8495 p.hit_set_params = HitSet::Params();
8496 else {
8497 int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
8498 if (err)
8499 return err;
8500 if (val == "bloom") {
8501 BloomHitSet::Params *bsp = new BloomHitSet::Params;
11fdf7f2 8502 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
7c673cae
FG
8503 p.hit_set_params = HitSet::Params(bsp);
8504 } else if (val == "explicit_hash")
8505 p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
8506 else if (val == "explicit_object")
8507 p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
8508 else {
8509 ss << "unrecognized hit_set type '" << val << "'";
8510 return -EINVAL;
8511 }
8512 }
8513 } else if (var == "hit_set_period") {
8514 if (interr.length()) {
8515 ss << "error parsing integer value '" << val << "': " << interr;
8516 return -EINVAL;
11fdf7f2
TL
8517 } else if (n < 0) {
8518 ss << "hit_set_period should be non-negative";
8519 return -EINVAL;
7c673cae
FG
8520 }
8521 p.hit_set_period = n;
8522 } else if (var == "hit_set_count") {
8523 if (interr.length()) {
8524 ss << "error parsing integer value '" << val << "': " << interr;
8525 return -EINVAL;
11fdf7f2
TL
8526 } else if (n < 0) {
8527 ss << "hit_set_count should be non-negative";
8528 return -EINVAL;
7c673cae
FG
8529 }
8530 p.hit_set_count = n;
8531 } else if (var == "hit_set_fpp") {
8532 if (floaterr.length()) {
8533 ss << "error parsing floating point value '" << val << "': " << floaterr;
8534 return -EINVAL;
11fdf7f2
TL
8535 } else if (f < 0 || f > 1.0) {
8536 ss << "hit_set_fpp should be in the range 0..1";
8537 return -EINVAL;
7c673cae
FG
8538 }
8539 if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
8540 ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
8541 return -EINVAL;
8542 }
8543 BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
8544 bloomp->set_fpp(f);
8545 } else if (var == "use_gmt_hitset") {
8546 if (val == "true" || (interr.empty() && n == 1)) {
7c673cae
FG
8547 p.use_gmt_hitset = true;
8548 } else {
8549 ss << "expecting value 'true' or '1'";
8550 return -EINVAL;
8551 }
8552 } else if (var == "allow_ec_overwrites") {
8553 if (!p.is_erasure()) {
8554 ss << "ec overwrites can only be enabled for an erasure coded pool";
8555 return -EINVAL;
8556 }
224ce89b 8557 stringstream err;
11fdf7f2 8558 if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites &&
224ce89b
WB
8559 !is_pool_currently_all_bluestore(pool, p, &err)) {
8560 ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
8561 return -EINVAL;
8562 }
7c673cae
FG
8563 if (val == "true" || (interr.empty() && n == 1)) {
8564 p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
8565 } else if (val == "false" || (interr.empty() && n == 0)) {
8566 ss << "ec overwrites cannot be disabled once enabled";
8567 return -EINVAL;
8568 } else {
8569 ss << "expecting value 'true', 'false', '0', or '1'";
8570 return -EINVAL;
8571 }
7c673cae
FG
8572 } else if (var == "target_max_objects") {
8573 if (interr.length()) {
8574 ss << "error parsing int '" << val << "': " << interr;
8575 return -EINVAL;
8576 }
8577 p.target_max_objects = n;
8578 } else if (var == "target_max_bytes") {
8579 if (interr.length()) {
8580 ss << "error parsing int '" << val << "': " << interr;
8581 return -EINVAL;
8582 }
8583 p.target_max_bytes = n;
8584 } else if (var == "cache_target_dirty_ratio") {
8585 if (floaterr.length()) {
8586 ss << "error parsing float '" << val << "': " << floaterr;
8587 return -EINVAL;
8588 }
8589 if (f < 0 || f > 1.0) {
8590 ss << "value must be in the range 0..1";
8591 return -ERANGE;
8592 }
8593 p.cache_target_dirty_ratio_micro = uf;
8594 } else if (var == "cache_target_dirty_high_ratio") {
8595 if (floaterr.length()) {
8596 ss << "error parsing float '" << val << "': " << floaterr;
8597 return -EINVAL;
8598 }
8599 if (f < 0 || f > 1.0) {
8600 ss << "value must be in the range 0..1";
8601 return -ERANGE;
8602 }
8603 p.cache_target_dirty_high_ratio_micro = uf;
8604 } else if (var == "cache_target_full_ratio") {
8605 if (floaterr.length()) {
8606 ss << "error parsing float '" << val << "': " << floaterr;
8607 return -EINVAL;
8608 }
8609 if (f < 0 || f > 1.0) {
8610 ss << "value must be in the range 0..1";
8611 return -ERANGE;
8612 }
8613 p.cache_target_full_ratio_micro = uf;
8614 } else if (var == "cache_min_flush_age") {
8615 if (interr.length()) {
8616 ss << "error parsing int '" << val << "': " << interr;
8617 return -EINVAL;
8618 }
8619 p.cache_min_flush_age = n;
8620 } else if (var == "cache_min_evict_age") {
8621 if (interr.length()) {
8622 ss << "error parsing int '" << val << "': " << interr;
8623 return -EINVAL;
8624 }
8625 p.cache_min_evict_age = n;
8626 } else if (var == "min_read_recency_for_promote") {
8627 if (interr.length()) {
8628 ss << "error parsing integer value '" << val << "': " << interr;
8629 return -EINVAL;
8630 }
8631 p.min_read_recency_for_promote = n;
8632 } else if (var == "hit_set_grade_decay_rate") {
8633 if (interr.length()) {
8634 ss << "error parsing integer value '" << val << "': " << interr;
8635 return -EINVAL;
8636 }
8637 if (n > 100 || n < 0) {
8638 ss << "value out of range,valid range is 0 - 100";
8639 return -EINVAL;
8640 }
8641 p.hit_set_grade_decay_rate = n;
8642 } else if (var == "hit_set_search_last_n") {
8643 if (interr.length()) {
8644 ss << "error parsing integer value '" << val << "': " << interr;
8645 return -EINVAL;
8646 }
8647 if (n > p.hit_set_count || n < 0) {
8648 ss << "value out of range,valid range is 0 - hit_set_count";
8649 return -EINVAL;
8650 }
8651 p.hit_set_search_last_n = n;
8652 } else if (var == "min_write_recency_for_promote") {
8653 if (interr.length()) {
8654 ss << "error parsing integer value '" << val << "': " << interr;
8655 return -EINVAL;
8656 }
8657 p.min_write_recency_for_promote = n;
8658 } else if (var == "fast_read") {
8659 if (p.is_replicated()) {
8660 ss << "fast read is not supported in replication pool";
8661 return -EINVAL;
8662 }
8663 if (val == "true" || (interr.empty() && n == 1)) {
8664 p.fast_read = true;
8665 } else if (val == "false" || (interr.empty() && n == 0)) {
8666 p.fast_read = false;
8667 } else {
8668 ss << "expecting value 'true', 'false', '0', or '1'";
8669 return -EINVAL;
8670 }
8671 } else if (pool_opts_t::is_opt_name(var)) {
224ce89b 8672 bool unset = val == "unset";
7c673cae 8673 if (var == "compression_mode") {
224ce89b
WB
8674 if (!unset) {
8675 auto cmode = Compressor::get_comp_mode_type(val);
8676 if (!cmode) {
8677 ss << "unrecognized compression mode '" << val << "'";
8678 return -EINVAL;
8679 }
7c673cae
FG
8680 }
8681 } else if (var == "compression_algorithm") {
224ce89b
WB
8682 if (!unset) {
8683 auto alg = Compressor::get_comp_alg_type(val);
8684 if (!alg) {
8685 ss << "unrecognized compression_algorithm '" << val << "'";
8686 return -EINVAL;
8687 }
7c673cae
FG
8688 }
8689 } else if (var == "compression_required_ratio") {
8690 if (floaterr.length()) {
8691 ss << "error parsing float value '" << val << "': " << floaterr;
8692 return -EINVAL;
8693 }
224ce89b 8694 if (f < 0 || f > 1) {
7c673cae 8695 ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
224ce89b 8696 return -EINVAL;
7c673cae
FG
8697 }
8698 } else if (var == "csum_type") {
224ce89b 8699 auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
7c673cae
FG
8700 if (t < 0 ) {
8701 ss << "unrecognized csum_type '" << val << "'";
224ce89b 8702 return -EINVAL;
7c673cae
FG
8703 }
8704 //preserve csum_type numeric value
8705 n = t;
8706 interr.clear();
8707 } else if (var == "compression_max_blob_size" ||
8708 var == "compression_min_blob_size" ||
8709 var == "csum_max_block" ||
8710 var == "csum_min_block") {
8711 if (interr.length()) {
8712 ss << "error parsing int value '" << val << "': " << interr;
8713 return -EINVAL;
8714 }
11fdf7f2
TL
8715 } else if (var == "fingerprint_algorithm") {
8716 if (!unset) {
8717 auto alg = pg_pool_t::get_fingerprint_from_str(val);
8718 if (!alg) {
8719 ss << "unrecognized fingerprint_algorithm '" << val << "'";
8720 return -EINVAL;
8721 }
8722 }
92f5a8d4
TL
8723 } else if (var == "target_size_bytes") {
8724 if (interr.length()) {
8725 ss << "error parsing unit value '" << val << "': " << interr;
8726 return -EINVAL;
8727 }
9f95a23c 8728 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
92f5a8d4
TL
8729 ss << "must set require_osd_release to nautilus or "
8730 << "later before setting target_size_bytes";
8731 return -EINVAL;
8732 }
11fdf7f2
TL
8733 } else if (var == "pg_num_min") {
8734 if (interr.length()) {
8735 ss << "error parsing int value '" << val << "': " << interr;
8736 return -EINVAL;
8737 }
8738 if (n > (int)p.get_pg_num_target()) {
8739 ss << "specified pg_num_min " << n
8740 << " > pg_num " << p.get_pg_num_target();
8741 return -EINVAL;
8742 }
8743 } else if (var == "recovery_priority") {
8744 if (interr.length()) {
8745 ss << "error parsing int value '" << val << "': " << interr;
8746 return -EINVAL;
8747 }
81eedcae
TL
8748 if (!g_conf()->debug_allow_any_pool_priority) {
8749 if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
8750 ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
8751 << " and " << OSD_POOL_PRIORITY_MAX;
8752 return -EINVAL;
8753 }
11fdf7f2
TL
8754 }
8755 } else if (var == "pg_autoscale_bias") {
8756 if (f < 0.0 || f > 1000.0) {
8757 ss << "pg_autoscale_bias must be between 0 and 1000";
8758 return -EINVAL;
8759 }
f67539c2
TL
8760 } else if (var == "dedup_tier") {
8761 if (interr.empty()) {
8762 ss << "expecting value 'pool name'";
8763 return -EINVAL;
8764 }
8765 // Current base tier in dedup does not support ec pool
8766 if (p.is_erasure()) {
8767 ss << "pool '" << poolstr
8768 << "' is an ec pool, which cannot be a base tier";
8769 return -ENOTSUP;
8770 }
8771 int64_t lowtierpool_id = osdmap.lookup_pg_pool_name(val);
8772 if (lowtierpool_id < 0) {
8773 ss << "unrecognized pool '" << val << "'";
8774 return -ENOENT;
8775 }
8776 const pg_pool_t *tp = osdmap.get_pg_pool(lowtierpool_id);
8777 ceph_assert(tp);
8778 n = lowtierpool_id;
8779 // The original input is string (pool name), but we convert it to int64_t.
8780 // So, clear interr
8781 interr.clear();
8782 } else if (var == "dedup_chunk_algorithm") {
8783 if (!unset) {
8784 auto alg = pg_pool_t::get_dedup_chunk_algorithm_from_str(val);
8785 if (!alg) {
8786 ss << "unrecognized fingerprint_algorithm '" << val << "'";
8787 return -EINVAL;
8788 }
8789 }
8790 } else if (var == "dedup_cdc_chunk_size") {
8791 if (interr.length()) {
8792 ss << "error parsing int value '" << val << "': " << interr;
8793 return -EINVAL;
8794 }
7c673cae
FG
8795 }
8796
8797 pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
8798 switch (desc.type) {
8799 case pool_opts_t::STR:
224ce89b 8800 if (unset) {
7c673cae
FG
8801 p.opts.unset(desc.key);
8802 } else {
8803 p.opts.set(desc.key, static_cast<std::string>(val));
8804 }
8805 break;
8806 case pool_opts_t::INT:
8807 if (interr.length()) {
8808 ss << "error parsing integer value '" << val << "': " << interr;
8809 return -EINVAL;
8810 }
8811 if (n == 0) {
8812 p.opts.unset(desc.key);
8813 } else {
11fdf7f2 8814 p.opts.set(desc.key, static_cast<int64_t>(n));
7c673cae
FG
8815 }
8816 break;
8817 case pool_opts_t::DOUBLE:
8818 if (floaterr.length()) {
8819 ss << "error parsing floating point value '" << val << "': " << floaterr;
8820 return -EINVAL;
8821 }
8822 if (f == 0) {
8823 p.opts.unset(desc.key);
8824 } else {
8825 p.opts.set(desc.key, static_cast<double>(f));
8826 }
8827 break;
8828 default:
11fdf7f2 8829 ceph_assert(!"unknown type");
7c673cae
FG
8830 }
8831 } else {
8832 ss << "unrecognized variable '" << var << "'";
8833 return -EINVAL;
8834 }
224ce89b
WB
8835 if (val != "unset") {
8836 ss << "set pool " << pool << " " << var << " to " << val;
8837 } else {
8838 ss << "unset pool " << pool << " " << var;
8839 }
7c673cae
FG
8840 p.last_change = pending_inc.epoch;
8841 pending_inc.new_pools[pool] = p;
8842 return 0;
8843}
8844
c07f9fc5 8845int OSDMonitor::prepare_command_pool_application(const string &prefix,
11fdf7f2 8846 const cmdmap_t& cmdmap,
c07f9fc5 8847 stringstream& ss)
11fdf7f2
TL
8848{
8849 return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
8850}
8851
8852int OSDMonitor::preprocess_command_pool_application(const string &prefix,
8853 const cmdmap_t& cmdmap,
8854 stringstream& ss,
8855 bool *modified)
8856{
8857 return _command_pool_application(prefix, cmdmap, ss, modified, false);
8858}
8859
8860
8861/**
8862 * Common logic for preprocess and prepare phases of pool application
8863 * tag commands. In preprocess mode we're only detecting invalid
8864 * commands, and determining whether it was a modification or a no-op.
8865 * In prepare mode we're actually updating the pending state.
8866 */
8867int OSDMonitor::_command_pool_application(const string &prefix,
8868 const cmdmap_t& cmdmap,
8869 stringstream& ss,
8870 bool *modified,
8871 bool preparing)
c07f9fc5
FG
8872{
8873 string pool_name;
9f95a23c 8874 cmd_getval(cmdmap, "pool", pool_name);
c07f9fc5
FG
8875 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
8876 if (pool < 0) {
8877 ss << "unrecognized pool '" << pool_name << "'";
8878 return -ENOENT;
8879 }
8880
8881 pg_pool_t p = *osdmap.get_pg_pool(pool);
11fdf7f2
TL
8882 if (preparing) {
8883 if (pending_inc.new_pools.count(pool)) {
8884 p = pending_inc.new_pools[pool];
8885 }
c07f9fc5
FG
8886 }
8887
8888 string app;
9f95a23c 8889 cmd_getval(cmdmap, "app", app);
c07f9fc5
FG
8890 bool app_exists = (p.application_metadata.count(app) > 0);
8891
11fdf7f2 8892 string key;
9f95a23c 8893 cmd_getval(cmdmap, "key", key);
11fdf7f2
TL
8894 if (key == "all") {
8895 ss << "key cannot be 'all'";
8896 return -EINVAL;
8897 }
8898
8899 string value;
9f95a23c 8900 cmd_getval(cmdmap, "value", value);
11fdf7f2
TL
8901 if (value == "all") {
8902 ss << "value cannot be 'all'";
8903 return -EINVAL;
8904 }
8905
c07f9fc5
FG
8906 if (boost::algorithm::ends_with(prefix, "enable")) {
8907 if (app.empty()) {
8908 ss << "application name must be provided";
8909 return -EINVAL;
8910 }
8911
8912 if (p.is_tier()) {
8913 ss << "application must be enabled on base tier";
8914 return -EINVAL;
8915 }
8916
11fdf7f2 8917 bool force = false;
9f95a23c 8918 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
c07f9fc5 8919
11fdf7f2 8920 if (!app_exists && !p.application_metadata.empty() && !force) {
c07f9fc5
FG
8921 ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
8922 << "application; pass --yes-i-really-mean-it to proceed anyway";
8923 return -EPERM;
8924 }
8925
8926 if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
8927 ss << "too many enabled applications on pool '" << pool_name << "'; "
8928 << "max " << MAX_POOL_APPLICATIONS;
8929 return -EINVAL;
8930 }
8931
8932 if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
8933 ss << "application name '" << app << "' too long; max length "
8934 << MAX_POOL_APPLICATION_LENGTH;
8935 return -EINVAL;
8936 }
8937
8938 if (!app_exists) {
8939 p.application_metadata[app] = {};
8940 }
8941 ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
8942
8943 } else if (boost::algorithm::ends_with(prefix, "disable")) {
11fdf7f2 8944 bool force = false;
9f95a23c 8945 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
c07f9fc5 8946
11fdf7f2 8947 if (!force) {
c07f9fc5
FG
8948 ss << "Are you SURE? Disabling an application within a pool might result "
8949 << "in loss of application functionality; pass "
8950 << "--yes-i-really-mean-it to proceed anyway";
8951 return -EPERM;
8952 }
8953
8954 if (!app_exists) {
8955 ss << "application '" << app << "' is not enabled on pool '" << pool_name
8956 << "'";
8957 return 0; // idempotent
8958 }
8959
8960 p.application_metadata.erase(app);
8961 ss << "disable application '" << app << "' on pool '" << pool_name << "'";
8962
8963 } else if (boost::algorithm::ends_with(prefix, "set")) {
8964 if (p.is_tier()) {
8965 ss << "application metadata must be set on base tier";
8966 return -EINVAL;
8967 }
8968
8969 if (!app_exists) {
8970 ss << "application '" << app << "' is not enabled on pool '" << pool_name
8971 << "'";
8972 return -ENOENT;
8973 }
8974
8975 string key;
9f95a23c 8976 cmd_getval(cmdmap, "key", key);
c07f9fc5
FG
8977
8978 if (key.empty()) {
8979 ss << "key must be provided";
8980 return -EINVAL;
8981 }
8982
8983 auto &app_keys = p.application_metadata[app];
8984 if (app_keys.count(key) == 0 &&
8985 app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
8986 ss << "too many keys set for application '" << app << "' on pool '"
8987 << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
8988 return -EINVAL;
8989 }
8990
8991 if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
8992 ss << "key '" << app << "' too long; max length "
8993 << MAX_POOL_APPLICATION_LENGTH;
8994 return -EINVAL;
8995 }
8996
8997 string value;
9f95a23c 8998 cmd_getval(cmdmap, "value", value);
c07f9fc5
FG
8999 if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
9000 ss << "value '" << value << "' too long; max length "
9001 << MAX_POOL_APPLICATION_LENGTH;
9002 return -EINVAL;
9003 }
9004
9005 p.application_metadata[app][key] = value;
9006 ss << "set application '" << app << "' key '" << key << "' to '"
9007 << value << "' on pool '" << pool_name << "'";
9008 } else if (boost::algorithm::ends_with(prefix, "rm")) {
9009 if (!app_exists) {
9010 ss << "application '" << app << "' is not enabled on pool '" << pool_name
9011 << "'";
9012 return -ENOENT;
9013 }
9014
9015 string key;
9f95a23c 9016 cmd_getval(cmdmap, "key", key);
c07f9fc5
FG
9017 auto it = p.application_metadata[app].find(key);
9018 if (it == p.application_metadata[app].end()) {
9019 ss << "application '" << app << "' on pool '" << pool_name
9020 << "' does not have key '" << key << "'";
9021 return 0; // idempotent
9022 }
9023
9024 p.application_metadata[app].erase(it);
9025 ss << "removed application '" << app << "' key '" << key << "' on pool '"
9026 << pool_name << "'";
9027 } else {
11fdf7f2
TL
9028 ceph_abort();
9029 }
9030
9031 if (preparing) {
9032 p.last_change = pending_inc.epoch;
9033 pending_inc.new_pools[pool] = p;
9034 }
9035
9036 // Because we fell through this far, we didn't hit no-op cases,
9037 // so pool was definitely modified
9038 if (modified != nullptr) {
9039 *modified = true;
c07f9fc5
FG
9040 }
9041
c07f9fc5
FG
9042 return 0;
9043}
9044
31f18b77
FG
9045int OSDMonitor::_prepare_command_osd_crush_remove(
9046 CrushWrapper &newcrush,
9047 int32_t id,
9048 int32_t ancestor,
9049 bool has_ancestor,
9050 bool unlink_only)
9051{
9052 int err = 0;
9053
9054 if (has_ancestor) {
11fdf7f2 9055 err = newcrush.remove_item_under(cct, id, ancestor,
31f18b77
FG
9056 unlink_only);
9057 } else {
11fdf7f2 9058 err = newcrush.remove_item(cct, id, unlink_only);
31f18b77
FG
9059 }
9060 return err;
9061}
9062
9063void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
9064{
9065 pending_inc.crush.clear();
f67539c2 9066 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
31f18b77
FG
9067}
9068
9069int OSDMonitor::prepare_command_osd_crush_remove(
9070 CrushWrapper &newcrush,
9071 int32_t id,
9072 int32_t ancestor,
9073 bool has_ancestor,
9074 bool unlink_only)
9075{
9076 int err = _prepare_command_osd_crush_remove(
9077 newcrush, id, ancestor,
9078 has_ancestor, unlink_only);
9079
9080 if (err < 0)
9081 return err;
9082
11fdf7f2 9083 ceph_assert(err == 0);
31f18b77
FG
9084 do_osd_crush_remove(newcrush);
9085
9086 return 0;
9087}
9088
9089int OSDMonitor::prepare_command_osd_remove(int32_t id)
9090{
9091 if (osdmap.is_up(id)) {
9092 return -EBUSY;
9093 }
9094
9095 pending_inc.new_state[id] = osdmap.get_state(id);
9096 pending_inc.new_uuid[id] = uuid_d();
9097 pending_metadata_rm.insert(id);
9098 pending_metadata.erase(id);
9099
9100 return 0;
9101}
9102
9103int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
9104{
11fdf7f2 9105 ceph_assert(existing_id);
31f18b77
FG
9106 *existing_id = -1;
9107
9108 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
9109 if (!osdmap.exists(i) &&
9110 pending_inc.new_up_client.count(i) == 0 &&
9111 (pending_inc.new_state.count(i) == 0 ||
9112 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
9113 *existing_id = i;
9114 return -1;
9115 }
9116 }
9117
9118 if (pending_inc.new_max_osd < 0) {
9119 return osdmap.get_max_osd();
9120 }
9121 return pending_inc.new_max_osd;
9122}
9123
9124void OSDMonitor::do_osd_create(
9125 const int32_t id,
9126 const uuid_d& uuid,
3a9019d9 9127 const string& device_class,
31f18b77
FG
9128 int32_t* new_id)
9129{
9130 dout(10) << __func__ << " uuid " << uuid << dendl;
11fdf7f2 9131 ceph_assert(new_id);
31f18b77
FG
9132
9133 // We presume validation has been performed prior to calling this
9134 // function. We assert with prejudice.
9135
9136 int32_t allocated_id = -1; // declare here so we can jump
9137 int32_t existing_id = -1;
9138 if (!uuid.is_zero()) {
9139 existing_id = osdmap.identify_osd(uuid);
9140 if (existing_id >= 0) {
11fdf7f2 9141 ceph_assert(id < 0 || id == existing_id);
31f18b77
FG
9142 *new_id = existing_id;
9143 goto out;
9144 } else if (id >= 0) {
9145 // uuid does not exist, and id has been provided, so just create
9146 // the new osd.id
9147 *new_id = id;
9148 goto out;
9149 }
9150 }
9151
9152 // allocate a new id
9153 allocated_id = _allocate_osd_id(&existing_id);
9154 dout(10) << __func__ << " allocated id " << allocated_id
9155 << " existing id " << existing_id << dendl;
9156 if (existing_id >= 0) {
11fdf7f2
TL
9157 ceph_assert(existing_id < osdmap.get_max_osd());
9158 ceph_assert(allocated_id < 0);
31f18b77 9159 *new_id = existing_id;
31f18b77 9160 } else if (allocated_id >= 0) {
11fdf7f2 9161 ceph_assert(existing_id < 0);
31f18b77
FG
9162 // raise max_osd
9163 if (pending_inc.new_max_osd < 0) {
9164 pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
9165 } else {
9166 ++pending_inc.new_max_osd;
9167 }
9168 *new_id = pending_inc.new_max_osd - 1;
11fdf7f2 9169 ceph_assert(*new_id == allocated_id);
31f18b77 9170 } else {
11fdf7f2 9171 ceph_abort_msg("unexpected condition");
31f18b77
FG
9172 }
9173
9174out:
3a9019d9
FG
9175 if (device_class.size()) {
9176 CrushWrapper newcrush;
9177 _get_pending_crush(newcrush);
9178 if (newcrush.get_max_devices() < *new_id + 1) {
9179 newcrush.set_max_devices(*new_id + 1);
9180 }
9181 string name = string("osd.") + stringify(*new_id);
9182 if (!newcrush.item_exists(*new_id)) {
9183 newcrush.set_item_name(*new_id, name);
9184 }
9185 ostringstream ss;
9186 int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
9187 if (r < 0) {
9188 derr << __func__ << " failed to set " << name << " device_class "
9189 << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
9190 << dendl;
9191 // non-fatal... this might be a replay and we want to be idempotent.
9192 } else {
9193 dout(20) << __func__ << " set " << name << " device_class " << device_class
9194 << dendl;
9195 pending_inc.crush.clear();
f67539c2 9196 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
3a9019d9
FG
9197 }
9198 } else {
9199 dout(20) << __func__ << " no device_class" << dendl;
9200 }
9201
31f18b77
FG
9202 dout(10) << __func__ << " using id " << *new_id << dendl;
9203 if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
9204 pending_inc.new_max_osd = *new_id + 1;
9205 }
9206
f67539c2
TL
9207 pending_inc.new_weight[*new_id] = CEPH_OSD_IN;
9208 // do not set EXISTS; OSDMap::set_weight, called by apply_incremental, will
9209 // set it for us. (ugh.)
9210 pending_inc.new_state[*new_id] |= CEPH_OSD_NEW;
31f18b77
FG
9211 if (!uuid.is_zero())
9212 pending_inc.new_uuid[*new_id] = uuid;
9213}
9214
9215int OSDMonitor::validate_osd_create(
9216 const int32_t id,
9217 const uuid_d& uuid,
9218 const bool check_osd_exists,
9219 int32_t* existing_id,
9220 stringstream& ss)
9221{
9222
9223 dout(10) << __func__ << " id " << id << " uuid " << uuid
9224 << " check_osd_exists " << check_osd_exists << dendl;
9225
11fdf7f2 9226 ceph_assert(existing_id);
31f18b77
FG
9227
9228 if (id < 0 && uuid.is_zero()) {
9229 // we have nothing to validate
9230 *existing_id = -1;
9231 return 0;
9232 } else if (uuid.is_zero()) {
9233 // we have an id but we will ignore it - because that's what
9234 // `osd create` does.
9235 return 0;
9236 }
9237
9238 /*
9239 * This function will be used to validate whether we are able to
9240 * create a new osd when the `uuid` is specified.
9241 *
9242 * It will be used by both `osd create` and `osd new`, as the checks
9243 * are basically the same when it pertains to osd id and uuid validation.
9244 * However, `osd create` presumes an `uuid` is optional, for legacy
9245 * reasons, while `osd new` requires the `uuid` to be provided. This
9246 * means that `osd create` will not be idempotent if an `uuid` is not
9247 * provided, but we will always guarantee the idempotency of `osd new`.
9248 */
9249
11fdf7f2 9250 ceph_assert(!uuid.is_zero());
31f18b77
FG
9251 if (pending_inc.identify_osd(uuid) >= 0) {
9252 // osd is about to exist
9253 return -EAGAIN;
9254 }
9255
9256 int32_t i = osdmap.identify_osd(uuid);
9257 if (i >= 0) {
9258 // osd already exists
9259 if (id >= 0 && i != id) {
9260 ss << "uuid " << uuid << " already in use for different id " << i;
9261 return -EEXIST;
9262 }
9263 // return a positive errno to distinguish between a blocking error
9264 // and an error we consider to not be a problem (i.e., this would be
9265 // an idempotent operation).
9266 *existing_id = i;
9267 return EEXIST;
9268 }
9269 // i < 0
9270 if (id >= 0) {
9271 if (pending_inc.new_state.count(id)) {
9272 // osd is about to exist
9273 return -EAGAIN;
9274 }
9275 // we may not care if an osd exists if we are recreating a previously
9276 // destroyed osd.
9277 if (check_osd_exists && osdmap.exists(id)) {
9278 ss << "id " << id << " already in use and does not match uuid "
9279 << uuid;
9280 return -EINVAL;
9281 }
9282 }
9283 return 0;
9284}
9285
9286int OSDMonitor::prepare_command_osd_create(
9287 const int32_t id,
9288 const uuid_d& uuid,
9289 int32_t* existing_id,
9290 stringstream& ss)
9291{
9292 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
11fdf7f2 9293 ceph_assert(existing_id);
b5b8bbf5
FG
9294 if (osdmap.is_destroyed(id)) {
9295 ss << "ceph osd create has been deprecated. Please use ceph osd new "
9296 "instead.";
9297 return -EINVAL;
9298 }
31f18b77
FG
9299
9300 if (uuid.is_zero()) {
9301 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
9302 }
9303
9304 return validate_osd_create(id, uuid, true, existing_id, ss);
9305}
9306
9307int OSDMonitor::prepare_command_osd_new(
9308 MonOpRequestRef op,
11fdf7f2 9309 const cmdmap_t& cmdmap,
3a9019d9 9310 const map<string,string>& params,
31f18b77
FG
9311 stringstream &ss,
9312 Formatter *f)
9313{
9314 uuid_d uuid;
9315 string uuidstr;
9316 int64_t id = -1;
9317
f67539c2 9318 ceph_assert(paxos.is_plugged());
31f18b77
FG
9319
9320 dout(10) << __func__ << " " << op << dendl;
9321
9322 /* validate command. abort now if something's wrong. */
9323
9324 /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
9325 *
9326 * If `id` is not specified, we will identify any existing osd based
9327 * on `uuid`. Operation will be idempotent iff secrets match.
9328 *
9329 * If `id` is specified, we will identify any existing osd based on
9330 * `uuid` and match against `id`. If they match, operation will be
9331 * idempotent iff secrets match.
9332 *
9333 * `-i secrets.json` will be optional. If supplied, will be used
9334 * to check for idempotency when `id` and `uuid` match.
9335 *
9336 * If `id` is not specified, and `uuid` does not exist, an id will
9337 * be found or allocated for the osd.
9338 *
9339 * If `id` is specified, and the osd has been previously marked
9340 * as destroyed, then the `id` will be reused.
9341 */
9f95a23c 9342 if (!cmd_getval(cmdmap, "uuid", uuidstr)) {
31f18b77
FG
9343 ss << "requires the OSD's UUID to be specified.";
9344 return -EINVAL;
9345 } else if (!uuid.parse(uuidstr.c_str())) {
9346 ss << "invalid UUID value '" << uuidstr << "'.";
9347 return -EINVAL;
9348 }
9349
9f95a23c 9350 if (cmd_getval(cmdmap, "id", id) &&
31f18b77
FG
9351 (id < 0)) {
9352 ss << "invalid OSD id; must be greater or equal than zero.";
9353 return -EINVAL;
9354 }
9355
9356 // are we running an `osd create`-like command, or recreating
9357 // a previously destroyed osd?
9358
9359 bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));
9360
9361 // we will care about `id` to assess whether osd is `destroyed`, or
9362 // to create a new osd.
9363 // we will need an `id` by the time we reach auth.
9364
9365 int32_t existing_id = -1;
9366 int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
9367 &existing_id, ss);
9368
9369 bool may_be_idempotent = false;
9370 if (err == EEXIST) {
9371 // this is idempotent from the osdmon's point-of-view
9372 may_be_idempotent = true;
11fdf7f2 9373 ceph_assert(existing_id >= 0);
31f18b77
FG
9374 id = existing_id;
9375 } else if (err < 0) {
9376 return err;
9377 }
9378
9379 if (!may_be_idempotent) {
9380 // idempotency is out of the window. We are either creating a new
9381 // osd or recreating a destroyed osd.
9382 //
9383 // We now need to figure out if we have an `id` (and if it's valid),
9384 // of find an `id` if we don't have one.
9385
9386 // NOTE: we need to consider the case where the `id` is specified for
9387 // `osd create`, and we must honor it. So this means checking if
9388 // the `id` is destroyed, and if so assume the destroy; otherwise,
9389 // check if it `exists` - in which case we complain about not being
9390 // `destroyed`. In the end, if nothing fails, we must allow the
9391 // creation, so that we are compatible with `create`.
9392 if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
9393 dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
9394 ss << "OSD " << id << " has not yet been destroyed";
9395 return -EINVAL;
9396 } else if (id < 0) {
9397 // find an `id`
9398 id = _allocate_osd_id(&existing_id);
9399 if (id < 0) {
11fdf7f2 9400 ceph_assert(existing_id >= 0);
31f18b77
FG
9401 id = existing_id;
9402 }
9403 dout(10) << __func__ << " found id " << id << " to use" << dendl;
9404 } else if (id >= 0 && osdmap.is_destroyed(id)) {
9405 dout(10) << __func__ << " recreating osd." << id << dendl;
9406 } else {
9407 dout(10) << __func__ << " creating new osd." << id << dendl;
9408 }
9409 } else {
11fdf7f2
TL
9410 ceph_assert(id >= 0);
9411 ceph_assert(osdmap.exists(id));
31f18b77
FG
9412 }
9413
9414 // we are now able to either create a brand new osd or reuse an existing
9415 // osd that has been previously destroyed.
9416
9417 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
9418
3a9019d9 9419 if (may_be_idempotent && params.empty()) {
31f18b77 9420 // nothing to do, really.
3a9019d9 9421 dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
11fdf7f2 9422 ceph_assert(id >= 0);
31f18b77
FG
9423 if (f) {
9424 f->open_object_section("created_osd");
9425 f->dump_int("osdid", id);
9426 f->close_section();
9427 } else {
9428 ss << id;
9429 }
9430 return EEXIST;
9431 }
9432
3a9019d9
FG
9433 string device_class;
9434 auto p = params.find("crush_device_class");
9435 if (p != params.end()) {
9436 device_class = p->second;
9437 dout(20) << __func__ << " device_class will be " << device_class << dendl;
9438 }
31f18b77
FG
9439 string cephx_secret, lockbox_secret, dmcrypt_key;
9440 bool has_lockbox = false;
3a9019d9
FG
9441 bool has_secrets = params.count("cephx_secret")
9442 || params.count("cephx_lockbox_secret")
9443 || params.count("dmcrypt_key");
31f18b77 9444
f67539c2 9445 KVMonitor *svc = nullptr;
31f18b77
FG
9446 AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;
9447
9448 if (has_secrets) {
3a9019d9 9449 if (params.count("cephx_secret") == 0) {
31f18b77
FG
9450 ss << "requires a cephx secret.";
9451 return -EINVAL;
9452 }
3a9019d9 9453 cephx_secret = params.at("cephx_secret");
31f18b77 9454
3a9019d9
FG
9455 bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
9456 bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);
31f18b77
FG
9457
9458 dout(10) << __func__ << " has lockbox " << has_lockbox_secret
9459 << " dmcrypt " << has_dmcrypt_key << dendl;
9460
9461 if (has_lockbox_secret && has_dmcrypt_key) {
9462 has_lockbox = true;
3a9019d9
FG
9463 lockbox_secret = params.at("cephx_lockbox_secret");
9464 dmcrypt_key = params.at("dmcrypt_key");
31f18b77
FG
9465 } else if (!has_lockbox_secret != !has_dmcrypt_key) {
9466 ss << "requires both a cephx lockbox secret and a dm-crypt key.";
9467 return -EINVAL;
9468 }
9469
9470 dout(10) << __func__ << " validate secrets using osd id " << id << dendl;
9471
f67539c2 9472 err = mon.authmon()->validate_osd_new(id, uuid,
31f18b77
FG
9473 cephx_secret,
9474 lockbox_secret,
9475 cephx_entity,
9476 lockbox_entity,
9477 ss);
9478 if (err < 0) {
9479 return err;
9480 } else if (may_be_idempotent && err != EEXIST) {
9481 // for this to be idempotent, `id` should already be >= 0; no need
9482 // to use validate_id.
11fdf7f2 9483 ceph_assert(id >= 0);
31f18b77
FG
9484 ss << "osd." << id << " exists but secrets do not match";
9485 return -EEXIST;
9486 }
9487
9488 if (has_lockbox) {
f67539c2 9489 svc = mon.kvmon();
31f18b77
FG
9490 err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
9491 if (err < 0) {
9492 return err;
9493 } else if (may_be_idempotent && err != EEXIST) {
11fdf7f2 9494 ceph_assert(id >= 0);
31f18b77
FG
9495 ss << "osd." << id << " exists but dm-crypt key does not match.";
9496 return -EEXIST;
9497 }
9498 }
9499 }
11fdf7f2
TL
9500 ceph_assert(!has_secrets || !cephx_secret.empty());
9501 ceph_assert(!has_lockbox || !lockbox_secret.empty());
31f18b77
FG
9502
9503 if (may_be_idempotent) {
9504 // we have nothing to do for either the osdmon or the authmon,
9505 // and we have no lockbox - so the config key service will not be
9506 // touched. This is therefore an idempotent operation, and we can
9507 // just return right away.
9508 dout(10) << __func__ << " idempotent -- no op." << dendl;
11fdf7f2 9509 ceph_assert(id >= 0);
31f18b77
FG
9510 if (f) {
9511 f->open_object_section("created_osd");
9512 f->dump_int("osdid", id);
9513 f->close_section();
9514 } else {
9515 ss << id;
9516 }
9517 return EEXIST;
9518 }
11fdf7f2 9519 ceph_assert(!may_be_idempotent);
31f18b77
FG
9520
9521 // perform updates.
9522 if (has_secrets) {
11fdf7f2
TL
9523 ceph_assert(!cephx_secret.empty());
9524 ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
31f18b77
FG
9525 (!lockbox_secret.empty() && !dmcrypt_key.empty()));
9526
f67539c2 9527 err = mon.authmon()->do_osd_new(cephx_entity,
31f18b77
FG
9528 lockbox_entity,
9529 has_lockbox);
11fdf7f2 9530 ceph_assert(0 == err);
31f18b77
FG
9531
9532 if (has_lockbox) {
11fdf7f2 9533 ceph_assert(nullptr != svc);
31f18b77
FG
9534 svc->do_osd_new(uuid, dmcrypt_key);
9535 }
9536 }
9537
9538 if (is_recreate_destroyed) {
11fdf7f2
TL
9539 ceph_assert(id >= 0);
9540 ceph_assert(osdmap.is_destroyed(id));
11fdf7f2
TL
9541 pending_inc.new_state[id] |= CEPH_OSD_DESTROYED;
9542 if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
9543 pending_inc.new_state[id] |= CEPH_OSD_NEW;
9544 }
c07f9fc5
FG
9545 if (osdmap.get_state(id) & CEPH_OSD_UP) {
9546 // due to http://tracker.ceph.com/issues/20751 some clusters may
9547 // have UP set for non-existent OSDs; make sure it is cleared
9548 // for a newly created osd.
9549 pending_inc.new_state[id] |= CEPH_OSD_UP;
9550 }
31f18b77
FG
9551 pending_inc.new_uuid[id] = uuid;
9552 } else {
11fdf7f2 9553 ceph_assert(id >= 0);
31f18b77 9554 int32_t new_id = -1;
3a9019d9 9555 do_osd_create(id, uuid, device_class, &new_id);
11fdf7f2
TL
9556 ceph_assert(new_id >= 0);
9557 ceph_assert(id == new_id);
31f18b77
FG
9558 }
9559
9560 if (f) {
9561 f->open_object_section("created_osd");
9562 f->dump_int("osdid", id);
9563 f->close_section();
9564 } else {
9565 ss << id;
9566 }
9567
9568 return 0;
9569}
9570
7c673cae
FG
9571bool OSDMonitor::prepare_command(MonOpRequestRef op)
9572{
9573 op->mark_osdmon_event(__func__);
9f95a23c 9574 auto m = op->get_req<MMonCommand>();
7c673cae 9575 stringstream ss;
11fdf7f2 9576 cmdmap_t cmdmap;
7c673cae
FG
9577 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
9578 string rs = ss.str();
f67539c2 9579 mon.reply_command(op, -EINVAL, rs, get_last_committed());
7c673cae
FG
9580 return true;
9581 }
9582
11fdf7f2 9583 MonSession *session = op->get_session();
7c673cae 9584 if (!session) {
11fdf7f2 9585 derr << __func__ << " no session" << dendl;
f67539c2 9586 mon.reply_command(op, -EACCES, "access denied", get_last_committed());
7c673cae
FG
9587 return true;
9588 }
9589
9590 return prepare_command_impl(op, cmdmap);
9591}
9592
9593static int parse_reweights(CephContext *cct,
11fdf7f2 9594 const cmdmap_t& cmdmap,
7c673cae
FG
9595 const OSDMap& osdmap,
9596 map<int32_t, uint32_t>* weights)
9597{
9598 string weights_str;
9f95a23c 9599 if (!cmd_getval(cmdmap, "weights", weights_str)) {
7c673cae
FG
9600 return -EINVAL;
9601 }
9602 std::replace(begin(weights_str), end(weights_str), '\'', '"');
9603 json_spirit::mValue json_value;
9604 if (!json_spirit::read(weights_str, json_value)) {
9605 return -EINVAL;
9606 }
9607 if (json_value.type() != json_spirit::obj_type) {
9608 return -EINVAL;
9609 }
9610 const auto obj = json_value.get_obj();
9611 try {
9612 for (auto& osd_weight : obj) {
9613 auto osd_id = std::stoi(osd_weight.first);
9614 if (!osdmap.exists(osd_id)) {
9615 return -ENOENT;
9616 }
9617 if (osd_weight.second.type() != json_spirit::str_type) {
9618 return -EINVAL;
9619 }
9620 auto weight = std::stoul(osd_weight.second.get_str());
9621 weights->insert({osd_id, weight});
9622 }
9623 } catch (const std::logic_error& e) {
9624 return -EINVAL;
9625 }
9626 return 0;
9627}
9628
31f18b77
FG
9629int OSDMonitor::prepare_command_osd_destroy(
9630 int32_t id,
9631 stringstream& ss)
9632{
f67539c2 9633 ceph_assert(paxos.is_plugged());
31f18b77
FG
9634
9635 // we check if the osd exists for the benefit of `osd purge`, which may
9636 // have previously removed the osd. If the osd does not exist, return
9637 // -ENOENT to convey this, and let the caller deal with it.
9638 //
9639 // we presume that all auth secrets and config keys were removed prior
9640 // to this command being called. if they exist by now, we also assume
9641 // they must have been created by some other command and do not pertain
9642 // to this non-existent osd.
9643 if (!osdmap.exists(id)) {
9644 dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
9645 return -ENOENT;
9646 }
9647
9648 uuid_d uuid = osdmap.get_uuid(id);
9649 dout(10) << __func__ << " destroying osd." << id
9650 << " uuid " << uuid << dendl;
9651
9652 // if it has been destroyed, we assume our work here is done.
9653 if (osdmap.is_destroyed(id)) {
9654 ss << "destroyed osd." << id;
9655 return 0;
9656 }
9657
9658 EntityName cephx_entity, lockbox_entity;
9659 bool idempotent_auth = false, idempotent_cks = false;
9660
f67539c2 9661 int err = mon.authmon()->validate_osd_destroy(id, uuid,
31f18b77
FG
9662 cephx_entity,
9663 lockbox_entity,
9664 ss);
9665 if (err < 0) {
9666 if (err == -ENOENT) {
9667 idempotent_auth = true;
31f18b77
FG
9668 } else {
9669 return err;
9670 }
9671 }
9672
f67539c2 9673 auto svc = mon.kvmon();
31f18b77
FG
9674 err = svc->validate_osd_destroy(id, uuid);
9675 if (err < 0) {
11fdf7f2 9676 ceph_assert(err == -ENOENT);
31f18b77
FG
9677 err = 0;
9678 idempotent_cks = true;
9679 }
9680
9681 if (!idempotent_auth) {
f67539c2 9682 err = mon.authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
11fdf7f2 9683 ceph_assert(0 == err);
31f18b77
FG
9684 }
9685
9686 if (!idempotent_cks) {
9687 svc->do_osd_destroy(id, uuid);
9688 }
9689
9690 pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
9691 pending_inc.new_uuid[id] = uuid_d();
9692
9693 // we can only propose_pending() once per service, otherwise we'll be
9694 // defying PaxosService and all laws of nature. Therefore, as we may
9695 // be used during 'osd purge', let's keep the caller responsible for
9696 // proposing.
11fdf7f2 9697 ceph_assert(err == 0);
31f18b77
FG
9698 return 0;
9699}
9700
9701int OSDMonitor::prepare_command_osd_purge(
9702 int32_t id,
9703 stringstream& ss)
9704{
f67539c2 9705 ceph_assert(paxos.is_plugged());
31f18b77
FG
9706 dout(10) << __func__ << " purging osd." << id << dendl;
9707
11fdf7f2 9708 ceph_assert(!osdmap.is_up(id));
31f18b77
FG
9709
9710 /*
9711 * This may look a bit weird, but this is what's going to happen:
9712 *
9713 * 1. we make sure that removing from crush works
9714 * 2. we call `prepare_command_osd_destroy()`. If it returns an
9715 * error, then we abort the whole operation, as no updates
9716 * have been made. However, we this function will have
9717 * side-effects, thus we need to make sure that all operations
9718 * performed henceforth will *always* succeed.
9719 * 3. we call `prepare_command_osd_remove()`. Although this
9720 * function can return an error, it currently only checks if the
9721 * osd is up - and we have made sure that it is not so, so there
9722 * is no conflict, and it is effectively an update.
9723 * 4. finally, we call `do_osd_crush_remove()`, which will perform
9724 * the crush update we delayed from before.
9725 */
9726
9727 CrushWrapper newcrush;
9728 _get_pending_crush(newcrush);
9729
9730 bool may_be_idempotent = false;
9731
9732 int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
9733 if (err == -ENOENT) {
9734 err = 0;
9735 may_be_idempotent = true;
9736 } else if (err < 0) {
9737 ss << "error removing osd." << id << " from crush";
9738 return err;
9739 }
9740
9741 // no point destroying the osd again if it has already been marked destroyed
9742 if (!osdmap.is_destroyed(id)) {
9743 err = prepare_command_osd_destroy(id, ss);
9744 if (err < 0) {
9745 if (err == -ENOENT) {
9746 err = 0;
9747 } else {
9748 return err;
9749 }
9750 } else {
9751 may_be_idempotent = false;
9752 }
9753 }
11fdf7f2 9754 ceph_assert(0 == err);
31f18b77
FG
9755
9756 if (may_be_idempotent && !osdmap.exists(id)) {
9757 dout(10) << __func__ << " osd." << id << " does not exist and "
9758 << "we are idempotent." << dendl;
9759 return -ENOENT;
9760 }
9761
9762 err = prepare_command_osd_remove(id);
9763 // we should not be busy, as we should have made sure this id is not up.
11fdf7f2 9764 ceph_assert(0 == err);
31f18b77
FG
9765
9766 do_osd_crush_remove(newcrush);
9767 return 0;
9768}
9769
7c673cae 9770bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
11fdf7f2 9771 const cmdmap_t& cmdmap)
7c673cae
FG
9772{
9773 op->mark_osdmon_event(__func__);
9f95a23c 9774 auto m = op->get_req<MMonCommand>();
7c673cae
FG
9775 bool ret = false;
9776 stringstream ss;
9777 string rs;
9778 bufferlist rdata;
9779 int err = 0;
9780
9781 string format;
9f95a23c 9782 cmd_getval(cmdmap, "format", format, string("plain"));
7c673cae
FG
9783 boost::scoped_ptr<Formatter> f(Formatter::create(format));
9784
9785 string prefix;
9f95a23c 9786 cmd_getval(cmdmap, "prefix", prefix);
7c673cae
FG
9787
9788 int64_t osdid;
11fdf7f2 9789 string osd_name;
b32b8144
FG
9790 bool osdid_present = false;
9791 if (prefix != "osd pg-temp" &&
9792 prefix != "osd pg-upmap" &&
9793 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
9f95a23c 9794 osdid_present = cmd_getval(cmdmap, "id", osdid);
b32b8144 9795 }
7c673cae
FG
9796 if (osdid_present) {
9797 ostringstream oss;
9798 oss << "osd." << osdid;
11fdf7f2 9799 osd_name = oss.str();
7c673cae
FG
9800 }
9801
9802 // Even if there's a pending state with changes that could affect
9803 // a command, considering that said state isn't yet committed, we
9804 // just don't care about those changes if the command currently being
9805 // handled acts as a no-op against the current committed state.
9806 // In a nutshell, we assume this command happens *before*.
9807 //
9808 // Let me make this clearer:
9809 //
9810 // - If we have only one client, and that client issues some
9811 // operation that would conflict with this operation but is
9812 // still on the pending state, then we would be sure that said
9813 // operation wouldn't have returned yet, so the client wouldn't
9814 // issue this operation (unless the client didn't wait for the
9815 // operation to finish, and that would be the client's own fault).
9816 //
9817 // - If we have more than one client, each client will observe
9818 // whatever is the state at the moment of the commit. So, if we
9819 // have two clients, one issuing an unlink and another issuing a
9820 // link, and if the link happens while the unlink is still on the
9821 // pending state, from the link's point-of-view this is a no-op.
9822 // If different clients are issuing conflicting operations and
9823 // they care about that, then the clients should make sure they
9824 // enforce some kind of concurrency mechanism -- from our
9825 // perspective that's what Douglas Adams would call an SEP.
9826 //
9827 // This should be used as a general guideline for most commands handled
9828 // in this function. Adapt as you see fit, but please bear in mind that
9829 // this is the expected behavior.
9830
9831
9832 if (prefix == "osd setcrushmap" ||
9833 (prefix == "osd crush set" && !osdid_present)) {
31f18b77
FG
9834 if (pending_inc.crush.length()) {
9835 dout(10) << __func__ << " waiting for pending crush update " << dendl;
9836 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9837 return true;
9838 }
7c673cae
FG
9839 dout(10) << "prepare_command setting new crush map" << dendl;
9840 bufferlist data(m->get_data());
9841 CrushWrapper crush;
9842 try {
11fdf7f2 9843 auto bl = data.cbegin();
7c673cae
FG
9844 crush.decode(bl);
9845 }
9846 catch (const std::exception &e) {
9847 err = -EINVAL;
9848 ss << "Failed to parse crushmap: " << e.what();
9849 goto reply;
9850 }
31f18b77
FG
9851
9852 int64_t prior_version = 0;
9f95a23c 9853 if (cmd_getval(cmdmap, "prior_version", prior_version)) {
31f18b77
FG
9854 if (prior_version == osdmap.get_crush_version() - 1) {
9855 // see if we are a resend of the last update. this is imperfect
9856 // (multiple racing updaters may not both get reliable success)
9857 // but we expect crush updaters (via this interface) to be rare-ish.
9858 bufferlist current, proposed;
f67539c2
TL
9859 osdmap.crush->encode(current, mon.get_quorum_con_features());
9860 crush.encode(proposed, mon.get_quorum_con_features());
31f18b77
FG
9861 if (current.contents_equal(proposed)) {
9862 dout(10) << __func__
9863 << " proposed matches current and version equals previous"
9864 << dendl;
9865 err = 0;
9866 ss << osdmap.get_crush_version();
9867 goto reply;
9868 }
9869 }
9870 if (prior_version != osdmap.get_crush_version()) {
9871 err = -EPERM;
9872 ss << "prior_version " << prior_version << " != crush version "
9873 << osdmap.get_crush_version();
9874 goto reply;
9875 }
9876 }
7c673cae 9877
3efd9988 9878 if (crush.has_legacy_rule_ids()) {
31f18b77
FG
9879 err = -EINVAL;
9880 ss << "crush maps with ruleset != ruleid are no longer allowed";
9881 goto reply;
9882 }
7c673cae
FG
9883 if (!validate_crush_against_features(&crush, ss)) {
9884 err = -EINVAL;
9885 goto reply;
9886 }
31f18b77 9887
3efd9988
FG
9888 err = osdmap.validate_crush_rules(&crush, &ss);
9889 if (err < 0) {
9890 goto reply;
7c673cae
FG
9891 }
9892
11fdf7f2 9893 if (g_conf()->mon_osd_crush_smoke_test) {
224ce89b
WB
9894 // sanity check: test some inputs to make sure this map isn't
9895 // totally broken
9896 dout(10) << " testing map" << dendl;
9897 stringstream ess;
9898 CrushTester tester(crush, ess);
b5b8bbf5 9899 tester.set_min_x(0);
224ce89b 9900 tester.set_max_x(50);
b5b8bbf5 9901 auto start = ceph::coarse_mono_clock::now();
11fdf7f2 9902 int r = tester.test_with_fork(g_conf()->mon_lease);
b5b8bbf5 9903 auto duration = ceph::coarse_mono_clock::now() - start;
224ce89b
WB
9904 if (r < 0) {
9905 dout(10) << " tester.test_with_fork returns " << r
9906 << ": " << ess.str() << dendl;
9907 ss << "crush smoke test failed with " << r << ": " << ess.str();
9908 err = r;
9909 goto reply;
9910 }
b5b8bbf5
FG
9911 dout(10) << __func__ << " crush somke test duration: "
9912 << duration << ", result: " << ess.str() << dendl;
7c673cae
FG
9913 }
9914
7c673cae 9915 pending_inc.crush = data;
31f18b77 9916 ss << osdmap.get_crush_version() + 1;
7c673cae
FG
9917 goto update;
9918
3efd9988
FG
9919 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
9920 CrushWrapper newcrush;
9921 _get_pending_crush(newcrush);
9922 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
9923 int bid = -1 - b;
9924 if (newcrush.bucket_exists(bid) &&
11fdf7f2 9925 newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
3efd9988
FG
9926 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
9927 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
9928 }
9929 }
9930 if (!validate_crush_against_features(&newcrush, ss)) {
9931 err = -EINVAL;
9932 goto reply;
9933 }
9934 pending_inc.crush.clear();
f67539c2 9935 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
3efd9988
FG
9936 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9937 get_last_committed() + 1));
9938 return true;
7c673cae 9939 } else if (prefix == "osd crush set-device-class") {
7c673cae 9940 string device_class;
9f95a23c 9941 if (!cmd_getval(cmdmap, "class", device_class)) {
7c673cae
FG
9942 err = -EINVAL; // no value!
9943 goto reply;
9944 }
9945
224ce89b
WB
9946 bool stop = false;
9947 vector<string> idvec;
9f95a23c 9948 cmd_getval(cmdmap, "ids", idvec);
7c673cae
FG
9949 CrushWrapper newcrush;
9950 _get_pending_crush(newcrush);
224ce89b
WB
9951 set<int> updated;
9952 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9953 set<int> osds;
9954 // wildcard?
9955 if (j == 0 &&
9956 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9957 osdmap.get_all_osds(osds);
9958 stop = true;
9959 } else {
9960 // try traditional single osd way
9961 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9962 if (osd < 0) {
9963 // ss has reason for failure
9964 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9965 err = -EINVAL;
9966 continue;
9967 }
9968 osds.insert(osd);
9969 }
7c673cae 9970
224ce89b
WB
9971 for (auto &osd : osds) {
9972 if (!osdmap.exists(osd)) {
9973 ss << "osd." << osd << " does not exist. ";
9974 continue;
9975 }
7c673cae 9976
224ce89b
WB
9977 ostringstream oss;
9978 oss << "osd." << osd;
9979 string name = oss.str();
7c673cae 9980
3a9019d9
FG
9981 if (newcrush.get_max_devices() < osd + 1) {
9982 newcrush.set_max_devices(osd + 1);
9983 }
224ce89b
WB
9984 string action;
9985 if (newcrush.item_exists(osd)) {
9986 action = "updating";
9987 } else {
9988 action = "creating";
9989 newcrush.set_item_name(osd, name);
9990 }
7c673cae 9991
224ce89b
WB
9992 dout(5) << action << " crush item id " << osd << " name '" << name
9993 << "' device_class '" << device_class << "'"
9994 << dendl;
9995 err = newcrush.update_device_class(osd, device_class, name, &ss);
9996 if (err < 0) {
9997 goto reply;
9998 }
9999 if (err == 0 && !_have_pending_crush()) {
10000 if (!stop) {
10001 // for single osd only, wildcard makes too much noise
10002 ss << "set-device-class item id " << osd << " name '" << name
11fdf7f2 10003 << "' device_class '" << device_class << "': no change. ";
224ce89b
WB
10004 }
10005 } else {
10006 updated.insert(osd);
10007 }
10008 }
7c673cae
FG
10009 }
10010
f67539c2
TL
10011 pending_inc.crush.clear();
10012 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10013 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
10014 getline(ss, rs);
10015 wait_for_finished_proposal(
10016 op,
10017 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
10018 return true;
c07f9fc5
FG
10019 } else if (prefix == "osd crush rm-device-class") {
10020 bool stop = false;
10021 vector<string> idvec;
9f95a23c 10022 cmd_getval(cmdmap, "ids", idvec);
c07f9fc5
FG
10023 CrushWrapper newcrush;
10024 _get_pending_crush(newcrush);
10025 set<int> updated;
10026
10027 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
10028 set<int> osds;
10029
10030 // wildcard?
10031 if (j == 0 &&
10032 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
10033 osdmap.get_all_osds(osds);
10034 stop = true;
10035 } else {
10036 // try traditional single osd way
10037 long osd = parse_osd_id(idvec[j].c_str(), &ss);
10038 if (osd < 0) {
10039 // ss has reason for failure
10040 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
10041 err = -EINVAL;
10042 goto reply;
10043 }
10044 osds.insert(osd);
10045 }
10046
10047 for (auto &osd : osds) {
10048 if (!osdmap.exists(osd)) {
10049 ss << "osd." << osd << " does not exist. ";
10050 continue;
10051 }
10052
10053 auto class_name = newcrush.get_item_class(osd);
c07f9fc5
FG
10054 if (!class_name) {
10055 ss << "osd." << osd << " belongs to no class, ";
10056 continue;
10057 }
10058 // note that we do not verify if class_is_in_use here
10059 // in case the device is misclassified and user wants
10060 // to overridely reset...
10061
11fdf7f2 10062 err = newcrush.remove_device_class(cct, osd, &ss);
c07f9fc5
FG
10063 if (err < 0) {
10064 // ss has reason for failure
10065 goto reply;
10066 }
10067 updated.insert(osd);
10068 }
10069 }
10070
f67539c2
TL
10071 pending_inc.crush.clear();
10072 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10073 ss << "done removing class of osd(s): " << updated;
10074 getline(ss, rs);
10075 wait_for_finished_proposal(
10076 op,
10077 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
10078 return true;
11fdf7f2
TL
10079 } else if (prefix == "osd crush class create") {
10080 string device_class;
9f95a23c 10081 if (!cmd_getval(cmdmap, "class", device_class)) {
11fdf7f2
TL
10082 err = -EINVAL; // no value!
10083 goto reply;
10084 }
9f95a23c 10085 if (osdmap.require_osd_release < ceph_release_t::luminous) {
11fdf7f2
TL
10086 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
10087 << "luminous' before using crush device classes";
10088 err = -EPERM;
10089 goto reply;
10090 }
10091 if (!_have_pending_crush() &&
10092 _get_stable_crush().class_exists(device_class)) {
10093 ss << "class '" << device_class << "' already exists";
10094 goto reply;
10095 }
10096 CrushWrapper newcrush;
10097 _get_pending_crush(newcrush);
10098 if (newcrush.class_exists(device_class)) {
10099 ss << "class '" << device_class << "' already exists";
10100 goto update;
10101 }
10102 int class_id = newcrush.get_or_create_class_id(device_class);
10103 pending_inc.crush.clear();
f67539c2 10104 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11fdf7f2
TL
10105 ss << "created class " << device_class << " with id " << class_id
10106 << " to crush map";
10107 goto update;
10108 } else if (prefix == "osd crush class rm") {
10109 string device_class;
9f95a23c 10110 if (!cmd_getval(cmdmap, "class", device_class)) {
11fdf7f2
TL
10111 err = -EINVAL; // no value!
10112 goto reply;
10113 }
9f95a23c 10114 if (osdmap.require_osd_release < ceph_release_t::luminous) {
11fdf7f2
TL
10115 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
10116 << "luminous' before using crush device classes";
10117 err = -EPERM;
10118 goto reply;
10119 }
10120
10121 if (!osdmap.crush->class_exists(device_class)) {
10122 err = 0;
10123 goto reply;
10124 }
10125
10126 CrushWrapper newcrush;
10127 _get_pending_crush(newcrush);
10128 if (!newcrush.class_exists(device_class)) {
10129 err = 0; // make command idempotent
10130 goto wait;
10131 }
10132 int class_id = newcrush.get_class_id(device_class);
10133 stringstream ts;
10134 if (newcrush.class_is_in_use(class_id, &ts)) {
10135 err = -EBUSY;
10136 ss << "class '" << device_class << "' " << ts.str();
10137 goto reply;
10138 }
10139
10140 // check if class is used by any erasure-code-profiles
10141 mempool::osdmap::map<string,map<string,string>> old_ec_profiles =
10142 osdmap.get_erasure_code_profiles();
10143 auto ec_profiles = pending_inc.get_erasure_code_profiles();
10144#ifdef HAVE_STDLIB_MAP_SPLICING
10145 ec_profiles.merge(old_ec_profiles);
10146#else
10147 ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)),
10148 make_move_iterator(end(old_ec_profiles)));
10149#endif
10150 list<string> referenced_by;
10151 for (auto &i: ec_profiles) {
10152 for (auto &j: i.second) {
10153 if ("crush-device-class" == j.first && device_class == j.second) {
10154 referenced_by.push_back(i.first);
10155 }
10156 }
10157 }
10158 if (!referenced_by.empty()) {
10159 err = -EBUSY;
10160 ss << "class '" << device_class
10161 << "' is still referenced by erasure-code-profile(s): " << referenced_by;
10162 goto reply;
10163 }
10164
10165 set<int> osds;
10166 newcrush.get_devices_by_class(device_class, &osds);
10167 for (auto& p: osds) {
10168 err = newcrush.remove_device_class(g_ceph_context, p, &ss);
10169 if (err < 0) {
10170 // ss has reason for failure
10171 goto reply;
10172 }
10173 }
10174
10175 if (osds.empty()) {
10176 // empty class, remove directly
10177 err = newcrush.remove_class_name(device_class);
10178 if (err < 0) {
10179 ss << "class '" << device_class << "' cannot be removed '"
10180 << cpp_strerror(err) << "'";
10181 goto reply;
10182 }
10183 }
10184
10185 pending_inc.crush.clear();
f67539c2 10186 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11fdf7f2
TL
10187 ss << "removed class " << device_class << " with id " << class_id
10188 << " from crush map";
10189 goto update;
35e4c445
FG
10190 } else if (prefix == "osd crush class rename") {
10191 string srcname, dstname;
9f95a23c 10192 if (!cmd_getval(cmdmap, "srcname", srcname)) {
35e4c445
FG
10193 err = -EINVAL;
10194 goto reply;
10195 }
9f95a23c 10196 if (!cmd_getval(cmdmap, "dstname", dstname)) {
35e4c445
FG
10197 err = -EINVAL;
10198 goto reply;
10199 }
10200
10201 CrushWrapper newcrush;
10202 _get_pending_crush(newcrush);
181888fb
FG
10203 if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
10204 // suppose this is a replay and return success
10205 // so command is idempotent
10206 ss << "already renamed to '" << dstname << "'";
10207 err = 0;
35e4c445
FG
10208 goto reply;
10209 }
c07f9fc5 10210
35e4c445
FG
10211 err = newcrush.rename_class(srcname, dstname);
10212 if (err < 0) {
10213 ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
10214 << cpp_strerror(err);
10215 goto reply;
10216 }
10217
10218 pending_inc.crush.clear();
f67539c2 10219 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
35e4c445
FG
10220 ss << "rename class '" << srcname << "' to '" << dstname << "'";
10221 goto update;
7c673cae
FG
10222 } else if (prefix == "osd crush add-bucket") {
 10223 // osd crush add-bucket <name> <type>
10224 string name, typestr;
11fdf7f2 10225 vector<string> argvec;
9f95a23c
TL
10226 cmd_getval(cmdmap, "name", name);
10227 cmd_getval(cmdmap, "type", typestr);
10228 cmd_getval(cmdmap, "args", argvec);
11fdf7f2
TL
10229 map<string,string> loc;
10230 if (!argvec.empty()) {
10231 CrushWrapper::parse_loc_map(argvec, &loc);
10232 dout(0) << "will create and move bucket '" << name
10233 << "' to location " << loc << dendl;
10234 }
7c673cae
FG
10235
10236 if (!_have_pending_crush() &&
10237 _get_stable_crush().name_exists(name)) {
10238 ss << "bucket '" << name << "' already exists";
10239 goto reply;
10240 }
10241
10242 CrushWrapper newcrush;
10243 _get_pending_crush(newcrush);
10244
10245 if (newcrush.name_exists(name)) {
10246 ss << "bucket '" << name << "' already exists";
10247 goto update;
10248 }
10249 int type = newcrush.get_type_id(typestr);
10250 if (type < 0) {
10251 ss << "type '" << typestr << "' does not exist";
10252 err = -EINVAL;
10253 goto reply;
10254 }
10255 if (type == 0) {
10256 ss << "type '" << typestr << "' is for devices, not buckets";
10257 err = -EINVAL;
10258 goto reply;
10259 }
10260 int bucketno;
10261 err = newcrush.add_bucket(0, 0,
10262 CRUSH_HASH_DEFAULT, type, 0, NULL,
10263 NULL, &bucketno);
10264 if (err < 0) {
10265 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
10266 goto reply;
10267 }
10268 err = newcrush.set_item_name(bucketno, name);
10269 if (err < 0) {
10270 ss << "error setting bucket name to '" << name << "'";
10271 goto reply;
10272 }
10273
11fdf7f2
TL
10274 if (!loc.empty()) {
10275 if (!newcrush.check_item_loc(cct, bucketno, loc,
10276 (int *)NULL)) {
10277 err = newcrush.move_bucket(cct, bucketno, loc);
10278 if (err < 0) {
10279 ss << "error moving bucket '" << name << "' to location " << loc;
10280 goto reply;
10281 }
10282 } else {
10283 ss << "no need to move item id " << bucketno << " name '" << name
10284 << "' to location " << loc << " in crush map";
10285 }
10286 }
10287
7c673cae 10288 pending_inc.crush.clear();
f67539c2 10289 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11fdf7f2
TL
10290 if (loc.empty()) {
10291 ss << "added bucket " << name << " type " << typestr
10292 << " to crush map";
10293 } else {
10294 ss << "added bucket " << name << " type " << typestr
10295 << " to location " << loc;
10296 }
7c673cae
FG
10297 goto update;
10298 } else if (prefix == "osd crush rename-bucket") {
10299 string srcname, dstname;
9f95a23c
TL
10300 cmd_getval(cmdmap, "srcname", srcname);
10301 cmd_getval(cmdmap, "dstname", dstname);
7c673cae
FG
10302
10303 err = crush_rename_bucket(srcname, dstname, &ss);
10304 if (err == -EALREADY) // equivalent to success for idempotency
10305 err = 0;
10306 if (err)
10307 goto reply;
10308 else
10309 goto update;
c07f9fc5
FG
10310 } else if (prefix == "osd crush weight-set create" ||
10311 prefix == "osd crush weight-set create-compat") {
10312 CrushWrapper newcrush;
10313 _get_pending_crush(newcrush);
10314 int64_t pool;
10315 int positions;
10316 if (newcrush.has_non_straw2_buckets()) {
10317 ss << "crush map contains one or more bucket(s) that are not straw2";
224ce89b
WB
10318 err = -EPERM;
10319 goto reply;
10320 }
c07f9fc5 10321 if (prefix == "osd crush weight-set create") {
9f95a23c
TL
10322 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
10323 osdmap.require_min_compat_client < ceph_release_t::luminous) {
c07f9fc5 10324 ss << "require_min_compat_client "
9f95a23c 10325 << osdmap.require_min_compat_client
c07f9fc5
FG
10326 << " < luminous, which is required for per-pool weight-sets. "
10327 << "Try 'ceph osd set-require-min-compat-client luminous' "
10328 << "before using the new interface";
10329 err = -EPERM;
10330 goto reply;
10331 }
10332 string poolname, mode;
9f95a23c 10333 cmd_getval(cmdmap, "pool", poolname);
c07f9fc5
FG
10334 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10335 if (pool < 0) {
10336 ss << "pool '" << poolname << "' not found";
10337 err = -ENOENT;
10338 goto reply;
10339 }
9f95a23c 10340 cmd_getval(cmdmap, "mode", mode);
c07f9fc5
FG
10341 if (mode != "flat" && mode != "positional") {
10342 ss << "unrecognized weight-set mode '" << mode << "'";
10343 err = -EINVAL;
10344 goto reply;
10345 }
10346 positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
10347 } else {
10348 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10349 positions = 1;
224ce89b 10350 }
11fdf7f2
TL
10351 if (!newcrush.create_choose_args(pool, positions)) {
10352 if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
10353 ss << "compat weight-set already created";
10354 } else {
10355 ss << "weight-set for pool '" << osdmap.get_pool_name(pool)
10356 << "' already created";
10357 }
10358 goto reply;
10359 }
c07f9fc5 10360 pending_inc.crush.clear();
f67539c2 10361 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
c07f9fc5 10362 goto update;
224ce89b 10363
c07f9fc5
FG
10364 } else if (prefix == "osd crush weight-set rm" ||
10365 prefix == "osd crush weight-set rm-compat") {
224ce89b
WB
10366 CrushWrapper newcrush;
10367 _get_pending_crush(newcrush);
c07f9fc5
FG
10368 int64_t pool;
10369 if (prefix == "osd crush weight-set rm") {
10370 string poolname;
9f95a23c 10371 cmd_getval(cmdmap, "pool", poolname);
c07f9fc5
FG
10372 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10373 if (pool < 0) {
10374 ss << "pool '" << poolname << "' not found";
10375 err = -ENOENT;
10376 goto reply;
10377 }
10378 } else {
10379 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
224ce89b 10380 }
c07f9fc5
FG
10381 newcrush.rm_choose_args(pool);
10382 pending_inc.crush.clear();
f67539c2 10383 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
c07f9fc5 10384 goto update;
224ce89b 10385
c07f9fc5
FG
10386 } else if (prefix == "osd crush weight-set reweight" ||
10387 prefix == "osd crush weight-set reweight-compat") {
10388 string poolname, item;
10389 vector<double> weight;
9f95a23c
TL
10390 cmd_getval(cmdmap, "pool", poolname);
10391 cmd_getval(cmdmap, "item", item);
10392 cmd_getval(cmdmap, "weight", weight);
c07f9fc5
FG
10393 CrushWrapper newcrush;
10394 _get_pending_crush(newcrush);
10395 int64_t pool;
10396 if (prefix == "osd crush weight-set reweight") {
10397 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10398 if (pool < 0) {
10399 ss << "pool '" << poolname << "' not found";
10400 err = -ENOENT;
10401 goto reply;
10402 }
10403 if (!newcrush.have_choose_args(pool)) {
10404 ss << "no weight-set for pool '" << poolname << "'";
10405 err = -ENOENT;
10406 goto reply;
10407 }
10408 auto arg_map = newcrush.choose_args_get(pool);
10409 int positions = newcrush.get_choose_args_positions(arg_map);
10410 if (weight.size() != (size_t)positions) {
10411 ss << "must specify exact " << positions << " weight values";
10412 err = -EINVAL;
10413 goto reply;
10414 }
10415 } else {
10416 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10417 if (!newcrush.have_choose_args(pool)) {
10418 ss << "no backward-compatible weight-set";
10419 err = -ENOENT;
10420 goto reply;
10421 }
224ce89b 10422 }
c07f9fc5
FG
10423 if (!newcrush.name_exists(item)) {
10424 ss << "item '" << item << "' does not exist";
10425 err = -ENOENT;
224ce89b
WB
10426 goto reply;
10427 }
c07f9fc5 10428 err = newcrush.choose_args_adjust_item_weightf(
11fdf7f2 10429 cct,
c07f9fc5
FG
10430 newcrush.choose_args_get(pool),
10431 newcrush.get_item_id(item),
10432 weight,
10433 &ss);
224ce89b 10434 if (err < 0) {
224ce89b
WB
10435 goto reply;
10436 }
c07f9fc5 10437 err = 0;
224ce89b 10438 pending_inc.crush.clear();
f67539c2 10439 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
224ce89b 10440 goto update;
7c673cae
FG
10441 } else if (osdid_present &&
10442 (prefix == "osd crush set" || prefix == "osd crush add")) {
10443 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
10444 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
10445 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
10446
10447 if (!osdmap.exists(osdid)) {
10448 err = -ENOENT;
11fdf7f2
TL
10449 ss << osd_name
10450 << " does not exist. Create it before updating the crush map";
7c673cae
FG
10451 goto reply;
10452 }
10453
10454 double weight;
9f95a23c 10455 if (!cmd_getval(cmdmap, "weight", weight)) {
7c673cae 10456 ss << "unable to parse weight value '"
11fdf7f2 10457 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
10458 err = -EINVAL;
10459 goto reply;
10460 }
10461
10462 string args;
10463 vector<string> argvec;
9f95a23c 10464 cmd_getval(cmdmap, "args", argvec);
7c673cae
FG
10465 map<string,string> loc;
10466 CrushWrapper::parse_loc_map(argvec, &loc);
10467
10468 if (prefix == "osd crush set"
10469 && !_get_stable_crush().item_exists(osdid)) {
10470 err = -ENOENT;
11fdf7f2 10471 ss << "unable to set item id " << osdid << " name '" << osd_name
7c673cae
FG
10472 << "' weight " << weight << " at location " << loc
10473 << ": does not exist";
10474 goto reply;
10475 }
10476
10477 dout(5) << "adding/updating crush item id " << osdid << " name '"
11fdf7f2 10478 << osd_name << "' weight " << weight << " at location "
7c673cae
FG
10479 << loc << dendl;
10480 CrushWrapper newcrush;
10481 _get_pending_crush(newcrush);
10482
10483 string action;
10484 if (prefix == "osd crush set" ||
11fdf7f2 10485 newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) {
7c673cae 10486 action = "set";
11fdf7f2 10487 err = newcrush.update_item(cct, osdid, weight, osd_name, loc);
7c673cae
FG
10488 } else {
10489 action = "add";
11fdf7f2 10490 err = newcrush.insert_item(cct, osdid, weight, osd_name, loc);
7c673cae
FG
10491 if (err == 0)
10492 err = 1;
10493 }
10494
10495 if (err < 0)
10496 goto reply;
10497
10498 if (err == 0 && !_have_pending_crush()) {
11fdf7f2
TL
10499 ss << action << " item id " << osdid << " name '" << osd_name
10500 << "' weight " << weight << " at location " << loc << ": no change";
7c673cae
FG
10501 goto reply;
10502 }
10503
10504 pending_inc.crush.clear();
f67539c2 10505 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11fdf7f2
TL
10506 ss << action << " item id " << osdid << " name '" << osd_name << "' weight "
10507 << weight << " at location " << loc << " to crush map";
7c673cae
FG
10508 getline(ss, rs);
10509 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10510 get_last_committed() + 1));
10511 return true;
10512
10513 } else if (prefix == "osd crush create-or-move") {
10514 do {
10515 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
10516 if (!osdmap.exists(osdid)) {
10517 err = -ENOENT;
11fdf7f2
TL
10518 ss << osd_name
10519 << " does not exist. create it before updating the crush map";
7c673cae
FG
10520 goto reply;
10521 }
10522
10523 double weight;
9f95a23c 10524 if (!cmd_getval(cmdmap, "weight", weight)) {
7c673cae 10525 ss << "unable to parse weight value '"
11fdf7f2 10526 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
10527 err = -EINVAL;
10528 goto reply;
10529 }
10530
10531 string args;
10532 vector<string> argvec;
9f95a23c 10533 cmd_getval(cmdmap, "args", argvec);
7c673cae
FG
10534 map<string,string> loc;
10535 CrushWrapper::parse_loc_map(argvec, &loc);
10536
11fdf7f2
TL
10537 dout(0) << "create-or-move crush item name '" << osd_name
10538 << "' initial_weight " << weight << " at location " << loc
10539 << dendl;
7c673cae
FG
10540
10541 CrushWrapper newcrush;
10542 _get_pending_crush(newcrush);
10543
11fdf7f2
TL
10544 err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc,
10545 g_conf()->osd_crush_update_weight_set);
7c673cae 10546 if (err == 0) {
11fdf7f2
TL
10547 ss << "create-or-move updated item name '" << osd_name
10548 << "' weight " << weight
7c673cae
FG
10549 << " at location " << loc << " to crush map";
10550 break;
10551 }
10552 if (err > 0) {
10553 pending_inc.crush.clear();
f67539c2 10554 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11fdf7f2
TL
10555 ss << "create-or-move updating item name '" << osd_name
10556 << "' weight " << weight
7c673cae
FG
10557 << " at location " << loc << " to crush map";
10558 getline(ss, rs);
10559 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10560 get_last_committed() + 1));
10561 return true;
10562 }
10563 } while (false);
10564
10565 } else if (prefix == "osd crush move") {
10566 do {
10567 // osd crush move <name> <loc1> [<loc2> ...]
11fdf7f2 10568 string name;
7c673cae 10569 vector<string> argvec;
9f95a23c
TL
10570 cmd_getval(cmdmap, "name", name);
10571 cmd_getval(cmdmap, "args", argvec);
7c673cae
FG
10572 map<string,string> loc;
10573 CrushWrapper::parse_loc_map(argvec, &loc);
10574
10575 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
10576 CrushWrapper newcrush;
10577 _get_pending_crush(newcrush);
10578
10579 if (!newcrush.name_exists(name)) {
10580 err = -ENOENT;
10581 ss << "item " << name << " does not exist";
10582 break;
10583 }
10584 int id = newcrush.get_item_id(name);
10585
11fdf7f2 10586 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
7c673cae 10587 if (id >= 0) {
11fdf7f2
TL
10588 err = newcrush.create_or_move_item(
10589 cct, id, 0, name, loc,
10590 g_conf()->osd_crush_update_weight_set);
7c673cae 10591 } else {
11fdf7f2 10592 err = newcrush.move_bucket(cct, id, loc);
7c673cae
FG
10593 }
10594 if (err >= 0) {
10595 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10596 pending_inc.crush.clear();
f67539c2 10597 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7c673cae
FG
10598 getline(ss, rs);
10599 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10600 get_last_committed() + 1));
10601 return true;
10602 }
10603 } else {
10604 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10605 err = 0;
10606 }
10607 } while (false);
31f18b77 10608 } else if (prefix == "osd crush swap-bucket") {
11fdf7f2 10609 string source, dest;
9f95a23c
TL
10610 cmd_getval(cmdmap, "source", source);
10611 cmd_getval(cmdmap, "dest", dest);
11fdf7f2
TL
10612
10613 bool force = false;
9f95a23c 10614 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
11fdf7f2 10615
31f18b77
FG
10616 CrushWrapper newcrush;
10617 _get_pending_crush(newcrush);
10618 if (!newcrush.name_exists(source)) {
10619 ss << "source item " << source << " does not exist";
10620 err = -ENOENT;
10621 goto reply;
10622 }
10623 if (!newcrush.name_exists(dest)) {
10624 ss << "dest item " << dest << " does not exist";
10625 err = -ENOENT;
10626 goto reply;
10627 }
10628 int sid = newcrush.get_item_id(source);
10629 int did = newcrush.get_item_id(dest);
10630 int sparent;
11fdf7f2 10631 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) {
31f18b77
FG
10632 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
10633 err = -EPERM;
10634 goto reply;
10635 }
10636 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
11fdf7f2 10637 !force) {
31f18b77
FG
10638 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
10639 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
10640 << "; pass --yes-i-really-mean-it to proceed anyway";
10641 err = -EPERM;
10642 goto reply;
10643 }
11fdf7f2 10644 int r = newcrush.swap_bucket(cct, sid, did);
31f18b77
FG
10645 if (r < 0) {
10646 ss << "failed to swap bucket contents: " << cpp_strerror(r);
224ce89b 10647 err = r;
31f18b77
FG
10648 goto reply;
10649 }
10650 ss << "swapped bucket of " << source << " to " << dest;
10651 pending_inc.crush.clear();
f67539c2 10652 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
31f18b77
FG
10653 wait_for_finished_proposal(op,
10654 new Monitor::C_Command(mon, op, err, ss.str(),
10655 get_last_committed() + 1));
10656 return true;
10657 } else if (prefix == "osd crush link") {
10658 // osd crush link <name> <loc1> [<loc2> ...]
10659 string name;
9f95a23c 10660 cmd_getval(cmdmap, "name", name);
31f18b77 10661 vector<string> argvec;
9f95a23c 10662 cmd_getval(cmdmap, "args", argvec);
31f18b77
FG
10663 map<string,string> loc;
10664 CrushWrapper::parse_loc_map(argvec, &loc);
10665
10666 // Need an explicit check for name_exists because get_item_id returns
10667 // 0 on unfound.
10668 int id = osdmap.crush->get_item_id(name);
7c673cae
FG
10669 if (!osdmap.crush->name_exists(name)) {
10670 err = -ENOENT;
10671 ss << "item " << name << " does not exist";
10672 goto reply;
10673 } else {
10674 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
10675 }
11fdf7f2 10676 if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) {
7c673cae
FG
10677 ss << "no need to move item id " << id << " name '" << name
10678 << "' to location " << loc << " in crush map";
10679 err = 0;
10680 goto reply;
10681 }
10682
10683 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
10684 CrushWrapper newcrush;
10685 _get_pending_crush(newcrush);
10686
10687 if (!newcrush.name_exists(name)) {
10688 err = -ENOENT;
10689 ss << "item " << name << " does not exist";
10690 goto reply;
10691 } else {
10692 int id = newcrush.get_item_id(name);
11fdf7f2
TL
10693 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
10694 err = newcrush.link_bucket(cct, id, loc);
7c673cae
FG
10695 if (err >= 0) {
10696 ss << "linked item id " << id << " name '" << name
10697 << "' to location " << loc << " in crush map";
10698 pending_inc.crush.clear();
f67539c2 10699 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7c673cae
FG
10700 } else {
10701 ss << "cannot link item id " << id << " name '" << name
10702 << "' to location " << loc;
10703 goto reply;
10704 }
10705 } else {
10706 ss << "no need to move item id " << id << " name '" << name
10707 << "' to location " << loc << " in crush map";
10708 err = 0;
10709 }
10710 }
10711 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
10712 get_last_committed() + 1));
10713 return true;
10714 } else if (prefix == "osd crush rm" ||
10715 prefix == "osd crush remove" ||
10716 prefix == "osd crush unlink") {
10717 do {
10718 // osd crush rm <id> [ancestor]
10719 CrushWrapper newcrush;
10720 _get_pending_crush(newcrush);
10721
10722 string name;
9f95a23c 10723 cmd_getval(cmdmap, "name", name);
7c673cae
FG
10724
10725 if (!osdmap.crush->name_exists(name)) {
10726 err = 0;
10727 ss << "device '" << name << "' does not appear in the crush map";
10728 break;
10729 }
10730 if (!newcrush.name_exists(name)) {
10731 err = 0;
10732 ss << "device '" << name << "' does not appear in the crush map";
10733 getline(ss, rs);
10734 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10735 get_last_committed() + 1));
10736 return true;
10737 }
10738 int id = newcrush.get_item_id(name);
31f18b77
FG
10739 int ancestor = 0;
10740
7c673cae
FG
10741 bool unlink_only = prefix == "osd crush unlink";
10742 string ancestor_str;
9f95a23c 10743 if (cmd_getval(cmdmap, "ancestor", ancestor_str)) {
7c673cae
FG
10744 if (!newcrush.name_exists(ancestor_str)) {
10745 err = -ENOENT;
10746 ss << "ancestor item '" << ancestor_str
10747 << "' does not appear in the crush map";
10748 break;
10749 }
31f18b77 10750 ancestor = newcrush.get_item_id(ancestor_str);
7c673cae 10751 }
31f18b77
FG
10752
10753 err = prepare_command_osd_crush_remove(
10754 newcrush,
10755 id, ancestor,
10756 (ancestor < 0), unlink_only);
10757
7c673cae
FG
10758 if (err == -ENOENT) {
10759 ss << "item " << id << " does not appear in that position";
10760 err = 0;
10761 break;
10762 }
10763 if (err == 0) {
81eedcae
TL
10764 if (!unlink_only)
10765 pending_inc.new_crush_node_flags[id] = 0;
7c673cae
FG
10766 ss << "removed item id " << id << " name '" << name << "' from crush map";
10767 getline(ss, rs);
10768 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10769 get_last_committed() + 1));
10770 return true;
10771 }
10772 } while (false);
10773
10774 } else if (prefix == "osd crush reweight-all") {
7c673cae
FG
10775 CrushWrapper newcrush;
10776 _get_pending_crush(newcrush);
10777
11fdf7f2 10778 newcrush.reweight(cct);
7c673cae 10779 pending_inc.crush.clear();
f67539c2 10780 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7c673cae
FG
10781 ss << "reweighted crush hierarchy";
10782 getline(ss, rs);
10783 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10784 get_last_committed() + 1));
10785 return true;
10786 } else if (prefix == "osd crush reweight") {
10787 // osd crush reweight <name> <weight>
10788 CrushWrapper newcrush;
10789 _get_pending_crush(newcrush);
10790
10791 string name;
9f95a23c 10792 cmd_getval(cmdmap, "name", name);
7c673cae
FG
10793 if (!newcrush.name_exists(name)) {
10794 err = -ENOENT;
10795 ss << "device '" << name << "' does not appear in the crush map";
10796 goto reply;
10797 }
10798
10799 int id = newcrush.get_item_id(name);
10800 if (id < 0) {
10801 ss << "device '" << name << "' is not a leaf in the crush map";
10802 err = -EINVAL;
10803 goto reply;
10804 }
10805 double w;
9f95a23c 10806 if (!cmd_getval(cmdmap, "weight", w)) {
7c673cae 10807 ss << "unable to parse weight value '"
11fdf7f2 10808 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
10809 err = -EINVAL;
10810 goto reply;
10811 }
10812
11fdf7f2
TL
10813 err = newcrush.adjust_item_weightf(cct, id, w,
10814 g_conf()->osd_crush_update_weight_set);
7c673cae
FG
10815 if (err < 0)
10816 goto reply;
10817 pending_inc.crush.clear();
f67539c2 10818 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7c673cae
FG
10819 ss << "reweighted item id " << id << " name '" << name << "' to " << w
10820 << " in crush map";
10821 getline(ss, rs);
10822 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10823 get_last_committed() + 1));
10824 return true;
10825 } else if (prefix == "osd crush reweight-subtree") {
 10826 // osd crush reweight-subtree <name> <weight>
10827 CrushWrapper newcrush;
10828 _get_pending_crush(newcrush);
10829
10830 string name;
9f95a23c 10831 cmd_getval(cmdmap, "name", name);
7c673cae
FG
10832 if (!newcrush.name_exists(name)) {
10833 err = -ENOENT;
10834 ss << "device '" << name << "' does not appear in the crush map";
10835 goto reply;
10836 }
10837
10838 int id = newcrush.get_item_id(name);
10839 if (id >= 0) {
10840 ss << "device '" << name << "' is not a subtree in the crush map";
10841 err = -EINVAL;
10842 goto reply;
10843 }
10844 double w;
9f95a23c 10845 if (!cmd_getval(cmdmap, "weight", w)) {
7c673cae 10846 ss << "unable to parse weight value '"
11fdf7f2 10847 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
10848 err = -EINVAL;
10849 goto reply;
10850 }
10851
11fdf7f2
TL
10852 err = newcrush.adjust_subtree_weightf(cct, id, w,
10853 g_conf()->osd_crush_update_weight_set);
7c673cae
FG
10854 if (err < 0)
10855 goto reply;
10856 pending_inc.crush.clear();
f67539c2 10857 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7c673cae
FG
10858 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
10859 << " in crush map";
10860 getline(ss, rs);
10861 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10862 get_last_committed() + 1));
10863 return true;
10864 } else if (prefix == "osd crush tunables") {
10865 CrushWrapper newcrush;
10866 _get_pending_crush(newcrush);
10867
10868 err = 0;
10869 string profile;
9f95a23c 10870 cmd_getval(cmdmap, "profile", profile);
7c673cae
FG
10871 if (profile == "legacy" || profile == "argonaut") {
10872 newcrush.set_tunables_legacy();
10873 } else if (profile == "bobtail") {
10874 newcrush.set_tunables_bobtail();
10875 } else if (profile == "firefly") {
10876 newcrush.set_tunables_firefly();
10877 } else if (profile == "hammer") {
10878 newcrush.set_tunables_hammer();
10879 } else if (profile == "jewel") {
10880 newcrush.set_tunables_jewel();
10881 } else if (profile == "optimal") {
10882 newcrush.set_tunables_optimal();
10883 } else if (profile == "default") {
10884 newcrush.set_tunables_default();
10885 } else {
10886 ss << "unrecognized profile '" << profile << "'";
10887 err = -EINVAL;
10888 goto reply;
10889 }
10890
10891 if (!validate_crush_against_features(&newcrush, ss)) {
10892 err = -EINVAL;
10893 goto reply;
10894 }
10895
10896 pending_inc.crush.clear();
f67539c2 10897 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7c673cae
FG
10898 ss << "adjusted tunables profile to " << profile;
10899 getline(ss, rs);
10900 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10901 get_last_committed() + 1));
10902 return true;
10903 } else if (prefix == "osd crush set-tunable") {
10904 CrushWrapper newcrush;
10905 _get_pending_crush(newcrush);
10906
10907 err = 0;
10908 string tunable;
9f95a23c 10909 cmd_getval(cmdmap, "tunable", tunable);
7c673cae
FG
10910
10911 int64_t value = -1;
9f95a23c 10912 if (!cmd_getval(cmdmap, "value", value)) {
7c673cae 10913 err = -EINVAL;
11fdf7f2
TL
10914 ss << "failed to parse integer value "
10915 << cmd_vartype_stringify(cmdmap.at("value"));
7c673cae
FG
10916 goto reply;
10917 }
10918
10919 if (tunable == "straw_calc_version") {
224ce89b 10920 if (value != 0 && value != 1) {
7c673cae
FG
10921 ss << "value must be 0 or 1; got " << value;
10922 err = -EINVAL;
10923 goto reply;
10924 }
10925 newcrush.set_straw_calc_version(value);
10926 } else {
10927 ss << "unrecognized tunable '" << tunable << "'";
10928 err = -EINVAL;
10929 goto reply;
10930 }
10931
10932 if (!validate_crush_against_features(&newcrush, ss)) {
10933 err = -EINVAL;
10934 goto reply;
10935 }
10936
10937 pending_inc.crush.clear();
f67539c2 10938 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7c673cae
FG
10939 ss << "adjusted tunable " << tunable << " to " << value;
10940 getline(ss, rs);
10941 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10942 get_last_committed() + 1));
10943 return true;
10944
10945 } else if (prefix == "osd crush rule create-simple") {
10946 string name, root, type, mode;
9f95a23c
TL
10947 cmd_getval(cmdmap, "name", name);
10948 cmd_getval(cmdmap, "root", root);
10949 cmd_getval(cmdmap, "type", type);
10950 cmd_getval(cmdmap, "mode", mode);
7c673cae
FG
10951 if (mode == "")
10952 mode = "firstn";
10953
10954 if (osdmap.crush->rule_exists(name)) {
31f18b77
FG
10955 // The name is uniquely associated to a ruleid and the rule it contains
 10956 // From the user point of view, the rule is more meaningful.
10957 ss << "rule " << name << " already exists";
7c673cae
FG
10958 err = 0;
10959 goto reply;
10960 }
10961
10962 CrushWrapper newcrush;
10963 _get_pending_crush(newcrush);
10964
10965 if (newcrush.rule_exists(name)) {
31f18b77
FG
10966 // The name is uniquely associated to a ruleid and the rule it contains
 10967 // From the user point of view, the rule is more meaningful.
10968 ss << "rule " << name << " already exists";
7c673cae
FG
10969 err = 0;
10970 } else {
224ce89b 10971 int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
7c673cae
FG
10972 pg_pool_t::TYPE_REPLICATED, &ss);
10973 if (ruleno < 0) {
10974 err = ruleno;
10975 goto reply;
10976 }
10977
10978 pending_inc.crush.clear();
f67539c2 10979 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7c673cae
FG
10980 }
10981 getline(ss, rs);
10982 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10983 get_last_committed() + 1));
10984 return true;
10985
224ce89b
WB
10986 } else if (prefix == "osd crush rule create-replicated") {
10987 string name, root, type, device_class;
9f95a23c
TL
10988 cmd_getval(cmdmap, "name", name);
10989 cmd_getval(cmdmap, "root", root);
10990 cmd_getval(cmdmap, "type", type);
10991 cmd_getval(cmdmap, "class", device_class);
224ce89b
WB
10992
10993 if (osdmap.crush->rule_exists(name)) {
10994 // The name is uniquely associated to a ruleid and the rule it contains
 10995 // From the user point of view, the rule is more meaningful.
10996 ss << "rule " << name << " already exists";
10997 err = 0;
10998 goto reply;
10999 }
11000
11001 CrushWrapper newcrush;
11002 _get_pending_crush(newcrush);
11003
11004 if (newcrush.rule_exists(name)) {
11005 // The name is uniquely associated to a ruleid and the rule it contains
 11006 // From the user point of view, the rule is more meaningful.
11007 ss << "rule " << name << " already exists";
11008 err = 0;
11009 } else {
11010 int ruleno = newcrush.add_simple_rule(
11011 name, root, type, device_class,
11012 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
11013 if (ruleno < 0) {
11014 err = ruleno;
11015 goto reply;
11016 }
11017
11018 pending_inc.crush.clear();
f67539c2 11019 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
224ce89b
WB
11020 }
11021 getline(ss, rs);
11022 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11023 get_last_committed() + 1));
11024 return true;
11025
7c673cae
FG
11026 } else if (prefix == "osd erasure-code-profile rm") {
11027 string name;
9f95a23c 11028 cmd_getval(cmdmap, "name", name);
7c673cae
FG
11029
11030 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
11031 goto wait;
11032
11033 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
11034 err = -EBUSY;
11035 goto reply;
11036 }
11037
11038 if (osdmap.has_erasure_code_profile(name) ||
11039 pending_inc.new_erasure_code_profiles.count(name)) {
11040 if (osdmap.has_erasure_code_profile(name)) {
11041 pending_inc.old_erasure_code_profiles.push_back(name);
11042 } else {
11043 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
11044 pending_inc.new_erasure_code_profiles.erase(name);
11045 }
11046
11047 getline(ss, rs);
11048 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11049 get_last_committed() + 1));
11050 return true;
11051 } else {
11052 ss << "erasure-code-profile " << name << " does not exist";
11053 err = 0;
11054 goto reply;
11055 }
11056
11057 } else if (prefix == "osd erasure-code-profile set") {
11058 string name;
9f95a23c 11059 cmd_getval(cmdmap, "name", name);
7c673cae 11060 vector<string> profile;
9f95a23c 11061 cmd_getval(cmdmap, "profile", profile);
11fdf7f2
TL
11062
11063 bool force = false;
9f95a23c 11064 cmd_getval(cmdmap, "force", force);
11fdf7f2 11065
7c673cae
FG
11066 map<string,string> profile_map;
11067 err = parse_erasure_code_profile(profile, &profile_map, &ss);
11068 if (err)
11069 goto reply;
adb31ebb
TL
11070 if (auto found = profile_map.find("crush-failure-domain");
11071 found != profile_map.end()) {
11072 const auto& failure_domain = found->second;
11073 int failure_domain_type = osdmap.crush->get_type_id(failure_domain);
11074 if (failure_domain_type < 0) {
11075 ss << "erasure-code-profile " << profile_map
11076 << " contains an invalid failure-domain " << std::quoted(failure_domain);
11077 err = -EINVAL;
11078 goto reply;
11079 }
11080 }
11081
7c673cae
FG
11082 if (profile_map.find("plugin") == profile_map.end()) {
11083 ss << "erasure-code-profile " << profile_map
11084 << " must contain a plugin entry" << std::endl;
11085 err = -EINVAL;
11086 goto reply;
11087 }
11088 string plugin = profile_map["plugin"];
11089
11090 if (pending_inc.has_erasure_code_profile(name)) {
11091 dout(20) << "erasure code profile " << name << " try again" << dendl;
11092 goto wait;
11093 } else {
7c673cae
FG
11094 err = normalize_profile(name, profile_map, force, &ss);
11095 if (err)
11096 goto reply;
11097
11098 if (osdmap.has_erasure_code_profile(name)) {
11099 ErasureCodeProfile existing_profile_map =
11100 osdmap.get_erasure_code_profile(name);
11101 err = normalize_profile(name, existing_profile_map, force, &ss);
11102 if (err)
11103 goto reply;
11104
11105 if (existing_profile_map == profile_map) {
11106 err = 0;
11107 goto reply;
11108 }
11109 if (!force) {
11110 err = -EPERM;
11111 ss << "will not override erasure code profile " << name
11112 << " because the existing profile "
11113 << existing_profile_map
11114 << " is different from the proposed profile "
11115 << profile_map;
11116 goto reply;
11117 }
11118 }
11119
11120 dout(20) << "erasure code profile set " << name << "="
11121 << profile_map << dendl;
11122 pending_inc.set_erasure_code_profile(name, profile_map);
11123 }
11124
11125 getline(ss, rs);
11126 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11127 get_last_committed() + 1));
11128 return true;
11129
11130 } else if (prefix == "osd crush rule create-erasure") {
11131 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
11132 if (err == -EAGAIN)
11133 goto wait;
11134 if (err)
11135 goto reply;
11136 string name, poolstr;
9f95a23c 11137 cmd_getval(cmdmap, "name", name);
7c673cae 11138 string profile;
9f95a23c 11139 cmd_getval(cmdmap, "profile", profile);
7c673cae
FG
11140 if (profile == "")
11141 profile = "default";
11142 if (profile == "default") {
11143 if (!osdmap.has_erasure_code_profile(profile)) {
11144 if (pending_inc.has_erasure_code_profile(profile)) {
11145 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
11146 goto wait;
11147 }
11148
11149 map<string,string> profile_map;
11fdf7f2 11150 err = osdmap.get_erasure_code_profile_default(cct,
7c673cae
FG
11151 profile_map,
11152 &ss);
11153 if (err)
11154 goto reply;
11155 err = normalize_profile(name, profile_map, true, &ss);
11156 if (err)
11157 goto reply;
11158 dout(20) << "erasure code profile set " << profile << "="
11159 << profile_map << dendl;
11160 pending_inc.set_erasure_code_profile(profile, profile_map);
11161 goto wait;
11162 }
11163 }
11164
31f18b77
FG
11165 int rule;
11166 err = crush_rule_create_erasure(name, profile, &rule, &ss);
7c673cae
FG
11167 if (err < 0) {
11168 switch(err) {
11169 case -EEXIST: // return immediately
11170 ss << "rule " << name << " already exists";
11171 err = 0;
11172 goto reply;
11173 break;
11174 case -EALREADY: // wait for pending to be proposed
11175 ss << "rule " << name << " already exists";
11176 err = 0;
11177 break;
11178 default: // non recoverable error
11179 goto reply;
11180 break;
11181 }
11182 } else {
31f18b77 11183 ss << "created rule " << name << " at " << rule;
7c673cae
FG
11184 }
11185
11186 getline(ss, rs);
11187 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11188 get_last_committed() + 1));
11189 return true;
11190
11191 } else if (prefix == "osd crush rule rm") {
11192 string name;
9f95a23c 11193 cmd_getval(cmdmap, "name", name);
7c673cae
FG
11194
11195 if (!osdmap.crush->rule_exists(name)) {
11196 ss << "rule " << name << " does not exist";
11197 err = 0;
11198 goto reply;
11199 }
11200
11201 CrushWrapper newcrush;
11202 _get_pending_crush(newcrush);
11203
11204 if (!newcrush.rule_exists(name)) {
11205 ss << "rule " << name << " does not exist";
11206 err = 0;
11207 } else {
11208 int ruleno = newcrush.get_rule_id(name);
11fdf7f2 11209 ceph_assert(ruleno >= 0);
7c673cae
FG
11210
11211 // make sure it is not in use.
11212 // FIXME: this is ok in some situations, but let's not bother with that
11213 // complexity now.
11214 int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
3efd9988 11215 if (osdmap.crush_rule_in_use(ruleset)) {
7c673cae
FG
11216 ss << "crush ruleset " << name << " " << ruleset << " is in use";
11217 err = -EBUSY;
11218 goto reply;
11219 }
11220
11221 err = newcrush.remove_rule(ruleno);
11222 if (err < 0) {
11223 goto reply;
11224 }
11225
11226 pending_inc.crush.clear();
f67539c2 11227 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7c673cae
FG
11228 }
11229 getline(ss, rs);
11230 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11231 get_last_committed() + 1));
11232 return true;
11233
b5b8bbf5
FG
11234 } else if (prefix == "osd crush rule rename") {
11235 string srcname;
11236 string dstname;
9f95a23c
TL
11237 cmd_getval(cmdmap, "srcname", srcname);
11238 cmd_getval(cmdmap, "dstname", dstname);
b5b8bbf5
FG
11239 if (srcname.empty() || dstname.empty()) {
11240 ss << "must specify both source rule name and destination rule name";
11241 err = -EINVAL;
11242 goto reply;
11243 }
11244 if (srcname == dstname) {
11245 ss << "destination rule name is equal to source rule name";
11246 err = 0;
11247 goto reply;
11248 }
11249
11250 CrushWrapper newcrush;
11251 _get_pending_crush(newcrush);
181888fb
FG
11252 if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
11253 // srcname does not exist and dstname already exists
11254 // suppose this is a replay and return success
11255 // (so this command is idempotent)
11256 ss << "already renamed to '" << dstname << "'";
11257 err = 0;
11258 goto reply;
11259 }
11260
b5b8bbf5
FG
11261 err = newcrush.rename_rule(srcname, dstname, &ss);
11262 if (err < 0) {
11263 // ss has reason for failure
11264 goto reply;
11265 }
11266 pending_inc.crush.clear();
f67539c2 11267 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
b5b8bbf5
FG
11268 getline(ss, rs);
11269 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11270 get_last_committed() + 1));
11271 return true;
11272
7c673cae
FG
11273 } else if (prefix == "osd setmaxosd") {
11274 int64_t newmax;
9f95a23c 11275 if (!cmd_getval(cmdmap, "newmax", newmax)) {
7c673cae 11276 ss << "unable to parse 'newmax' value '"
11fdf7f2 11277 << cmd_vartype_stringify(cmdmap.at("newmax")) << "'";
7c673cae
FG
11278 err = -EINVAL;
11279 goto reply;
11280 }
11281
11fdf7f2 11282 if (newmax > g_conf()->mon_max_osd) {
7c673cae
FG
11283 err = -ERANGE;
11284 ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
11fdf7f2 11285 << g_conf()->mon_max_osd << ")";
7c673cae
FG
11286 goto reply;
11287 }
11288
11289 // Don't allow shrinking OSD number as this will cause data loss
11290 // and may cause kernel crashes.
11291 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
11292 if (newmax < osdmap.get_max_osd()) {
11293 // Check if the OSDs exist between current max and new value.
11294 // If there are any OSDs exist, then don't allow shrinking number
11295 // of OSDs.
11296 for (int i = newmax; i < osdmap.get_max_osd(); i++) {
11297 if (osdmap.exists(i)) {
11298 err = -EBUSY;
11299 ss << "cannot shrink max_osd to " << newmax
11300 << " because osd." << i << " (and possibly others) still in use";
11301 goto reply;
11302 }
11303 }
11304 }
11305
11306 pending_inc.new_max_osd = newmax;
11307 ss << "set new max_osd = " << pending_inc.new_max_osd;
11308 getline(ss, rs);
11309 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11310 get_last_committed() + 1));
11311 return true;
11312
11313 } else if (prefix == "osd set-full-ratio" ||
11314 prefix == "osd set-backfillfull-ratio" ||
11315 prefix == "osd set-nearfull-ratio") {
7c673cae 11316 double n;
9f95a23c 11317 if (!cmd_getval(cmdmap, "ratio", n)) {
7c673cae 11318 ss << "unable to parse 'ratio' value '"
11fdf7f2 11319 << cmd_vartype_stringify(cmdmap.at("ratio")) << "'";
7c673cae
FG
11320 err = -EINVAL;
11321 goto reply;
11322 }
11323 if (prefix == "osd set-full-ratio")
11324 pending_inc.new_full_ratio = n;
11325 else if (prefix == "osd set-backfillfull-ratio")
11326 pending_inc.new_backfillfull_ratio = n;
11327 else if (prefix == "osd set-nearfull-ratio")
11328 pending_inc.new_nearfull_ratio = n;
11329 ss << prefix << " " << n;
11330 getline(ss, rs);
11331 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11332 get_last_committed() + 1));
11333 return true;
11334 } else if (prefix == "osd set-require-min-compat-client") {
7c673cae 11335 string v;
9f95a23c
TL
11336 cmd_getval(cmdmap, "version", v);
11337 ceph_release_t vno = ceph_release_from_name(v);
11338 if (!vno) {
7c673cae
FG
11339 ss << "version " << v << " is not recognized";
11340 err = -EINVAL;
11341 goto reply;
11342 }
11343 OSDMap newmap;
11344 newmap.deepish_copy_from(osdmap);
11345 newmap.apply_incremental(pending_inc);
31f18b77
FG
11346 newmap.require_min_compat_client = vno;
11347 auto mvno = newmap.get_min_compat_client();
11348 if (vno < mvno) {
9f95a23c
TL
11349 ss << "osdmap current utilizes features that require " << mvno
11350 << "; cannot set require_min_compat_client below that to " << vno;
7c673cae
FG
11351 err = -EPERM;
11352 goto reply;
11353 }
11fdf7f2 11354 bool sure = false;
9f95a23c 11355 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11fdf7f2 11356 if (!sure) {
31f18b77 11357 FeatureMap m;
f67539c2
TL
11358 mon.get_combined_feature_map(&m);
11359 uint64_t features = ceph_release_features(to_integer<int>(vno));
31f18b77
FG
11360 bool first = true;
11361 bool ok = true;
11362 for (int type : {
11363 CEPH_ENTITY_TYPE_CLIENT,
11364 CEPH_ENTITY_TYPE_MDS,
11365 CEPH_ENTITY_TYPE_MGR }) {
11366 auto p = m.m.find(type);
11367 if (p == m.m.end()) {
11368 continue;
11369 }
11370 for (auto& q : p->second) {
11371 uint64_t missing = ~q.first & features;
11372 if (missing) {
11373 if (first) {
11374 ss << "cannot set require_min_compat_client to " << v << ": ";
11375 } else {
11376 ss << "; ";
11377 }
11378 first = false;
11379 ss << q.second << " connected " << ceph_entity_type_name(type)
11380 << "(s) look like " << ceph_release_name(
11381 ceph_release_from_features(q.first))
11382 << " (missing 0x" << std::hex << missing << std::dec << ")";
11383 ok = false;
11384 }
11385 }
11386 }
11387 if (!ok) {
11388 ss << "; add --yes-i-really-mean-it to do it anyway";
11389 err = -EPERM;
11390 goto reply;
11391 }
11392 }
9f95a23c 11393 ss << "set require_min_compat_client to " << vno;
31f18b77 11394 pending_inc.new_require_min_compat_client = vno;
7c673cae
FG
11395 getline(ss, rs);
11396 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11397 get_last_committed() + 1));
11398 return true;
11399 } else if (prefix == "osd pause") {
11400 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11401
11402 } else if (prefix == "osd unpause") {
11403 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11404
11405 } else if (prefix == "osd set") {
11fdf7f2 11406 bool sure = false;
9f95a23c 11407 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11fdf7f2 11408
7c673cae 11409 string key;
9f95a23c
TL
11410 cmd_getval(cmdmap, "key", key);
11411 if (key == "pause")
7c673cae
FG
11412 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11413 else if (key == "noup")
11414 return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
11415 else if (key == "nodown")
11416 return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
11417 else if (key == "noout")
11418 return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
11419 else if (key == "noin")
11420 return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
11421 else if (key == "nobackfill")
11422 return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
11423 else if (key == "norebalance")
11424 return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
11425 else if (key == "norecover")
11426 return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
11427 else if (key == "noscrub")
11428 return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
11429 else if (key == "nodeep-scrub")
11430 return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11431 else if (key == "notieragent")
11432 return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11fdf7f2
TL
11433 else if (key == "nosnaptrim")
11434 return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
11435 else if (key == "pglog_hardlimit") {
11436 if (!osdmap.get_num_up_osds() && !sure) {
f64942e4
AA
11437 ss << "Not advisable to continue since no OSDs are up. Pass "
11438 << "--yes-i-really-mean-it if you really wish to continue.";
11439 err = -EPERM;
11440 goto reply;
11441 }
11442 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
11443 // we are reusing a jewel feature bit that was retired in luminous.
9f95a23c 11444 if (osdmap.require_osd_release >= ceph_release_t::luminous &&
f64942e4 11445 (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
11fdf7f2 11446 || sure)) {
f64942e4
AA
11447 return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
11448 } else {
11449 ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
11450 err = -EPERM;
11451 goto reply;
11452 }
7c673cae
FG
11453 } else {
11454 ss << "unrecognized flag '" << key << "'";
11455 err = -EINVAL;
11456 }
11457
11458 } else if (prefix == "osd unset") {
11459 string key;
9f95a23c
TL
11460 cmd_getval(cmdmap, "key", key);
11461 if (key == "pause")
7c673cae
FG
11462 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11463 else if (key == "noup")
11464 return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
11465 else if (key == "nodown")
11466 return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
11467 else if (key == "noout")
11468 return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
11469 else if (key == "noin")
11470 return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
11471 else if (key == "nobackfill")
11472 return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
11473 else if (key == "norebalance")
11474 return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
11475 else if (key == "norecover")
11476 return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
11477 else if (key == "noscrub")
11478 return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
11479 else if (key == "nodeep-scrub")
11480 return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11481 else if (key == "notieragent")
11482 return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11fdf7f2
TL
11483 else if (key == "nosnaptrim")
11484 return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
224ce89b 11485 else {
7c673cae
FG
11486 ss << "unrecognized flag '" << key << "'";
11487 err = -EINVAL;
11488 }
11489
31f18b77
FG
11490 } else if (prefix == "osd require-osd-release") {
11491 string release;
9f95a23c 11492 cmd_getval(cmdmap, "release", release);
11fdf7f2 11493 bool sure = false;
9f95a23c
TL
11494 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11495 ceph_release_t rel = ceph_release_from_name(release.c_str());
11496 if (!rel) {
31f18b77
FG
11497 ss << "unrecognized release " << release;
11498 err = -EINVAL;
11499 goto reply;
11500 }
d2e6a577
FG
11501 if (rel == osdmap.require_osd_release) {
11502 // idempotent
11503 err = 0;
11504 goto reply;
11505 }
9f95a23c 11506 ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);
11fdf7f2
TL
11507 if (!osdmap.get_num_up_osds() && !sure) {
11508 ss << "Not advisable to continue since no OSDs are up. Pass "
11509 << "--yes-i-really-mean-it if you really wish to continue.";
11510 err = -EPERM;
11511 goto reply;
11512 }
9f95a23c 11513 if (rel == ceph_release_t::mimic) {
f67539c2 11514 if (!mon.monmap->get_required_features().contains_all(
11fdf7f2
TL
11515 ceph::features::mon::FEATURE_MIMIC)) {
11516 ss << "not all mons are mimic";
11517 err = -EPERM;
11518 goto reply;
11519 }
11520 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_MIMIC))
11521 && !sure) {
11522 ss << "not all up OSDs have CEPH_FEATURE_SERVER_MIMIC feature";
11523 err = -EPERM;
11524 goto reply;
11525 }
9f95a23c 11526 } else if (rel == ceph_release_t::nautilus) {
f67539c2 11527 if (!mon.monmap->get_required_features().contains_all(
11fdf7f2
TL
11528 ceph::features::mon::FEATURE_NAUTILUS)) {
11529 ss << "not all mons are nautilus";
11530 err = -EPERM;
11531 goto reply;
11532 }
11533 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_NAUTILUS))
11534 && !sure) {
11535 ss << "not all up OSDs have CEPH_FEATURE_SERVER_NAUTILUS feature";
31f18b77
FG
11536 err = -EPERM;
11537 goto reply;
11538 }
9f95a23c 11539 } else if (rel == ceph_release_t::octopus) {
f67539c2 11540 if (!mon.monmap->get_required_features().contains_all(
9f95a23c
TL
11541 ceph::features::mon::FEATURE_OCTOPUS)) {
11542 ss << "not all mons are octopus";
11543 err = -EPERM;
11544 goto reply;
11545 }
11546 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_OCTOPUS))
11547 && !sure) {
11548 ss << "not all up OSDs have CEPH_FEATURE_SERVER_OCTOPUS feature";
11549 err = -EPERM;
11550 goto reply;
11551 }
f67539c2
TL
11552 } else if (rel == ceph_release_t::pacific) {
11553 if (!mon.monmap->get_required_features().contains_all(
11554 ceph::features::mon::FEATURE_PACIFIC)) {
11555 ss << "not all mons are pacific";
11556 err = -EPERM;
11557 goto reply;
11558 }
11559 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_PACIFIC))
11560 && !sure) {
11561 ss << "not all up OSDs have CEPH_FEATURE_SERVER_PACIFIC feature";
11562 err = -EPERM;
11563 goto reply;
11564 }
31f18b77
FG
11565 } else {
11566 ss << "not supported for this release yet";
11567 err = -EPERM;
11568 goto reply;
11569 }
11570 if (rel < osdmap.require_osd_release) {
11571 ss << "require_osd_release cannot be lowered once it has been set";
11572 err = -EPERM;
11573 goto reply;
11574 }
11575 pending_inc.new_require_osd_release = rel;
11576 goto update;
7c673cae 11577 } else if (prefix == "osd down" ||
9f95a23c
TL
11578 prefix == "osd out" ||
11579 prefix == "osd in" ||
11580 prefix == "osd rm" ||
11581 prefix == "osd stop") {
7c673cae
FG
11582
11583 bool any = false;
31f18b77
FG
11584 bool stop = false;
11585 bool verbose = true;
9f95a23c 11586 bool definitely_dead = false;
7c673cae
FG
11587
11588 vector<string> idvec;
9f95a23c
TL
11589 cmd_getval(cmdmap, "ids", idvec);
11590 cmd_getval(cmdmap, "definitely_dead", definitely_dead);
11591 derr << "definitely_dead " << (int)definitely_dead << dendl;
31f18b77
FG
11592 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
11593 set<int> osds;
11594
11595 // wildcard?
11596 if (j == 0 &&
11597 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
11598 if (prefix == "osd in") {
11599 // touch out osds only
81eedcae 11600 osdmap.get_out_existing_osds(osds);
31f18b77
FG
11601 } else {
11602 osdmap.get_all_osds(osds);
11603 }
11604 stop = true;
11605 verbose = false; // so the output is less noisy.
11606 } else {
11607 long osd = parse_osd_id(idvec[j].c_str(), &ss);
11608 if (osd < 0) {
11609 ss << "invalid osd id" << osd;
11610 err = -EINVAL;
11611 continue;
11612 } else if (!osdmap.exists(osd)) {
11613 ss << "osd." << osd << " does not exist. ";
11614 continue;
11615 }
11616
11617 osds.insert(osd);
7c673cae 11618 }
31f18b77
FG
11619
11620 for (auto &osd : osds) {
11621 if (prefix == "osd down") {
11622 if (osdmap.is_down(osd)) {
11623 if (verbose)
11624 ss << "osd." << osd << " is already down. ";
11625 } else {
11626 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
11627 ss << "marked down osd." << osd << ". ";
11628 any = true;
11629 }
9f95a23c
TL
11630 if (definitely_dead) {
11631 if (!pending_inc.new_xinfo.count(osd)) {
11632 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11633 }
11634 if (pending_inc.new_xinfo[osd].dead_epoch < pending_inc.epoch) {
11635 any = true;
11636 }
11637 pending_inc.new_xinfo[osd].dead_epoch = pending_inc.epoch;
11638 }
31f18b77
FG
11639 } else if (prefix == "osd out") {
11640 if (osdmap.is_out(osd)) {
11641 if (verbose)
11642 ss << "osd." << osd << " is already out. ";
11643 } else {
11644 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
11645 if (osdmap.osd_weight[osd]) {
11646 if (pending_inc.new_xinfo.count(osd) == 0) {
11647 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11648 }
11649 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
7c673cae 11650 }
31f18b77 11651 ss << "marked out osd." << osd << ". ";
224ce89b
WB
11652 std::ostringstream msg;
11653 msg << "Client " << op->get_session()->entity_name
11654 << " marked osd." << osd << " out";
11655 if (osdmap.is_up(osd)) {
11656 msg << ", while it was still marked up";
11657 } else {
3efd9988
FG
11658 auto period = ceph_clock_now() - down_pending_out[osd];
11659 msg << ", after it was down for " << int(period.sec())
224ce89b
WB
11660 << " seconds";
11661 }
11662
f67539c2 11663 mon.clog->info() << msg.str();
31f18b77 11664 any = true;
7c673cae 11665 }
31f18b77
FG
11666 } else if (prefix == "osd in") {
11667 if (osdmap.is_in(osd)) {
11668 if (verbose)
11669 ss << "osd." << osd << " is already in. ";
11670 } else {
11671 if (osdmap.osd_xinfo[osd].old_weight > 0) {
11672 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
11673 if (pending_inc.new_xinfo.count(osd) == 0) {
11674 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11675 }
11676 pending_inc.new_xinfo[osd].old_weight = 0;
11677 } else {
11678 pending_inc.new_weight[osd] = CEPH_OSD_IN;
7c673cae 11679 }
31f18b77
FG
11680 ss << "marked in osd." << osd << ". ";
11681 any = true;
11682 }
11683 } else if (prefix == "osd rm") {
11684 err = prepare_command_osd_remove(osd);
11685
11686 if (err == -EBUSY) {
11687 if (any)
11688 ss << ", ";
11689 ss << "osd." << osd << " is still up; must be down before removal. ";
7c673cae 11690 } else {
11fdf7f2 11691 ceph_assert(err == 0);
31f18b77
FG
11692 if (any) {
11693 ss << ", osd." << osd;
11694 } else {
11695 ss << "removed osd." << osd;
11696 }
11697 any = true;
7c673cae 11698 }
9f95a23c
TL
11699 } else if (prefix == "osd stop") {
11700 if (osdmap.is_stop(osd)) {
11701 if (verbose)
11702 ss << "osd." << osd << " is already stopped. ";
11703 } else if (osdmap.is_down(osd)) {
11704 pending_inc.pending_osd_state_set(osd, CEPH_OSD_STOP);
11705 ss << "stop down osd." << osd << ". ";
11706 any = true;
11707 } else {
11708 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP | CEPH_OSD_STOP);
11709 ss << "stop osd." << osd << ". ";
11710 any = true;
11711 }
31f18b77
FG
11712 }
11713 }
11714 }
11715 if (any) {
11716 getline(ss, rs);
11717 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11718 get_last_committed() + 1));
11719 return true;
11720 }
81eedcae
TL
11721 } else if (prefix == "osd set-group" ||
11722 prefix == "osd unset-group" ||
11723 prefix == "osd add-noup" ||
31f18b77
FG
11724 prefix == "osd add-nodown" ||
11725 prefix == "osd add-noin" ||
81eedcae
TL
11726 prefix == "osd add-noout" ||
11727 prefix == "osd rm-noup" ||
11728 prefix == "osd rm-nodown" ||
11729 prefix == "osd rm-noin" ||
11730 prefix == "osd rm-noout") {
11731 bool do_set = prefix == "osd set-group" ||
11732 prefix.find("add") != string::npos;
11733 string flag_str;
11734 unsigned flags = 0;
11735 vector<string> who;
11736 if (prefix == "osd set-group" || prefix == "osd unset-group") {
9f95a23c
TL
11737 cmd_getval(cmdmap, "flags", flag_str);
11738 cmd_getval(cmdmap, "who", who);
81eedcae
TL
11739 vector<string> raw_flags;
11740 boost::split(raw_flags, flag_str, boost::is_any_of(","));
11741 for (auto& f : raw_flags) {
11742 if (f == "noup")
11743 flags |= CEPH_OSD_NOUP;
11744 else if (f == "nodown")
11745 flags |= CEPH_OSD_NODOWN;
11746 else if (f == "noin")
11747 flags |= CEPH_OSD_NOIN;
11748 else if (f == "noout")
11749 flags |= CEPH_OSD_NOOUT;
11750 else {
11751 ss << "unrecognized flag '" << f << "', must be one of "
11752 << "{noup,nodown,noin,noout}";
11753 err = -EINVAL;
11754 goto reply;
11755 }
11756 }
31f18b77 11757 } else {
9f95a23c 11758 cmd_getval(cmdmap, "ids", who);
81eedcae
TL
11759 if (prefix.find("noup") != string::npos)
11760 flags = CEPH_OSD_NOUP;
11761 else if (prefix.find("nodown") != string::npos)
11762 flags = CEPH_OSD_NODOWN;
11763 else if (prefix.find("noin") != string::npos)
11764 flags = CEPH_OSD_NOIN;
11765 else if (prefix.find("noout") != string::npos)
11766 flags = CEPH_OSD_NOOUT;
11767 else
11768 ceph_assert(0 == "Unreachable!");
31f18b77 11769 }
81eedcae
TL
11770 if (flags == 0) {
11771 ss << "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11772 err = -EINVAL;
11773 goto reply;
11774 }
11775 if (who.empty()) {
11776 ss << "must specify at least one or more targets to set/unset";
11777 err = -EINVAL;
11778 goto reply;
11779 }
11780 set<int> osds;
11781 set<int> crush_nodes;
11782 set<int> device_classes;
11783 for (auto& w : who) {
11784 if (w == "any" || w == "all" || w == "*") {
31f18b77 11785 osdmap.get_all_osds(osds);
81eedcae 11786 break;
31f18b77 11787 }
81eedcae
TL
11788 std::stringstream ts;
11789 if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) {
11790 osds.insert(osd);
11791 } else if (osdmap.crush->name_exists(w)) {
11792 crush_nodes.insert(osdmap.crush->get_item_id(w));
11793 } else if (osdmap.crush->class_exists(w)) {
11794 device_classes.insert(osdmap.crush->get_class_id(w));
11795 } else {
11796 ss << "unable to parse osd id or crush node or device class: "
11797 << "\"" << w << "\". ";
7c673cae
FG
11798 }
11799 }
81eedcae
TL
11800 if (osds.empty() && crush_nodes.empty() && device_classes.empty()) {
11801 // ss has reason for failure
11802 err = -EINVAL;
11803 goto reply;
31f18b77 11804 }
31f18b77 11805 bool any = false;
81eedcae
TL
11806 for (auto osd : osds) {
11807 if (!osdmap.exists(osd)) {
11808 ss << "osd." << osd << " does not exist. ";
11809 continue;
11810 }
11811 if (do_set) {
11812 if (flags & CEPH_OSD_NOUP) {
11813 any |= osdmap.is_noup_by_osd(osd) ?
11814 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) :
11815 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
31f18b77 11816 }
81eedcae
TL
11817 if (flags & CEPH_OSD_NODOWN) {
11818 any |= osdmap.is_nodown_by_osd(osd) ?
11819 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) :
11820 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
11821 }
11822 if (flags & CEPH_OSD_NOIN) {
11823 any |= osdmap.is_noin_by_osd(osd) ?
11824 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) :
11825 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
11826 }
11827 if (flags & CEPH_OSD_NOOUT) {
11828 any |= osdmap.is_noout_by_osd(osd) ?
11829 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) :
11830 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
31f18b77 11831 }
31f18b77 11832 } else {
81eedcae
TL
11833 if (flags & CEPH_OSD_NOUP) {
11834 any |= osdmap.is_noup_by_osd(osd) ?
11835 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) :
11836 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP);
31f18b77 11837 }
81eedcae
TL
11838 if (flags & CEPH_OSD_NODOWN) {
11839 any |= osdmap.is_nodown_by_osd(osd) ?
11840 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) :
11841 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
31f18b77 11842 }
81eedcae
TL
11843 if (flags & CEPH_OSD_NOIN) {
11844 any |= osdmap.is_noin_by_osd(osd) ?
11845 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) :
11846 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN);
11847 }
11848 if (flags & CEPH_OSD_NOOUT) {
11849 any |= osdmap.is_noout_by_osd(osd) ?
11850 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) :
11851 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
31f18b77
FG
11852 }
11853 }
11854 }
81eedcae
TL
11855 for (auto& id : crush_nodes) {
11856 auto old_flags = osdmap.get_crush_node_flags(id);
11857 auto& pending_flags = pending_inc.new_crush_node_flags[id];
11858 pending_flags |= old_flags; // adopt existing flags first!
11859 if (do_set) {
11860 pending_flags |= flags;
11861 } else {
11862 pending_flags &= ~flags;
11863 }
11864 any = true;
11865 }
11866 for (auto& id : device_classes) {
11867 auto old_flags = osdmap.get_device_class_flags(id);
11868 auto& pending_flags = pending_inc.new_device_class_flags[id];
11869 pending_flags |= old_flags;
11870 if (do_set) {
11871 pending_flags |= flags;
11872 } else {
11873 pending_flags &= ~flags;
11874 }
11875 any = true;
11876 }
31f18b77
FG
11877 if (any) {
11878 getline(ss, rs);
11879 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11880 get_last_committed() + 1));
7c673cae
FG
11881 return true;
11882 }
11883 } else if (prefix == "osd pg-temp") {
11884 string pgidstr;
9f95a23c 11885 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
7c673cae 11886 ss << "unable to parse 'pgid' value '"
11fdf7f2 11887 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
7c673cae
FG
11888 err = -EINVAL;
11889 goto reply;
11890 }
11891 pg_t pgid;
11892 if (!pgid.parse(pgidstr.c_str())) {
11893 ss << "invalid pgid '" << pgidstr << "'";
11894 err = -EINVAL;
11895 goto reply;
11896 }
11897 if (!osdmap.pg_exists(pgid)) {
11898 ss << "pg " << pgid << " does not exist";
11899 err = -ENOENT;
11900 goto reply;
11901 }
11902 if (pending_inc.new_pg_temp.count(pgid)) {
11903 dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
11904 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11905 return true;
11906 }
11907
11908 vector<int64_t> id_vec;
11909 vector<int32_t> new_pg_temp;
9f95a23c 11910 cmd_getval(cmdmap, "id", id_vec);
11fdf7f2
TL
11911 if (id_vec.empty()) {
11912 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>();
11913 ss << "done cleaning up pg_temp of " << pgid;
11914 goto update;
7c673cae
FG
11915 }
11916 for (auto osd : id_vec) {
11917 if (!osdmap.exists(osd)) {
11918 ss << "osd." << osd << " does not exist";
11919 err = -ENOENT;
11920 goto reply;
11921 }
11922 new_pg_temp.push_back(osd);
11923 }
11924
224ce89b
WB
11925 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
11926 if ((int)new_pg_temp.size() < pool_min_size) {
11927 ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
11928 << pool_min_size << ")";
11929 err = -EINVAL;
11930 goto reply;
11931 }
11932
11933 int pool_size = osdmap.get_pg_pool_size(pgid);
11934 if ((int)new_pg_temp.size() > pool_size) {
11935 ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
11936 << pool_size << ")";
11937 err = -EINVAL;
11938 goto reply;
11939 }
11940
7c673cae
FG
11941 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
11942 new_pg_temp.begin(), new_pg_temp.end());
11943 ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
11944 goto update;
11945 } else if (prefix == "osd primary-temp") {
11946 string pgidstr;
9f95a23c 11947 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
7c673cae 11948 ss << "unable to parse 'pgid' value '"
11fdf7f2 11949 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
7c673cae
FG
11950 err = -EINVAL;
11951 goto reply;
11952 }
11953 pg_t pgid;
11954 if (!pgid.parse(pgidstr.c_str())) {
11955 ss << "invalid pgid '" << pgidstr << "'";
11956 err = -EINVAL;
11957 goto reply;
11958 }
11959 if (!osdmap.pg_exists(pgid)) {
11960 ss << "pg " << pgid << " does not exist";
11961 err = -ENOENT;
11962 goto reply;
11963 }
11964
11965 int64_t osd;
9f95a23c 11966 if (!cmd_getval(cmdmap, "id", osd)) {
7c673cae 11967 ss << "unable to parse 'id' value '"
11fdf7f2 11968 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
7c673cae
FG
11969 err = -EINVAL;
11970 goto reply;
11971 }
11972 if (osd != -1 && !osdmap.exists(osd)) {
11973 ss << "osd." << osd << " does not exist";
11974 err = -ENOENT;
11975 goto reply;
11976 }
11977
9f95a23c
TL
11978 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
11979 osdmap.require_min_compat_client < ceph_release_t::firefly) {
31f18b77 11980 ss << "require_min_compat_client "
9f95a23c 11981 << osdmap.require_min_compat_client
7c673cae
FG
11982 << " < firefly, which is required for primary-temp";
11983 err = -EPERM;
11984 goto reply;
7c673cae
FG
11985 }
11986
11987 pending_inc.new_primary_temp[pgid] = osd;
11988 ss << "set " << pgid << " primary_temp mapping to " << osd;
11989 goto update;
11fdf7f2
TL
11990 } else if (prefix == "pg repeer") {
11991 pg_t pgid;
11992 string pgidstr;
9f95a23c 11993 cmd_getval(cmdmap, "pgid", pgidstr);
11fdf7f2
TL
11994 if (!pgid.parse(pgidstr.c_str())) {
11995 ss << "invalid pgid '" << pgidstr << "'";
11996 err = -EINVAL;
11997 goto reply;
11998 }
11999 if (!osdmap.pg_exists(pgid)) {
12000 ss << "pg '" << pgidstr << "' does not exist";
12001 err = -ENOENT;
12002 goto reply;
12003 }
12004 vector<int> acting;
12005 int primary;
12006 osdmap.pg_to_acting_osds(pgid, &acting, &primary);
12007 if (primary < 0) {
12008 err = -EAGAIN;
12009 ss << "pg currently has no primary";
12010 goto reply;
12011 }
12012 if (acting.size() > 1) {
12013 // map to just primary; it will map back to what it wants
12014 pending_inc.new_pg_temp[pgid] = { primary };
12015 } else {
12016 // hmm, pick another arbitrary osd to induce a change. Note
12017 // that this won't work if there is only one suitable OSD in the cluster.
12018 int i;
12019 bool done = false;
12020 for (i = 0; i < osdmap.get_max_osd(); ++i) {
12021 if (i == primary || !osdmap.is_up(i) || !osdmap.exists(i)) {
12022 continue;
12023 }
12024 pending_inc.new_pg_temp[pgid] = { primary, i };
12025 done = true;
12026 break;
12027 }
12028 if (!done) {
12029 err = -EAGAIN;
12030 ss << "not enough up OSDs in the cluster to force repeer";
12031 goto reply;
12032 }
12033 }
12034 goto update;
224ce89b
WB
12035 } else if (prefix == "osd pg-upmap" ||
12036 prefix == "osd rm-pg-upmap" ||
12037 prefix == "osd pg-upmap-items" ||
12038 prefix == "osd rm-pg-upmap-items") {
9f95a23c 12039 if (osdmap.require_min_compat_client < ceph_release_t::luminous) {
31f18b77 12040 ss << "min_compat_client "
9f95a23c 12041 << osdmap.require_min_compat_client
224ce89b
WB
12042 << " < luminous, which is required for pg-upmap. "
12043 << "Try 'ceph osd set-require-min-compat-client luminous' "
12044 << "before using the new interface";
7c673cae
FG
12045 err = -EPERM;
12046 goto reply;
12047 }
12048 err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
12049 if (err == -EAGAIN)
12050 goto wait;
12051 if (err < 0)
12052 goto reply;
12053 string pgidstr;
9f95a23c 12054 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
7c673cae 12055 ss << "unable to parse 'pgid' value '"
11fdf7f2 12056 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
7c673cae
FG
12057 err = -EINVAL;
12058 goto reply;
12059 }
12060 pg_t pgid;
12061 if (!pgid.parse(pgidstr.c_str())) {
12062 ss << "invalid pgid '" << pgidstr << "'";
12063 err = -EINVAL;
12064 goto reply;
12065 }
12066 if (!osdmap.pg_exists(pgid)) {
12067 ss << "pg " << pgid << " does not exist";
12068 err = -ENOENT;
12069 goto reply;
12070 }
94b18763
FG
12071 if (pending_inc.old_pools.count(pgid.pool())) {
12072 ss << "pool of " << pgid << " is pending removal";
12073 err = -ENOENT;
12074 getline(ss, rs);
12075 wait_for_finished_proposal(op,
12076 new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
12077 return true;
12078 }
224ce89b
WB
12079
12080 enum {
12081 OP_PG_UPMAP,
12082 OP_RM_PG_UPMAP,
12083 OP_PG_UPMAP_ITEMS,
12084 OP_RM_PG_UPMAP_ITEMS,
12085 } option;
12086
12087 if (prefix == "osd pg-upmap") {
12088 option = OP_PG_UPMAP;
12089 } else if (prefix == "osd rm-pg-upmap") {
12090 option = OP_RM_PG_UPMAP;
12091 } else if (prefix == "osd pg-upmap-items") {
12092 option = OP_PG_UPMAP_ITEMS;
12093 } else {
12094 option = OP_RM_PG_UPMAP_ITEMS;
7c673cae 12095 }
224ce89b
WB
12096
12097 // check pending upmap changes
12098 switch (option) {
12099 case OP_PG_UPMAP: // fall through
12100 case OP_RM_PG_UPMAP:
12101 if (pending_inc.new_pg_upmap.count(pgid) ||
12102 pending_inc.old_pg_upmap.count(pgid)) {
12103 dout(10) << __func__ << " waiting for pending update on "
12104 << pgid << dendl;
12105 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12106 return true;
7c673cae 12107 }
224ce89b 12108 break;
7c673cae 12109
224ce89b
WB
12110 case OP_PG_UPMAP_ITEMS: // fall through
12111 case OP_RM_PG_UPMAP_ITEMS:
12112 if (pending_inc.new_pg_upmap_items.count(pgid) ||
12113 pending_inc.old_pg_upmap_items.count(pgid)) {
12114 dout(10) << __func__ << " waiting for pending update on "
12115 << pgid << dendl;
12116 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12117 return true;
12118 }
12119 break;
7c673cae 12120
224ce89b 12121 default:
11fdf7f2 12122 ceph_abort_msg("invalid option");
7c673cae 12123 }
224ce89b
WB
12124
12125 switch (option) {
12126 case OP_PG_UPMAP:
12127 {
12128 vector<int64_t> id_vec;
9f95a23c 12129 if (!cmd_getval(cmdmap, "id", id_vec)) {
224ce89b 12130 ss << "unable to parse 'id' value(s) '"
11fdf7f2 12131 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
224ce89b
WB
12132 err = -EINVAL;
12133 goto reply;
12134 }
12135
12136 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
12137 if ((int)id_vec.size() < pool_min_size) {
12138 ss << "num of osds (" << id_vec.size() <<") < pool min size ("
12139 << pool_min_size << ")";
12140 err = -EINVAL;
12141 goto reply;
12142 }
12143
12144 int pool_size = osdmap.get_pg_pool_size(pgid);
12145 if ((int)id_vec.size() > pool_size) {
12146 ss << "num of osds (" << id_vec.size() <<") > pool size ("
12147 << pool_size << ")";
12148 err = -EINVAL;
12149 goto reply;
12150 }
12151
12152 vector<int32_t> new_pg_upmap;
12153 for (auto osd : id_vec) {
12154 if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
12155 ss << "osd." << osd << " does not exist";
12156 err = -ENOENT;
12157 goto reply;
12158 }
12159 auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
12160 if (it != new_pg_upmap.end()) {
12161 ss << "osd." << osd << " already exists, ";
12162 continue;
12163 }
12164 new_pg_upmap.push_back(osd);
12165 }
12166
12167 if (new_pg_upmap.empty()) {
12168 ss << "no valid upmap items(pairs) is specified";
12169 err = -EINVAL;
12170 goto reply;
12171 }
12172
12173 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
12174 new_pg_upmap.begin(), new_pg_upmap.end());
12175 ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
7c673cae 12176 }
224ce89b
WB
12177 break;
12178
12179 case OP_RM_PG_UPMAP:
12180 {
12181 pending_inc.old_pg_upmap.insert(pgid);
12182 ss << "clear " << pgid << " pg_upmap mapping";
7c673cae 12183 }
224ce89b 12184 break;
7c673cae 12185
224ce89b
WB
12186 case OP_PG_UPMAP_ITEMS:
12187 {
12188 vector<int64_t> id_vec;
9f95a23c 12189 if (!cmd_getval(cmdmap, "id", id_vec)) {
224ce89b 12190 ss << "unable to parse 'id' value(s) '"
11fdf7f2 12191 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
224ce89b
WB
12192 err = -EINVAL;
12193 goto reply;
12194 }
12195
12196 if (id_vec.size() % 2) {
12197 ss << "you must specify pairs of osd ids to be remapped";
12198 err = -EINVAL;
12199 goto reply;
12200 }
12201
12202 int pool_size = osdmap.get_pg_pool_size(pgid);
12203 if ((int)(id_vec.size() / 2) > pool_size) {
12204 ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
12205 << pool_size << ")";
12206 err = -EINVAL;
12207 goto reply;
12208 }
12209
12210 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
12211 ostringstream items;
12212 items << "[";
12213 for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
12214 int from = *p++;
12215 int to = *p;
12216 if (from == to) {
12217 ss << "from osd." << from << " == to osd." << to << ", ";
12218 continue;
12219 }
12220 if (!osdmap.exists(from)) {
12221 ss << "osd." << from << " does not exist";
12222 err = -ENOENT;
12223 goto reply;
12224 }
12225 if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
12226 ss << "osd." << to << " does not exist";
12227 err = -ENOENT;
12228 goto reply;
12229 }
c07f9fc5
FG
12230 pair<int32_t,int32_t> entry = make_pair(from, to);
12231 auto it = std::find(new_pg_upmap_items.begin(),
12232 new_pg_upmap_items.end(), entry);
12233 if (it != new_pg_upmap_items.end()) {
12234 ss << "osd." << from << " -> osd." << to << " already exists, ";
12235 continue;
12236 }
12237 new_pg_upmap_items.push_back(entry);
224ce89b
WB
12238 items << from << "->" << to << ",";
12239 }
12240 string out(items.str());
12241 out.resize(out.size() - 1); // drop last ','
12242 out += "]";
12243
12244 if (new_pg_upmap_items.empty()) {
12245 ss << "no valid upmap items(pairs) is specified";
12246 err = -EINVAL;
12247 goto reply;
12248 }
12249
12250 pending_inc.new_pg_upmap_items[pgid] =
12251 mempool::osdmap::vector<pair<int32_t,int32_t>>(
12252 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
12253 ss << "set " << pgid << " pg_upmap_items mapping to " << out;
12254 }
12255 break;
12256
12257 case OP_RM_PG_UPMAP_ITEMS:
12258 {
12259 pending_inc.old_pg_upmap_items.insert(pgid);
12260 ss << "clear " << pgid << " pg_upmap_items mapping";
12261 }
12262 break;
12263
12264 default:
11fdf7f2 12265 ceph_abort_msg("invalid option");
7c673cae
FG
12266 }
12267
7c673cae
FG
12268 goto update;
12269 } else if (prefix == "osd primary-affinity") {
12270 int64_t id;
9f95a23c 12271 if (!cmd_getval(cmdmap, "id", id)) {
7c673cae 12272 ss << "invalid osd id value '"
11fdf7f2 12273 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
7c673cae
FG
12274 err = -EINVAL;
12275 goto reply;
12276 }
12277 double w;
9f95a23c 12278 if (!cmd_getval(cmdmap, "weight", w)) {
7c673cae 12279 ss << "unable to parse 'weight' value '"
11fdf7f2 12280 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
12281 err = -EINVAL;
12282 goto reply;
12283 }
12284 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
12285 if (ww < 0L) {
12286 ss << "weight must be >= 0";
12287 err = -EINVAL;
12288 goto reply;
12289 }
9f95a23c
TL
12290 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
12291 osdmap.require_min_compat_client < ceph_release_t::firefly) {
31f18b77 12292 ss << "require_min_compat_client "
9f95a23c 12293 << osdmap.require_min_compat_client
7c673cae
FG
12294 << " < firefly, which is required for primary-affinity";
12295 err = -EPERM;
12296 goto reply;
7c673cae 12297 }
7c673cae
FG
12298 if (osdmap.exists(id)) {
12299 pending_inc.new_primary_affinity[id] = ww;
f67539c2 12300 ss << "set osd." << id << " primary-affinity to " << w << " (" << std::ios::hex << ww << std::ios::dec << ")";
7c673cae
FG
12301 getline(ss, rs);
12302 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12303 get_last_committed() + 1));
12304 return true;
12305 } else {
12306 ss << "osd." << id << " does not exist";
12307 err = -ENOENT;
12308 goto reply;
12309 }
12310 } else if (prefix == "osd reweight") {
12311 int64_t id;
9f95a23c 12312 if (!cmd_getval(cmdmap, "id", id)) {
7c673cae 12313 ss << "unable to parse osd id value '"
11fdf7f2 12314 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
7c673cae
FG
12315 err = -EINVAL;
12316 goto reply;
12317 }
12318 double w;
9f95a23c 12319 if (!cmd_getval(cmdmap, "weight", w)) {
7c673cae 12320 ss << "unable to parse weight value '"
11fdf7f2 12321 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
12322 err = -EINVAL;
12323 goto reply;
12324 }
12325 long ww = (int)((double)CEPH_OSD_IN*w);
12326 if (ww < 0L) {
12327 ss << "weight must be >= 0";
12328 err = -EINVAL;
12329 goto reply;
12330 }
12331 if (osdmap.exists(id)) {
12332 pending_inc.new_weight[id] = ww;
12333 ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
12334 getline(ss, rs);
12335 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12336 get_last_committed() + 1));
12337 return true;
12338 } else {
12339 ss << "osd." << id << " does not exist";
12340 err = -ENOENT;
12341 goto reply;
12342 }
12343 } else if (prefix == "osd reweightn") {
12344 map<int32_t, uint32_t> weights;
11fdf7f2 12345 err = parse_reweights(cct, cmdmap, osdmap, &weights);
7c673cae
FG
12346 if (err) {
12347 ss << "unable to parse 'weights' value '"
11fdf7f2 12348 << cmd_vartype_stringify(cmdmap.at("weights")) << "'";
7c673cae
FG
12349 goto reply;
12350 }
12351 pending_inc.new_weight.insert(weights.begin(), weights.end());
12352 wait_for_finished_proposal(
12353 op,
12354 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
224ce89b 12355 return true;
7c673cae
FG
12356 } else if (prefix == "osd lost") {
12357 int64_t id;
9f95a23c 12358 if (!cmd_getval(cmdmap, "id", id)) {
7c673cae 12359 ss << "unable to parse osd id value '"
11fdf7f2 12360 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
7c673cae
FG
12361 err = -EINVAL;
12362 goto reply;
12363 }
11fdf7f2 12364 bool sure = false;
9f95a23c 12365 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11fdf7f2 12366 if (!sure) {
7c673cae
FG
12367 ss << "are you SURE? this might mean real, permanent data loss. pass "
12368 "--yes-i-really-mean-it if you really do.";
12369 err = -EPERM;
12370 goto reply;
12371 } else if (!osdmap.exists(id)) {
12372 ss << "osd." << id << " does not exist";
12373 err = -ENOENT;
12374 goto reply;
12375 } else if (!osdmap.is_down(id)) {
12376 ss << "osd." << id << " is not down";
12377 err = -EBUSY;
12378 goto reply;
12379 } else {
12380 epoch_t e = osdmap.get_info(id).down_at;
12381 pending_inc.new_lost[id] = e;
12382 ss << "marked osd lost in epoch " << e;
12383 getline(ss, rs);
12384 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12385 get_last_committed() + 1));
12386 return true;
12387 }
12388
11fdf7f2
TL
12389 } else if (prefix == "osd destroy-actual" ||
12390 prefix == "osd purge-actual" ||
12391 prefix == "osd purge-new") {
31f18b77
FG
12392 /* Destroying an OSD means that we don't expect to further make use of
12393 * the OSDs data (which may even become unreadable after this operation),
12394 * and that we are okay with scrubbing all its cephx keys and config-key
12395 * data (which may include lockbox keys, thus rendering the osd's data
12396 * unreadable).
12397 *
12398 * The OSD will not be removed. Instead, we will mark it as destroyed,
12399 * such that a subsequent call to `create` will not reuse the osd id.
12400 * This will play into being able to recreate the OSD, at the same
12401 * crush location, with minimal data movement.
12402 */
12403
12404 // make sure authmon is writeable.
f67539c2 12405 if (!mon.authmon()->is_writeable()) {
31f18b77
FG
12406 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12407 << "osd destroy" << dendl;
f67539c2 12408 mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
31f18b77
FG
12409 return false;
12410 }
12411
12412 int64_t id;
9f95a23c 12413 if (!cmd_getval(cmdmap, "id", id)) {
11fdf7f2
TL
12414 auto p = cmdmap.find("id");
12415 if (p == cmdmap.end()) {
12416 ss << "no osd id specified";
12417 } else {
12418 ss << "unable to parse osd id value '"
12419 << cmd_vartype_stringify(cmdmap.at("id")) << "";
12420 }
31f18b77
FG
12421 err = -EINVAL;
12422 goto reply;
12423 }
12424
11fdf7f2 12425 bool is_destroy = (prefix == "osd destroy-actual");
31f18b77 12426 if (!is_destroy) {
11fdf7f2
TL
12427 ceph_assert("osd purge-actual" == prefix ||
12428 "osd purge-new" == prefix);
31f18b77
FG
12429 }
12430
11fdf7f2 12431 bool sure = false;
9f95a23c 12432 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11fdf7f2
TL
12433 if (!sure) {
12434 ss << "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
12435 << "This will mean real, permanent data loss, as well "
12436 << "as deletion of cephx and lockbox keys. "
12437 << "Pass --yes-i-really-mean-it if you really do.";
31f18b77
FG
12438 err = -EPERM;
12439 goto reply;
d2e6a577 12440 } else if (!osdmap.exists(id)) {
31f18b77 12441 ss << "osd." << id << " does not exist";
d2e6a577 12442 err = 0; // idempotent
31f18b77
FG
12443 goto reply;
12444 } else if (osdmap.is_up(id)) {
12445 ss << "osd." << id << " is not `down`.";
12446 err = -EBUSY;
12447 goto reply;
12448 } else if (is_destroy && osdmap.is_destroyed(id)) {
12449 ss << "destroyed osd." << id;
12450 err = 0;
12451 goto reply;
12452 }
12453
11fdf7f2
TL
12454 if (prefix == "osd purge-new" &&
12455 (osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
12456 ss << "osd." << id << " is not new";
12457 err = -EPERM;
12458 goto reply;
12459 }
12460
31f18b77
FG
12461 bool goto_reply = false;
12462
f67539c2 12463 paxos.plug();
31f18b77
FG
12464 if (is_destroy) {
12465 err = prepare_command_osd_destroy(id, ss);
12466 // we checked above that it should exist.
11fdf7f2 12467 ceph_assert(err != -ENOENT);
31f18b77
FG
12468 } else {
12469 err = prepare_command_osd_purge(id, ss);
12470 if (err == -ENOENT) {
12471 err = 0;
12472 ss << "osd." << id << " does not exist.";
12473 goto_reply = true;
12474 }
12475 }
f67539c2 12476 paxos.unplug();
31f18b77
FG
12477
12478 if (err < 0 || goto_reply) {
12479 goto reply;
12480 }
12481
12482 if (is_destroy) {
12483 ss << "destroyed osd." << id;
12484 } else {
12485 ss << "purged osd." << id;
12486 }
12487
12488 getline(ss, rs);
12489 wait_for_finished_proposal(op,
12490 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
12491 force_immediate_propose();
12492 return true;
12493
12494 } else if (prefix == "osd new") {
12495
12496 // make sure authmon is writeable.
f67539c2 12497 if (!mon.authmon()->is_writeable()) {
31f18b77 12498 dout(10) << __func__ << " waiting for auth mon to be writeable for "
224ce89b 12499 << "osd new" << dendl;
f67539c2 12500 mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
31f18b77
FG
12501 return false;
12502 }
12503
3a9019d9 12504 map<string,string> param_map;
31f18b77
FG
12505
12506 bufferlist bl = m->get_data();
3a9019d9
FG
12507 string param_json = bl.to_str();
12508 dout(20) << __func__ << " osd new json = " << param_json << dendl;
31f18b77 12509
3a9019d9 12510 err = get_json_str_map(param_json, ss, &param_map);
31f18b77
FG
12511 if (err < 0)
12512 goto reply;
12513
3a9019d9 12514 dout(20) << __func__ << " osd new params " << param_map << dendl;
31f18b77 12515
f67539c2 12516 paxos.plug();
3a9019d9 12517 err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
f67539c2 12518 paxos.unplug();
31f18b77
FG
12519
12520 if (err < 0) {
12521 goto reply;
12522 }
12523
12524 if (f) {
12525 f->flush(rdata);
12526 } else {
12527 rdata.append(ss);
12528 }
12529
12530 if (err == EEXIST) {
12531 // idempotent operation
12532 err = 0;
12533 goto reply;
12534 }
12535
12536 wait_for_finished_proposal(op,
12537 new Monitor::C_Command(mon, op, 0, rs, rdata,
12538 get_last_committed() + 1));
12539 force_immediate_propose();
12540 return true;
12541
7c673cae 12542 } else if (prefix == "osd create") {
7c673cae
FG
12543
12544 // optional id provided?
31f18b77 12545 int64_t id = -1, cmd_id = -1;
9f95a23c 12546 if (cmd_getval(cmdmap, "id", cmd_id)) {
31f18b77
FG
12547 if (cmd_id < 0) {
12548 ss << "invalid osd id value '" << cmd_id << "'";
7c673cae
FG
12549 err = -EINVAL;
12550 goto reply;
12551 }
31f18b77 12552 dout(10) << " osd create got id " << cmd_id << dendl;
7c673cae
FG
12553 }
12554
7c673cae
FG
12555 uuid_d uuid;
12556 string uuidstr;
9f95a23c 12557 if (cmd_getval(cmdmap, "uuid", uuidstr)) {
7c673cae 12558 if (!uuid.parse(uuidstr.c_str())) {
31f18b77
FG
12559 ss << "invalid uuid value '" << uuidstr << "'";
12560 err = -EINVAL;
12561 goto reply;
7c673cae 12562 }
31f18b77
FG
12563 // we only care about the id if we also have the uuid, to
12564 // ensure the operation's idempotency.
12565 id = cmd_id;
7c673cae
FG
12566 }
12567
31f18b77
FG
12568 int32_t new_id = -1;
12569 err = prepare_command_osd_create(id, uuid, &new_id, ss);
12570 if (err < 0) {
12571 if (err == -EAGAIN) {
12572 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12573 return true;
12574 }
12575 // a check has failed; reply to the user.
12576 goto reply;
12577
12578 } else if (err == EEXIST) {
12579 // this is an idempotent operation; we can go ahead and reply.
12580 if (f) {
12581 f->open_object_section("created_osd");
12582 f->dump_int("osdid", new_id);
12583 f->close_section();
12584 f->flush(rdata);
12585 } else {
12586 ss << new_id;
12587 rdata.append(ss);
7c673cae 12588 }
31f18b77
FG
12589 err = 0;
12590 goto reply;
7c673cae
FG
12591 }
12592
3a9019d9
FG
12593 string empty_device_class;
12594 do_osd_create(id, uuid, empty_device_class, &new_id);
31f18b77 12595
7c673cae
FG
12596 if (f) {
12597 f->open_object_section("created_osd");
31f18b77 12598 f->dump_int("osdid", new_id);
7c673cae
FG
12599 f->close_section();
12600 f->flush(rdata);
12601 } else {
31f18b77 12602 ss << new_id;
7c673cae
FG
12603 rdata.append(ss);
12604 }
31f18b77
FG
12605 wait_for_finished_proposal(op,
12606 new Monitor::C_Command(mon, op, 0, rs, rdata,
12607 get_last_committed() + 1));
7c673cae
FG
12608 return true;
12609
f67539c2
TL
12610 } else if (prefix == "osd blocklist clear" ||
12611 prefix == "osd blacklist clear") {
12612 pending_inc.new_blocklist.clear();
12613 std::list<std::pair<entity_addr_t,utime_t > > blocklist;
12614 osdmap.get_blocklist(&blocklist);
12615 for (const auto &entry : blocklist) {
12616 pending_inc.old_blocklist.push_back(entry.first);
7c673cae 12617 }
f67539c2 12618 ss << " removed all blocklist entries";
7c673cae
FG
12619 getline(ss, rs);
12620 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12621 get_last_committed() + 1));
12622 return true;
f67539c2
TL
12623 } else if (prefix == "osd blocklist" ||
12624 prefix == "osd blacklist") {
7c673cae 12625 string addrstr;
9f95a23c 12626 cmd_getval(cmdmap, "addr", addrstr);
7c673cae
FG
12627 entity_addr_t addr;
12628 if (!addr.parse(addrstr.c_str(), 0)) {
12629 ss << "unable to parse address " << addrstr;
12630 err = -EINVAL;
12631 goto reply;
12632 }
12633 else {
9f95a23c 12634 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
f67539c2 12635 // always blocklist type ANY
11fdf7f2
TL
12636 addr.set_type(entity_addr_t::TYPE_ANY);
12637 } else {
12638 addr.set_type(entity_addr_t::TYPE_LEGACY);
12639 }
12640
f67539c2
TL
12641 string blocklistop;
12642 if (!cmd_getval(cmdmap, "blocklistop", blocklistop)) {
12643 cmd_getval(cmdmap, "blacklistop", blocklistop);
12644 }
12645 if (blocklistop == "add") {
7c673cae
FG
12646 utime_t expires = ceph_clock_now();
12647 double d;
12648 // default one hour
9f95a23c 12649 cmd_getval(cmdmap, "expire", d,
f67539c2 12650 g_conf()->mon_osd_blocklist_default_expire);
7c673cae
FG
12651 expires += d;
12652
f67539c2 12653 pending_inc.new_blocklist[addr] = expires;
224ce89b
WB
12654
12655 {
f67539c2
TL
12656 // cancel any pending un-blocklisting request too
12657 auto it = std::find(pending_inc.old_blocklist.begin(),
12658 pending_inc.old_blocklist.end(), addr);
12659 if (it != pending_inc.old_blocklist.end()) {
12660 pending_inc.old_blocklist.erase(it);
224ce89b
WB
12661 }
12662 }
12663
f67539c2 12664 ss << "blocklisting " << addr << " until " << expires << " (" << d << " sec)";
7c673cae
FG
12665 getline(ss, rs);
12666 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12667 get_last_committed() + 1));
12668 return true;
f67539c2
TL
12669 } else if (blocklistop == "rm") {
12670 if (osdmap.is_blocklisted(addr) ||
12671 pending_inc.new_blocklist.count(addr)) {
12672 if (osdmap.is_blocklisted(addr))
12673 pending_inc.old_blocklist.push_back(addr);
7c673cae 12674 else
f67539c2
TL
12675 pending_inc.new_blocklist.erase(addr);
12676 ss << "un-blocklisting " << addr;
7c673cae
FG
12677 getline(ss, rs);
12678 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12679 get_last_committed() + 1));
12680 return true;
12681 }
f67539c2 12682 ss << addr << " isn't blocklisted";
7c673cae
FG
12683 err = 0;
12684 goto reply;
12685 }
12686 }
12687 } else if (prefix == "osd pool mksnap") {
12688 string poolstr;
9f95a23c 12689 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
12690 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12691 if (pool < 0) {
12692 ss << "unrecognized pool '" << poolstr << "'";
12693 err = -ENOENT;
12694 goto reply;
12695 }
12696 string snapname;
9f95a23c 12697 cmd_getval(cmdmap, "snap", snapname);
7c673cae
FG
12698 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12699 if (p->is_unmanaged_snaps_mode()) {
12700 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12701 err = -EINVAL;
12702 goto reply;
12703 } else if (p->snap_exists(snapname.c_str())) {
12704 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12705 err = 0;
12706 goto reply;
12707 } else if (p->is_tier()) {
12708 ss << "pool " << poolstr << " is a cache tier";
12709 err = -EINVAL;
12710 goto reply;
12711 }
12712 pg_pool_t *pp = 0;
12713 if (pending_inc.new_pools.count(pool))
12714 pp = &pending_inc.new_pools[pool];
12715 if (!pp) {
12716 pp = &pending_inc.new_pools[pool];
12717 *pp = *p;
12718 }
12719 if (pp->snap_exists(snapname.c_str())) {
12720 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12721 } else {
12722 pp->add_snap(snapname.c_str(), ceph_clock_now());
12723 pp->set_snap_epoch(pending_inc.epoch);
12724 ss << "created pool " << poolstr << " snap " << snapname;
12725 }
12726 getline(ss, rs);
12727 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12728 get_last_committed() + 1));
12729 return true;
12730 } else if (prefix == "osd pool rmsnap") {
12731 string poolstr;
9f95a23c 12732 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
12733 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12734 if (pool < 0) {
12735 ss << "unrecognized pool '" << poolstr << "'";
12736 err = -ENOENT;
12737 goto reply;
12738 }
12739 string snapname;
9f95a23c 12740 cmd_getval(cmdmap, "snap", snapname);
7c673cae
FG
12741 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12742 if (p->is_unmanaged_snaps_mode()) {
12743 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12744 err = -EINVAL;
12745 goto reply;
12746 } else if (!p->snap_exists(snapname.c_str())) {
12747 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
12748 err = 0;
12749 goto reply;
12750 }
12751 pg_pool_t *pp = 0;
12752 if (pending_inc.new_pools.count(pool))
12753 pp = &pending_inc.new_pools[pool];
12754 if (!pp) {
12755 pp = &pending_inc.new_pools[pool];
12756 *pp = *p;
12757 }
12758 snapid_t sn = pp->snap_exists(snapname.c_str());
12759 if (sn) {
12760 pp->remove_snap(sn);
12761 pp->set_snap_epoch(pending_inc.epoch);
12762 ss << "removed pool " << poolstr << " snap " << snapname;
12763 } else {
12764 ss << "already removed pool " << poolstr << " snap " << snapname;
12765 }
12766 getline(ss, rs);
12767 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12768 get_last_committed() + 1));
12769 return true;
12770 } else if (prefix == "osd pool create") {
11fdf7f2 12771 int64_t pg_num, pg_num_min;
7c673cae 12772 int64_t pgp_num;
9f95a23c
TL
12773 cmd_getval(cmdmap, "pg_num", pg_num, int64_t(0));
12774 cmd_getval(cmdmap, "pgp_num", pgp_num, pg_num);
12775 cmd_getval(cmdmap, "pg_num_min", pg_num_min, int64_t(0));
7c673cae
FG
12776
12777 string pool_type_str;
9f95a23c 12778 cmd_getval(cmdmap, "pool_type", pool_type_str);
7c673cae 12779 if (pool_type_str.empty())
11fdf7f2 12780 pool_type_str = g_conf().get_val<string>("osd_pool_default_type");
7c673cae
FG
12781
12782 string poolstr;
9f95a23c 12783 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
12784 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12785 if (pool_id >= 0) {
12786 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12787 if (pool_type_str != p->get_type_name()) {
12788 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
12789 err = -EINVAL;
12790 } else {
12791 ss << "pool '" << poolstr << "' already exists";
12792 err = 0;
12793 }
12794 goto reply;
12795 }
12796
12797 int pool_type;
12798 if (pool_type_str == "replicated") {
12799 pool_type = pg_pool_t::TYPE_REPLICATED;
12800 } else if (pool_type_str == "erasure") {
7c673cae
FG
12801 pool_type = pg_pool_t::TYPE_ERASURE;
12802 } else {
12803 ss << "unknown pool type '" << pool_type_str << "'";
12804 err = -EINVAL;
12805 goto reply;
12806 }
12807
31f18b77 12808 bool implicit_rule_creation = false;
94b18763 12809 int64_t expected_num_objects = 0;
31f18b77 12810 string rule_name;
9f95a23c 12811 cmd_getval(cmdmap, "rule", rule_name);
7c673cae 12812 string erasure_code_profile;
9f95a23c 12813 cmd_getval(cmdmap, "erasure_code_profile", erasure_code_profile);
7c673cae
FG
12814
12815 if (pool_type == pg_pool_t::TYPE_ERASURE) {
12816 if (erasure_code_profile == "")
12817 erasure_code_profile = "default";
12818 //handle the erasure code profile
12819 if (erasure_code_profile == "default") {
12820 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
12821 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
12822 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
12823 goto wait;
12824 }
12825
12826 map<string,string> profile_map;
11fdf7f2 12827 err = osdmap.get_erasure_code_profile_default(cct,
7c673cae
FG
12828 profile_map,
12829 &ss);
12830 if (err)
12831 goto reply;
12832 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
12833 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
12834 goto wait;
12835 }
12836 }
31f18b77
FG
12837 if (rule_name == "") {
12838 implicit_rule_creation = true;
7c673cae 12839 if (erasure_code_profile == "default") {
31f18b77 12840 rule_name = "erasure-code";
7c673cae 12841 } else {
31f18b77 12842 dout(1) << "implicitly use rule named after the pool: "
7c673cae 12843 << poolstr << dendl;
31f18b77 12844 rule_name = poolstr;
7c673cae
FG
12845 }
12846 }
9f95a23c 12847 cmd_getval(cmdmap, "expected_num_objects",
94b18763 12848 expected_num_objects, int64_t(0));
7c673cae 12849 } else {
31f18b77 12850 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
94b18763
FG
12851 // and put expected_num_objects to rule field
12852 if (erasure_code_profile != "") { // cmd is from CLI
12853 if (rule_name != "") {
12854 string interr;
12855 expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
12856 if (interr.length()) {
12857 ss << "error parsing integer value '" << rule_name << "': " << interr;
12858 err = -EINVAL;
12859 goto reply;
12860 }
12861 }
12862 rule_name = erasure_code_profile;
12863 } else { // cmd is well-formed
9f95a23c 12864 cmd_getval(cmdmap, "expected_num_objects",
94b18763
FG
12865 expected_num_objects, int64_t(0));
12866 }
7c673cae
FG
12867 }
12868
31f18b77
FG
12869 if (!implicit_rule_creation && rule_name != "") {
12870 int rule;
12871 err = get_crush_rule(rule_name, &rule, &ss);
7c673cae
FG
12872 if (err == -EAGAIN) {
12873 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12874 return true;
12875 }
12876 if (err)
12877 goto reply;
12878 }
12879
7c673cae
FG
12880 if (expected_num_objects < 0) {
12881 ss << "'expected_num_objects' must be non-negative";
12882 err = -EINVAL;
12883 goto reply;
12884 }
12885
f6b5b4d7
TL
12886 set<int32_t> osds;
12887 osdmap.get_all_osds(osds);
12888 bool has_filestore_osd = std::any_of(osds.begin(), osds.end(), [this](int osd) {
12889 string type;
12890 if (!get_osd_objectstore_type(osd, &type)) {
12891 return type == "filestore";
12892 } else {
12893 return false;
12894 }
12895 });
12896
12897 if (has_filestore_osd &&
12898 expected_num_objects > 0 &&
12899 cct->_conf->filestore_merge_threshold > 0) {
91327a77
AA
12900 ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
12901 err = -EINVAL;
12902 goto reply;
12903 }
12904
f6b5b4d7
TL
12905 if (has_filestore_osd &&
12906 expected_num_objects == 0 &&
12907 cct->_conf->filestore_merge_threshold < 0) {
91327a77 12908 int osds = osdmap.get_num_osds();
f6b5b4d7
TL
12909 bool sure = false;
12910 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12911 if (!sure && osds && (pg_num >= 1024 || pg_num / osds >= 100)) {
91327a77 12912 ss << "For better initial performance on pools expected to store a "
f6b5b4d7
TL
12913 << "large number of objects, consider supplying the "
12914 << "expected_num_objects parameter when creating the pool."
12915 << " Pass --yes-i-really-mean-it to ignore it";
12916 err = -EPERM;
12917 goto reply;
91327a77
AA
12918 }
12919 }
12920
7c673cae 12921 int64_t fast_read_param;
9f95a23c 12922 cmd_getval(cmdmap, "fast_read", fast_read_param, int64_t(-1));
7c673cae
FG
12923 FastReadType fast_read = FAST_READ_DEFAULT;
12924 if (fast_read_param == 0)
12925 fast_read = FAST_READ_OFF;
12926 else if (fast_read_param > 0)
12927 fast_read = FAST_READ_ON;
11fdf7f2
TL
12928
12929 int64_t repl_size = 0;
9f95a23c 12930 cmd_getval(cmdmap, "size", repl_size);
11fdf7f2
TL
12931 int64_t target_size_bytes = 0;
12932 double target_size_ratio = 0.0;
9f95a23c
TL
12933 cmd_getval(cmdmap, "target_size_bytes", target_size_bytes);
12934 cmd_getval(cmdmap, "target_size_ratio", target_size_ratio);
12935
12936 string pg_autoscale_mode;
12937 cmd_getval(cmdmap, "autoscale_mode", pg_autoscale_mode);
11fdf7f2
TL
12938
12939 err = prepare_new_pool(poolstr,
7c673cae 12940 -1, // default crush rule
31f18b77 12941 rule_name,
11fdf7f2
TL
12942 pg_num, pgp_num, pg_num_min,
12943 repl_size, target_size_bytes, target_size_ratio,
7c673cae
FG
12944 erasure_code_profile, pool_type,
12945 (uint64_t)expected_num_objects,
12946 fast_read,
9f95a23c 12947 pg_autoscale_mode,
7c673cae
FG
12948 &ss);
12949 if (err < 0) {
12950 switch(err) {
12951 case -EEXIST:
12952 ss << "pool '" << poolstr << "' already exists";
12953 break;
12954 case -EAGAIN:
12955 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12956 return true;
12957 case -ERANGE:
12958 goto reply;
12959 default:
12960 goto reply;
12961 break;
12962 }
12963 } else {
12964 ss << "pool '" << poolstr << "' created";
12965 }
12966 getline(ss, rs);
12967 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12968 get_last_committed() + 1));
12969 return true;
12970
12971 } else if (prefix == "osd pool delete" ||
12972 prefix == "osd pool rm") {
12973 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
12974 string poolstr, poolstr2, sure;
9f95a23c
TL
12975 cmd_getval(cmdmap, "pool", poolstr);
12976 cmd_getval(cmdmap, "pool2", poolstr2);
7c673cae
FG
12977 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12978 if (pool < 0) {
12979 ss << "pool '" << poolstr << "' does not exist";
12980 err = 0;
12981 goto reply;
12982 }
12983
11fdf7f2 12984 bool force_no_fake = false;
9f95a23c 12985 cmd_getval(cmdmap, "yes_i_really_really_mean_it", force_no_fake);
11fdf7f2 12986 bool force = false;
9f95a23c 12987 cmd_getval(cmdmap, "yes_i_really_really_mean_it_not_faking", force);
7c673cae 12988 if (poolstr2 != poolstr ||
11fdf7f2 12989 (!force && !force_no_fake)) {
7c673cae
FG
12990 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
12991 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
12992 << "followed by --yes-i-really-really-mean-it.";
12993 err = -EPERM;
12994 goto reply;
12995 }
12996 err = _prepare_remove_pool(pool, &ss, force_no_fake);
12997 if (err == -EAGAIN) {
12998 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12999 return true;
13000 }
13001 if (err < 0)
13002 goto reply;
13003 goto update;
13004 } else if (prefix == "osd pool rename") {
13005 string srcpoolstr, destpoolstr;
9f95a23c
TL
13006 cmd_getval(cmdmap, "srcpool", srcpoolstr);
13007 cmd_getval(cmdmap, "destpool", destpoolstr);
7c673cae
FG
13008 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
13009 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
13010
13011 if (pool_src < 0) {
13012 if (pool_dst >= 0) {
13013 // src pool doesn't exist, dst pool does exist: to ensure idempotency
13014 // of operations, assume this rename succeeded, as it is not changing
13015 // the current state. Make sure we output something understandable
13016 // for whoever is issuing the command, if they are paying attention,
13017 // in case it was not intentional; or to avoid a "wtf?" and a bug
13018 // report in case it was intentional, while expecting a failure.
13019 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
13020 << destpoolstr << "' does -- assuming successful rename";
13021 err = 0;
13022 } else {
13023 ss << "unrecognized pool '" << srcpoolstr << "'";
13024 err = -ENOENT;
13025 }
13026 goto reply;
13027 } else if (pool_dst >= 0) {
13028 // source pool exists and so does the destination pool
13029 ss << "pool '" << destpoolstr << "' already exists";
13030 err = -EEXIST;
13031 goto reply;
13032 }
13033
13034 int ret = _prepare_rename_pool(pool_src, destpoolstr);
13035 if (ret == 0) {
13036 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
13037 } else {
13038 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
13039 << cpp_strerror(ret);
13040 }
13041 getline(ss, rs);
13042 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
13043 get_last_committed() + 1));
13044 return true;
13045
13046 } else if (prefix == "osd pool set") {
13047 err = prepare_command_pool_set(cmdmap, ss);
13048 if (err == -EAGAIN)
13049 goto wait;
13050 if (err < 0)
13051 goto reply;
13052
13053 getline(ss, rs);
13054 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13055 get_last_committed() + 1));
13056 return true;
13057 } else if (prefix == "osd tier add") {
13058 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13059 if (err == -EAGAIN)
13060 goto wait;
13061 if (err)
13062 goto reply;
13063 string poolstr;
9f95a23c 13064 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
13065 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13066 if (pool_id < 0) {
13067 ss << "unrecognized pool '" << poolstr << "'";
13068 err = -ENOENT;
13069 goto reply;
13070 }
13071 string tierpoolstr;
9f95a23c 13072 cmd_getval(cmdmap, "tierpool", tierpoolstr);
7c673cae
FG
13073 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13074 if (tierpool_id < 0) {
13075 ss << "unrecognized pool '" << tierpoolstr << "'";
13076 err = -ENOENT;
13077 goto reply;
13078 }
13079 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 13080 ceph_assert(p);
7c673cae 13081 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11fdf7f2 13082 ceph_assert(tp);
7c673cae
FG
13083
13084 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13085 goto reply;
13086 }
13087
13088 // make sure new tier is empty
13089 string force_nonempty;
9f95a23c 13090 cmd_getval(cmdmap, "force_nonempty", force_nonempty);
f67539c2 13091 const pool_stat_t *pstats = mon.mgrstatmon()->get_pool_stat(tierpool_id);
31f18b77 13092 if (pstats && pstats->stats.sum.num_objects != 0 &&
7c673cae
FG
13093 force_nonempty != "--force-nonempty") {
13094 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
13095 err = -ENOTEMPTY;
13096 goto reply;
13097 }
11fdf7f2 13098 if (tp->is_erasure()) {
7c673cae
FG
13099 ss << "tier pool '" << tierpoolstr
13100 << "' is an ec pool, which cannot be a tier";
13101 err = -ENOTSUP;
13102 goto reply;
13103 }
13104 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
13105 ((force_nonempty != "--force-nonempty") ||
11fdf7f2 13106 (!g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps))) {
7c673cae
FG
13107 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
13108 err = -ENOTEMPTY;
13109 goto reply;
13110 }
13111 // go
13112 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13113 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13114 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13115 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13116 return true;
13117 }
13118 np->tiers.insert(tierpool_id);
13119 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13120 ntp->tier_of = pool_id;
13121 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
13122 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13123 get_last_committed() + 1));
13124 return true;
13125 } else if (prefix == "osd tier remove" ||
13126 prefix == "osd tier rm") {
13127 string poolstr;
9f95a23c 13128 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
13129 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13130 if (pool_id < 0) {
13131 ss << "unrecognized pool '" << poolstr << "'";
13132 err = -ENOENT;
13133 goto reply;
13134 }
13135 string tierpoolstr;
9f95a23c 13136 cmd_getval(cmdmap, "tierpool", tierpoolstr);
7c673cae
FG
13137 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13138 if (tierpool_id < 0) {
13139 ss << "unrecognized pool '" << tierpoolstr << "'";
13140 err = -ENOENT;
13141 goto reply;
13142 }
13143 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 13144 ceph_assert(p);
7c673cae 13145 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11fdf7f2 13146 ceph_assert(tp);
7c673cae
FG
13147
13148 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
13149 goto reply;
13150 }
13151
13152 if (p->tiers.count(tierpool_id) == 0) {
13153 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
13154 err = 0;
13155 goto reply;
13156 }
13157 if (tp->tier_of != pool_id) {
13158 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
13159 << osdmap.get_pool_name(tp->tier_of) << "': "
13160 // be scary about it; this is an inconsistency and bells must go off
13161 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
13162 err = -EINVAL;
13163 goto reply;
13164 }
13165 if (p->read_tier == tierpool_id) {
13166 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
13167 err = -EBUSY;
13168 goto reply;
13169 }
13170 // go
13171 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13172 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13173 if (np->tiers.count(tierpool_id) == 0 ||
13174 ntp->tier_of != pool_id ||
13175 np->read_tier == tierpool_id) {
13176 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13177 return true;
13178 }
13179 np->tiers.erase(tierpool_id);
13180 ntp->clear_tier();
13181 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
13182 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13183 get_last_committed() + 1));
13184 return true;
13185 } else if (prefix == "osd tier set-overlay") {
13186 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13187 if (err == -EAGAIN)
13188 goto wait;
13189 if (err)
13190 goto reply;
13191 string poolstr;
9f95a23c 13192 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
13193 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13194 if (pool_id < 0) {
13195 ss << "unrecognized pool '" << poolstr << "'";
13196 err = -ENOENT;
13197 goto reply;
13198 }
13199 string overlaypoolstr;
9f95a23c 13200 cmd_getval(cmdmap, "overlaypool", overlaypoolstr);
7c673cae
FG
13201 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
13202 if (overlaypool_id < 0) {
13203 ss << "unrecognized pool '" << overlaypoolstr << "'";
13204 err = -ENOENT;
13205 goto reply;
13206 }
13207 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 13208 ceph_assert(p);
7c673cae 13209 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
11fdf7f2 13210 ceph_assert(overlay_p);
7c673cae
FG
13211 if (p->tiers.count(overlaypool_id) == 0) {
13212 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
13213 err = -EINVAL;
13214 goto reply;
13215 }
13216 if (p->read_tier == overlaypool_id) {
13217 err = 0;
13218 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
13219 goto reply;
13220 }
13221 if (p->has_read_tier()) {
13222 ss << "pool '" << poolstr << "' has overlay '"
13223 << osdmap.get_pool_name(p->read_tier)
13224 << "'; please remove-overlay first";
13225 err = -EINVAL;
13226 goto reply;
13227 }
13228
13229 // go
13230 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13231 np->read_tier = overlaypool_id;
13232 np->write_tier = overlaypool_id;
13233 np->set_last_force_op_resend(pending_inc.epoch);
13234 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
13235 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
13236 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
13237 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
13238 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
13239 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13240 get_last_committed() + 1));
13241 return true;
13242 } else if (prefix == "osd tier remove-overlay" ||
13243 prefix == "osd tier rm-overlay") {
13244 string poolstr;
9f95a23c 13245 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
13246 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13247 if (pool_id < 0) {
13248 ss << "unrecognized pool '" << poolstr << "'";
13249 err = -ENOENT;
13250 goto reply;
13251 }
13252 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 13253 ceph_assert(p);
7c673cae
FG
13254 if (!p->has_read_tier()) {
13255 err = 0;
13256 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
13257 goto reply;
13258 }
13259
13260 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
13261 goto reply;
13262 }
13263
13264 // go
13265 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13266 if (np->has_read_tier()) {
13267 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
13268 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
13269 nop->set_last_force_op_resend(pending_inc.epoch);
13270 }
13271 if (np->has_write_tier()) {
13272 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
13273 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
13274 nop->set_last_force_op_resend(pending_inc.epoch);
13275 }
13276 np->clear_read_tier();
13277 np->clear_write_tier();
13278 np->set_last_force_op_resend(pending_inc.epoch);
13279 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
13280 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13281 get_last_committed() + 1));
13282 return true;
13283 } else if (prefix == "osd tier cache-mode") {
13284 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13285 if (err == -EAGAIN)
13286 goto wait;
13287 if (err)
13288 goto reply;
13289 string poolstr;
9f95a23c 13290 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
13291 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13292 if (pool_id < 0) {
13293 ss << "unrecognized pool '" << poolstr << "'";
13294 err = -ENOENT;
13295 goto reply;
13296 }
13297 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 13298 ceph_assert(p);
7c673cae
FG
13299 if (!p->is_tier()) {
13300 ss << "pool '" << poolstr << "' is not a tier";
13301 err = -EINVAL;
13302 goto reply;
13303 }
13304 string modestr;
9f95a23c 13305 cmd_getval(cmdmap, "mode", modestr);
7c673cae 13306 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
9f95a23c 13307 if (int(mode) < 0) {
7c673cae
FG
13308 ss << "'" << modestr << "' is not a valid cache mode";
13309 err = -EINVAL;
13310 goto reply;
13311 }
13312
11fdf7f2 13313 bool sure = false;
9f95a23c 13314 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11fdf7f2 13315
9f95a23c
TL
13316 if (mode == pg_pool_t::CACHEMODE_FORWARD ||
13317 mode == pg_pool_t::CACHEMODE_READFORWARD) {
13318 ss << "'" << modestr << "' is no longer a supported cache mode";
13319 err = -EPERM;
13320 goto reply;
13321 }
7c673cae
FG
13322 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13323 mode != pg_pool_t::CACHEMODE_NONE &&
13324 mode != pg_pool_t::CACHEMODE_PROXY &&
13325 mode != pg_pool_t::CACHEMODE_READPROXY) &&
11fdf7f2 13326 !sure) {
7c673cae
FG
13327 ss << "'" << modestr << "' is not a well-supported cache mode and may "
13328 << "corrupt your data. pass --yes-i-really-mean-it to force.";
13329 err = -EPERM;
13330 goto reply;
13331 }
13332
13333 // pool already has this cache-mode set and there are no pending changes
13334 if (p->cache_mode == mode &&
13335 (pending_inc.new_pools.count(pool_id) == 0 ||
13336 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
13337 ss << "set cache-mode for pool '" << poolstr << "'"
13338 << " to " << pg_pool_t::get_cache_mode_name(mode);
13339 err = 0;
13340 goto reply;
13341 }
13342
13343 /* Mode description:
13344 *
13345 * none: No cache-mode defined
9f95a23c 13346 * forward: Forward all reads and writes to base pool [removed]
7c673cae
FG
13347 * writeback: Cache writes, promote reads from base pool
13348 * readonly: Forward writes to base pool
9f95a23c 13349 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
7c673cae
FG
13350 * proxy: Proxy all reads and writes to base pool
13351 * readproxy: Writes are in writeback mode, Reads are in proxy mode
13352 *
13353 * Hence, these are the allowed transitions:
13354 *
13355 * none -> any
13356 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
9f95a23c 13357 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
7c673cae 13358 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
9f95a23c
TL
13359 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
13360 * writeback -> readproxy || proxy
7c673cae
FG
13361 * readonly -> any
13362 */
13363
13364 // We check if the transition is valid against the current pool mode, as
13365 // it is the only committed state thus far. We will blantly squash
13366 // whatever mode is on the pending state.
13367
13368 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
9f95a23c 13369 (mode != pg_pool_t::CACHEMODE_PROXY &&
7c673cae
FG
13370 mode != pg_pool_t::CACHEMODE_READPROXY)) {
13371 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
13372 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
13373 << "' pool; only '"
7c673cae
FG
13374 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
13375 << "' allowed.";
13376 err = -EINVAL;
13377 goto reply;
13378 }
13379 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
13380 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
7c673cae
FG
13381 mode != pg_pool_t::CACHEMODE_PROXY &&
13382 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13383
13384 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
13385 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
7c673cae
FG
13386 mode != pg_pool_t::CACHEMODE_PROXY)) ||
13387
13388 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
13389 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
7c673cae
FG
13390 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13391
13392 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
13393 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
7c673cae
FG
13394 mode != pg_pool_t::CACHEMODE_PROXY &&
13395 mode != pg_pool_t::CACHEMODE_READPROXY))) {
13396
31f18b77 13397 const pool_stat_t* pstats =
f67539c2 13398 mon.mgrstatmon()->get_pool_stat(pool_id);
7c673cae 13399
31f18b77 13400 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
7c673cae
FG
13401 ss << "unable to set cache-mode '"
13402 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
13403 << "': dirty objects found";
13404 err = -EBUSY;
13405 goto reply;
13406 }
13407 }
13408 // go
13409 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13410 np->cache_mode = mode;
13411 // set this both when moving to and from cache_mode NONE. this is to
13412 // capture legacy pools that were set up before this flag existed.
13413 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
13414 ss << "set cache-mode for pool '" << poolstr
13415 << "' to " << pg_pool_t::get_cache_mode_name(mode);
13416 if (mode == pg_pool_t::CACHEMODE_NONE) {
13417 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
11fdf7f2 13418 ceph_assert(base_pool);
7c673cae
FG
13419 if (base_pool->read_tier == pool_id ||
13420 base_pool->write_tier == pool_id)
13421 ss <<" (WARNING: pool is still configured as read or write tier)";
13422 }
13423 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13424 get_last_committed() + 1));
13425 return true;
13426 } else if (prefix == "osd tier add-cache") {
13427 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13428 if (err == -EAGAIN)
13429 goto wait;
13430 if (err)
13431 goto reply;
13432 string poolstr;
9f95a23c 13433 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
13434 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13435 if (pool_id < 0) {
13436 ss << "unrecognized pool '" << poolstr << "'";
13437 err = -ENOENT;
13438 goto reply;
13439 }
13440 string tierpoolstr;
9f95a23c 13441 cmd_getval(cmdmap, "tierpool", tierpoolstr);
7c673cae
FG
13442 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13443 if (tierpool_id < 0) {
13444 ss << "unrecognized pool '" << tierpoolstr << "'";
13445 err = -ENOENT;
13446 goto reply;
13447 }
13448 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 13449 ceph_assert(p);
7c673cae 13450 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11fdf7f2 13451 ceph_assert(tp);
7c673cae
FG
13452
13453 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13454 goto reply;
13455 }
13456
13457 int64_t size = 0;
9f95a23c 13458 if (!cmd_getval(cmdmap, "size", size)) {
7c673cae 13459 ss << "unable to parse 'size' value '"
11fdf7f2 13460 << cmd_vartype_stringify(cmdmap.at("size")) << "'";
7c673cae
FG
13461 err = -EINVAL;
13462 goto reply;
13463 }
13464 // make sure new tier is empty
31f18b77 13465 const pool_stat_t *pstats =
f67539c2 13466 mon.mgrstatmon()->get_pool_stat(tierpool_id);
31f18b77 13467 if (pstats && pstats->stats.sum.num_objects != 0) {
7c673cae
FG
13468 ss << "tier pool '" << tierpoolstr << "' is not empty";
13469 err = -ENOTEMPTY;
13470 goto reply;
13471 }
11fdf7f2 13472 auto& modestr = g_conf().get_val<string>("osd_tier_default_cache_mode");
7c673cae 13473 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
9f95a23c 13474 if (int(mode) < 0) {
7c673cae
FG
13475 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
13476 err = -EINVAL;
13477 goto reply;
13478 }
13479 HitSet::Params hsp;
11fdf7f2
TL
13480 auto& cache_hit_set_type =
13481 g_conf().get_val<string>("osd_tier_default_cache_hit_set_type");
13482 if (cache_hit_set_type == "bloom") {
7c673cae 13483 BloomHitSet::Params *bsp = new BloomHitSet::Params;
11fdf7f2 13484 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
7c673cae 13485 hsp = HitSet::Params(bsp);
11fdf7f2 13486 } else if (cache_hit_set_type == "explicit_hash") {
7c673cae 13487 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
11fdf7f2 13488 } else if (cache_hit_set_type == "explicit_object") {
7c673cae
FG
13489 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
13490 } else {
11fdf7f2
TL
13491 ss << "osd tier cache default hit set type '"
13492 << cache_hit_set_type << "' is not a known type";
7c673cae
FG
13493 err = -EINVAL;
13494 goto reply;
13495 }
13496 // go
13497 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13498 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13499 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13500 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13501 return true;
13502 }
13503 np->tiers.insert(tierpool_id);
13504 np->read_tier = np->write_tier = tierpool_id;
13505 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13506 np->set_last_force_op_resend(pending_inc.epoch);
13507 ntp->set_last_force_op_resend(pending_inc.epoch);
13508 ntp->tier_of = pool_id;
13509 ntp->cache_mode = mode;
11fdf7f2
TL
13510 ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count");
13511 ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period");
13512 ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13513 ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13514 ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13515 ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
7c673cae
FG
13516 ntp->hit_set_params = hsp;
13517 ntp->target_max_bytes = size;
13518 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
13519 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13520 get_last_committed() + 1));
13521 return true;
13522 } else if (prefix == "osd pool set-quota") {
13523 string poolstr;
9f95a23c 13524 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
13525 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13526 if (pool_id < 0) {
13527 ss << "unrecognized pool '" << poolstr << "'";
13528 err = -ENOENT;
13529 goto reply;
13530 }
13531
13532 string field;
9f95a23c 13533 cmd_getval(cmdmap, "field", field);
7c673cae
FG
13534 if (field != "max_objects" && field != "max_bytes") {
13535 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
13536 err = -EINVAL;
13537 goto reply;
13538 }
13539
13540 // val could contain unit designations, so we treat as a string
13541 string val;
9f95a23c 13542 cmd_getval(cmdmap, "val", val);
1adf2230
AA
13543 string tss;
13544 int64_t value;
13545 if (field == "max_objects") {
13546 value = strict_sistrtoll(val.c_str(), &tss);
13547 } else if (field == "max_bytes") {
13548 value = strict_iecstrtoll(val.c_str(), &tss);
13549 } else {
11fdf7f2 13550 ceph_abort_msg("unrecognized option");
1adf2230
AA
13551 }
13552 if (!tss.empty()) {
13553 ss << "error parsing value '" << val << "': " << tss;
13554 err = -EINVAL;
7c673cae
FG
13555 goto reply;
13556 }
13557
13558 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
13559 if (field == "max_objects") {
13560 pi->quota_max_objects = value;
13561 } else if (field == "max_bytes") {
13562 pi->quota_max_bytes = value;
13563 } else {
11fdf7f2 13564 ceph_abort_msg("unrecognized option");
7c673cae
FG
13565 }
13566 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
13567 rs = ss.str();
13568 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13569 get_last_committed() + 1));
13570 return true;
c07f9fc5
FG
13571 } else if (prefix == "osd pool application enable" ||
13572 prefix == "osd pool application disable" ||
13573 prefix == "osd pool application set" ||
13574 prefix == "osd pool application rm") {
13575 err = prepare_command_pool_application(prefix, cmdmap, ss);
11fdf7f2 13576 if (err == -EAGAIN) {
c07f9fc5 13577 goto wait;
11fdf7f2 13578 } else if (err < 0) {
7c673cae 13579 goto reply;
7c673cae 13580 } else {
11fdf7f2 13581 goto update;
7c673cae 13582 }
c07f9fc5
FG
13583 } else if (prefix == "osd force-create-pg") {
13584 pg_t pgid;
13585 string pgidstr;
9f95a23c 13586 cmd_getval(cmdmap, "pgid", pgidstr);
c07f9fc5
FG
13587 if (!pgid.parse(pgidstr.c_str())) {
13588 ss << "invalid pgid '" << pgidstr << "'";
13589 err = -EINVAL;
13590 goto reply;
13591 }
94b18763
FG
13592 if (!osdmap.pg_exists(pgid)) {
13593 ss << "pg " << pgid << " should not exist";
13594 err = -ENOENT;
13595 goto reply;
13596 }
11fdf7f2 13597 bool sure = false;
9f95a23c 13598 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11fdf7f2
TL
13599 if (!sure) {
13600 ss << "This command will recreate a lost (as in data lost) PG with data in it, such "
13601 << "that the cluster will give up ever trying to recover the lost data. Do this "
13602 << "only if you are certain that all copies of the PG are in fact lost and you are "
13603 << "willing to accept that the data is permanently destroyed. Pass "
13604 << "--yes-i-really-mean-it to proceed.";
13605 err = -EPERM;
13606 goto reply;
13607 }
c07f9fc5
FG
13608 bool creating_now;
13609 {
13610 std::lock_guard<std::mutex> l(creating_pgs_lock);
9f95a23c
TL
13611 auto emplaced = creating_pgs.pgs.emplace(
13612 pgid,
13613 creating_pgs_t::pg_create_info(osdmap.get_epoch(),
13614 ceph_clock_now()));
c07f9fc5
FG
13615 creating_now = emplaced.second;
13616 }
13617 if (creating_now) {
13618 ss << "pg " << pgidstr << " now creating, ok";
11fdf7f2
TL
13619 // set the pool's CREATING flag so that (1) the osd won't ignore our
13620 // create message and (2) we won't propose any future pg_num changes
13621 // until after the PG has been instantiated.
13622 if (pending_inc.new_pools.count(pgid.pool()) == 0) {
13623 pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool());
13624 }
13625 pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING;
c07f9fc5
FG
13626 err = 0;
13627 goto update;
13628 } else {
13629 ss << "pg " << pgid << " already creating";
13630 err = 0;
13631 goto reply;
13632 }
f67539c2
TL
13633 } else if (prefix == "osd force_healthy_stretch_mode") {
13634 bool sure = false;
13635 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13636 if (!sure) {
13637 ss << "This command will require peering across multiple CRUSH buckets "
13638 "(probably two data centers or availability zones?) and may result in PGs "
13639 "going inactive until backfilling is complete. Pass --yes-i-really-mean-it to proceed.";
13640 err = -EPERM;
13641 goto reply;
13642 }
13643 try_end_recovery_stretch_mode(true);
13644 ss << "Triggering healthy stretch mode";
13645 err = 0;
13646 goto reply;
13647 } else if (prefix == "osd force_recovery_stretch_mode") {
13648 bool sure = false;
13649 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13650 if (!sure) {
13651 ss << "This command will increase pool sizes to try and spread them "
13652 "across multiple CRUSH buckets (probably two data centers or "
13653 "availability zones?) and should have happened automatically"
13654 "Pass --yes-i-really-mean-it to proceed.";
13655 err = -EPERM;
13656 goto reply;
13657 }
13658 mon.go_recovery_stretch_mode();
13659 ss << "Triggering recovery stretch mode";
13660 err = 0;
13661 goto reply;
7c673cae
FG
13662 } else {
13663 err = -EINVAL;
13664 }
13665
13666 reply:
13667 getline(ss, rs);
13668 if (err < 0 && rs.length() == 0)
13669 rs = cpp_strerror(err);
f67539c2 13670 mon.reply_command(op, err, rs, rdata, get_last_committed());
7c673cae
FG
13671 return ret;
13672
13673 update:
13674 getline(ss, rs);
13675 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13676 get_last_committed() + 1));
13677 return true;
13678
13679 wait:
13680 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13681 return true;
13682}
13683
// Verify the requesting session holds sufficient caps for this pool op.
// Returns true if the request was rejected (an -EPERM reply has already been
// sent and the op is consumed); false if the caller should keep processing.
bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);

  auto m = op->get_req<MPoolOp>();
  MonSession *session = op->get_session();
  if (!session) {
    // no session attached: we cannot evaluate caps, so reject outright
    _pool_op_reply(op, -EPERM, osdmap.get_epoch());
    return true;
  }

  switch (m->op) {
  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    {
      // unmanaged-snap ops get a finer-grained, per-pool permission check;
      // pool_name stays null if the pool id does not resolve
      const std::string* pool_name = nullptr;
      const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
      if (pg_pool != nullptr) {
        pool_name = &osdmap.get_pool_name(m->pool);
      }

      if (!is_unmanaged_snap_op_permitted(cct, mon.key_server,
                                          session->entity_name, session->caps,
                                          session->get_peer_socket_addr(),
                                          pool_name)) {
        dout(0) << "got unmanaged-snap pool op from entity with insufficient "
                << "privileges. message: " << *m << std::endl
                << "caps: " << session->caps << dendl;
        _pool_op_reply(op, -EPERM, osdmap.get_epoch());
        return true;
      }
    }
    break;
  default:
    // all other pool ops only require write caps on the osd service
    if (!session->is_capable("osd", MON_CAP_W)) {
      dout(0) << "got pool op from entity with insufficient privileges. "
              << "message: " << *m << std::endl
              << "caps: " << session->caps << dendl;
      _pool_op_reply(op, -EPERM, osdmap.get_epoch());
      return true;
    }
    break;
  }

  return false;
}
13730
// Read-only fast path for MPoolOp messages.
// Returns true if the op was fully handled here (reply already sent);
// false if it must go through the paxos prepare path (prepare_pool_op).
bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();

  // caps check replies -EPERM itself on failure
  if (enforce_pool_op_caps(op)) {
    return true;
  }

  // drop messages from a foreign cluster
  if (m->fsid != mon.monmap->fsid) {
    dout(0) << __func__ << " drop message on fsid " << m->fsid
            << " != " << mon.monmap->fsid << " for " << *m << dendl;
    _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
    return true;
  }

  if (m->op == POOL_OP_CREATE)
    return preprocess_pool_op_create(op);

  const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
  if (p == nullptr) {
    dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
    // deleting a pool that is already gone is an idempotent success
    if (m->op == POOL_OP_DELETE) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
    } else {
      _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    }
    return true;
  }

  // check if the snap and snapname exist
  bool snap_exists = false;
  if (p->snap_exists(m->name.c_str()))
    snap_exists = true;

  // For each op: reply immediately when the request is invalid or already
  // satisfied in the committed map; otherwise return false to prepare.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // pool snaps are incompatible with unmanaged-snaps mode and with tiers
    if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (snap_exists) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_SNAP:
    if (p->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (!snap_exists) {
      // already deleted: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (_is_removed_snap(m->pool, m->snapid)) {
      // already removed/purged: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE:
    // NOTE(review): this replies success when a pool with m->name still
    // exists, which looks inverted for a delete -- verify against the
    // librados pool-deletion path (m->name may be unset there).
    if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_AUID_CHANGE:
    // handled (rejected) in prepare_pool_op
    return false;
  default:
    ceph_abort();
    break;
  }

  return false;
}
13818
9f95a23c
TL
// Return true if (pool, snap) is already gone as far as the *committed*
// osdmap is concerned: the pool no longer exists, the snap is queued in
// removed_snaps_queue, or the snap was already purged.
bool OSDMonitor::_is_removed_snap(int64_t pool, snapid_t snap)
{
  if (!osdmap.have_pg_pool(pool)) {
    dout(10) << __func__ << " pool " << pool << " snap " << snap
             << " - pool dne" << dendl;
    return true;
  }
  if (osdmap.in_removed_snaps_queue(pool, snap)) {
    dout(10) << __func__ << " pool " << pool << " snap " << snap
             << " - in osdmap removed_snaps_queue" << dendl;
    return true;
  }
  // consult the purged-snaps records; on hit, [begin, end) is the purged
  // interval that contains this snap
  snapid_t begin, end;
  int r = lookup_purged_snap(pool, snap, &begin, &end);
  if (r == 0) {
    dout(10) << __func__ << " pool " << pool << " snap " << snap
             << " - purged, [" << begin << "," << end << ")" << dendl;
    return true;
  }
  return false;
}
13840
// Return true if (pool, snap) is already slated for removal in the
// *pending* (not yet committed) incremental: either the whole pool is
// pending deletion, or the snap is in pending new_removed_snaps.
bool OSDMonitor::_is_pending_removed_snap(int64_t pool, snapid_t snap)
{
  if (pending_inc.old_pools.count(pool)) {
    dout(10) << __func__ << " pool " << pool << " snap " << snap
             << " - pool pending deletion" << dendl;
    return true;
  }
  if (pending_inc.in_new_removed_snaps(pool, snap)) {
    dout(10) << __func__ << " pool " << pool << " snap " << snap
             << " - in pending new_removed_snaps" << dendl;
    return true;
  }
  return false;
}
13855
7c673cae
FG
13856bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
13857{
13858 op->mark_osdmon_event(__func__);
9f95a23c 13859 auto m = op->get_req<MPoolOp>();
7c673cae
FG
13860 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
13861 if (pool >= 0) {
13862 _pool_op_reply(op, 0, osdmap.get_epoch());
13863 return true;
13864 }
13865
13866 return false;
13867}
13868
// Paxos prepare path for MPoolOp messages that were not fully handled in
// preprocess_pool_op. Creates/deletes pool snaps (managed and unmanaged)
// by mutating a projected copy of the pool and staging it in pending_inc.
// Returns true when the op is queued behind a proposal, false when a reply
// was sent immediately.
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();
  dout(10) << "prepare_pool_op " << *m << dendl;
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  // First pass: validate against the *committed* pool state, replying
  // immediately for invalid or already-satisfied requests.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (pool->is_tier()) {
      ret = -EINVAL;
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;
    }  // else, fall through
  case POOL_OP_DELETE_SNAP:
    if (!pool->is_unmanaged_snaps_mode()) {
      bool snap_exists = pool->snap_exists(m->name.c_str());
      // create-existing and delete-nonexistent are idempotent successes
      if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
        || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
        ret = 0;
      } else {
        break;
      }
    } else {
      ret = -EINVAL;
    }
    _pool_op_reply(op, ret, osdmap.get_epoch());
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // we won't allow removal of an unmanaged snapshot from a pool
    // not in unmanaged snaps mode.
    if (!pool->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
      return false;
    }
    /* fall-thru */
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // but we will allow creating an unmanaged snapshot on any pool
    // as long as it is not in 'pool' snaps mode.
    if (pool->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return false;
    }
  }

  // projected pool info: start from any already-pending update so we do not
  // clobber earlier modifications in this proposal
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // pool snaps vs unmanaged snaps are mutually exclusive; re-check against
  // the *projected* state, which may differ from the committed one
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // Second pass: apply the mutation to the projected pool.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name
               << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
        pp.remove_snap(s);
        pending_inc.new_removed_snaps[m->pool].insert(s);
        changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      // pre-octopus clusters need the legacy removed_snaps bookkeeping
      uint64_t snapid = pp.add_unmanaged_snap(
        osdmap.require_osd_release < ceph_release_t::octopus);
      // the new snapid is the reply payload
      encode(snapid, reply_data);
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (!_is_removed_snap(m->pool, m->snapid) &&
        !_is_pending_removed_snap(m->pool, m->snapid)) {
      if (m->snapid > pp.get_snap_seq()) {
        // snapid was never allocated on this pool
        _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
        return false;
      }
      pp.remove_unmanaged_snap(
        m->snapid,
        osdmap.require_osd_release < ceph_release_t::octopus);
      pending_inc.new_removed_snaps[m->pool].insert(m->snapid);
      // also record the new seq as purged: this avoids a discontinuity
      // after all of the snaps have been purged, since the seq assigned
      // during removal lives in the same namespace as the actual snaps.
      pending_pseudo_purged_snaps[m->pool].insert(pp.get_snap_seq());
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    // auid support was removed; always rejected
    _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
    return false;

  default:
    ceph_abort();
    break;
  }

  if (changed) {
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
14023
14024bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
14025{
14026 op->mark_osdmon_event(__func__);
14027 int err = prepare_new_pool(op);
14028 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
14029 return true;
14030}
14031
// Validate that 'pool' (id pool_id) may be deleted.
// Returns 0 if deletion is permitted, or a negative errno (-EBUSY when the
// pool is in use by CephFS or tiering, -EPERM when deletion is disabled by
// config or the nodelete flag). In all cases *ss describes the outcome.
int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
                                   ostream *ss)
{
  const string& poolstr = osdmap.get_pool_name(pool_id);

  // If the Pool is in use by CephFS, refuse to delete it
  FSMap const &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
  if (pending_fsmap.pool_in_use(pool_id)) {
    *ss << "pool '" << poolstr << "' is in use by CephFS";
    return -EBUSY;
  }

  // pools participating in cache tiering must be untiered first
  if (pool.tier_of >= 0) {
    *ss << "pool '" << poolstr << "' is a tier of '"
        << osdmap.get_pool_name(pool.tier_of) << "'";
    return -EBUSY;
  }
  if (!pool.tiers.empty()) {
    *ss << "pool '" << poolstr << "' has tiers";
    for(auto tier : pool.tiers) {
      *ss << " " << osdmap.get_pool_name(tier);
    }
    return -EBUSY;
  }

  // global safety switch
  if (!g_conf()->mon_allow_pool_delete) {
    *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
    return -EPERM;
  }

  // per-pool safety flag
  if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
    *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
    return -EPERM;
  }

  *ss << "pool '" << poolstr << "' removed";
  return 0;
}
14070
/**
 * Check if it is safe to add a tier to a base pool
 *
 * Rejects the tiering change when the prospective tier pool is used by
 * CephFS, when either pool already participates in tiering (only single
 * base->tier relationships are supported), and treats an already-established
 * identical relationship as an idempotent success (*err = 0, return false).
 *
 * @return
 * True if the operation should proceed, false if we should abort here
 * (abort doesn't necessarily mean error, could be idempotency)
 */
bool OSDMonitor::_check_become_tier(
  const int64_t tier_pool_id, const pg_pool_t *tier_pool,
  const int64_t base_pool_id, const pg_pool_t *base_pool,
  int *err,
  ostream *ss) const
{
  const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
  const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);

  // a pool CephFS relies on cannot be repurposed as a cache tier
  const FSMap &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
  if (pending_fsmap.pool_in_use(tier_pool_id)) {
    *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
    *err = -EBUSY;
    return false;
  }

  if (base_pool->tiers.count(tier_pool_id)) {
    // relationship already exists; sanity-check it points back at us
    ceph_assert(tier_pool->tier_of == base_pool_id);
    *err = 0;
    *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
        << base_pool_name << "'";
    return false;
  }

  if (base_pool->is_tier()) {
    *ss << "pool '" << base_pool_name << "' is already a tier of '"
        << osdmap.get_pool_name(base_pool->tier_of) << "', "
        << "multiple tiers are not yet supported.";
    *err = -EINVAL;
    return false;
  }

  if (tier_pool->has_tiers()) {
    *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
    for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
         it != tier_pool->tiers.end(); ++it)
      *ss << "'" << osdmap.get_pool_name(*it) << "',";
    *ss << " multiple tiers are not yet supported.";
    *err = -EINVAL;
    return false;
  }

  if (tier_pool->is_tier()) {
    *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
        << osdmap.get_pool_name(tier_pool->tier_of) << "'";
    *err = -EINVAL;
    return false;
  }

  *err = 0;
  return true;
}
14130
14131
/**
 * Check if it is safe to remove a tier from this base pool
 *
 * Only applies CephFS-specific safety checks: when CephFS uses the base
 * pool, refuse to remove a tier that CephFS depends on either for EC
 * overwrite support or as an active writeback cache.
 *
 * @return
 * True if the operation should proceed, false if we should abort here
 * (abort doesn't necessarily mean error, could be idempotency)
 */
bool OSDMonitor::_check_remove_tier(
  const int64_t base_pool_id, const pg_pool_t *base_pool,
  const pg_pool_t *tier_pool,
  int *err, ostream *ss) const
{
  const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);

  // Apply CephFS-specific checks
  const FSMap &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
  if (pending_fsmap.pool_in_use(base_pool_id)) {
    if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
      // If the underlying pool is erasure coded and does not allow EC
      // overwrites, we can't permit the removal of the replicated tier that
      // CephFS relies on to access it
      *ss << "pool '" << base_pool_name <<
        "' does not allow EC overwrites and is in use by CephFS"
        " via its tier";
      *err = -EBUSY;
      return false;
    }

    if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
      *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
        "tier is still in use as a writeback cache. Change the cache "
        "mode and flush the cache before removing it";
      *err = -EBUSY;
      return false;
    }
  }

  *err = 0;
  return true;
}
14172
// Stage removal of 'pool' in pending_inc, after safety checks.
// With mon_fake_pool_delete set (and !no_fake) the pool is only renamed to
// "<name>.<id>.DELETED". Otherwise the pool is queued for deletion and all
// per-pool mappings (pg_temp, primary_temp, pg_upmap[_items], crush
// choose_args) are scrubbed, both committed and pending.
// Returns 0 on success, -EAGAIN if pending state must settle first, or the
// error from _check_remove_pool.
int OSDMonitor::_prepare_remove_pool(
  int64_t pool, ostream *ss, bool no_fake)
{
  dout(10) << __func__ << " " << pool << dendl;
  const pg_pool_t *p = osdmap.get_pg_pool(pool);
  int r = _check_remove_pool(pool, *p, ss);
  if (r < 0)
    return r;

  auto new_pool = pending_inc.new_pools.find(pool);
  if (new_pool != pending_inc.new_pools.end()) {
    // if there is a problem with the pending info, wait and retry
    // this op.
    const auto& p = new_pool->second;
    int r = _check_remove_pool(pool, p, ss);
    if (r < 0)
      return -EAGAIN;
  }

  if (pending_inc.old_pools.count(pool)) {
    // idempotent: deletion already staged in this proposal
    dout(10) << __func__ << " " << pool << " already pending removal"
             << dendl;
    return 0;
  }

  if (g_conf()->mon_fake_pool_delete && !no_fake) {
    string old_name = osdmap.get_pool_name(pool);
    string new_name = old_name + "." + stringify(pool) + ".DELETED";
    dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
            << old_name << " -> " << new_name << dendl;
    pending_inc.new_pool_names[pool] = new_name;
    return 0;
  }

  // remove
  pending_inc.old_pools.insert(pool);

  // remove any pg_temp mappings for this pool
  for (auto p = osdmap.pg_temp->begin();
       p != osdmap.pg_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
               << p->first << dendl;
      pending_inc.new_pg_temp[p->first].clear();
    }
  }
  // remove any primary_temp mappings for this pool
  for (auto p = osdmap.primary_temp->begin();
       p != osdmap.primary_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete primary_temp" << p->first << dendl;
      pending_inc.new_primary_temp[p->first] = -1;
    }
  }
  // remove any pg_upmap mappings for this pool
  for (auto& p : osdmap.pg_upmap) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap "
               << p.first << dendl;
      pending_inc.old_pg_upmap.insert(p.first);
    }
  }
  // remove any pending pg_upmap mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap.begin();
    while (it != pending_inc.new_pg_upmap.end()) {
      if (it->first.pool() == pool) {
        dout(10) << __func__ << " " << pool
                 << " removing pending pg_upmap "
                 << it->first << dendl;
        it = pending_inc.new_pg_upmap.erase(it);
      } else {
        it++;
      }
    }
  }
  // remove any pg_upmap_items mappings for this pool
  for (auto& p : osdmap.pg_upmap_items) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap_items " << p.first
               << dendl;
      pending_inc.old_pg_upmap_items.insert(p.first);
    }
  }
  // remove any pending pg_upmap mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap_items.begin();
    while (it != pending_inc.new_pg_upmap_items.end()) {
      if (it->first.pool() == pool) {
        dout(10) << __func__ << " " << pool
                 << " removing pending pg_upmap_items "
                 << it->first << dendl;
        it = pending_inc.new_pg_upmap_items.erase(it);
      } else {
        it++;
      }
    }
  }

  // remove any choose_args for this pool
  CrushWrapper newcrush;
  _get_pending_crush(newcrush);
  if (newcrush.have_choose_args(pool)) {
    dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
    newcrush.rm_choose_args(pool);
    // stage a full re-encode of the modified crush map
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
  }
  return 0;
}
14288
14289int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
14290{
14291 dout(10) << "_prepare_rename_pool " << pool << dendl;
14292 if (pending_inc.old_pools.count(pool)) {
14293 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
14294 return -ENOENT;
14295 }
14296 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
14297 p != pending_inc.new_pool_names.end();
14298 ++p) {
14299 if (p->second == newname && p->first != pool) {
14300 return -EEXIST;
14301 }
14302 }
14303
14304 pending_inc.new_pool_names[pool] = newname;
14305 return 0;
14306}
14307
14308bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
14309{
14310 op->mark_osdmon_event(__func__);
9f95a23c 14311 auto m = op->get_req<MPoolOp>();
7c673cae
FG
14312 ostringstream ss;
14313 int ret = _prepare_remove_pool(m->pool, &ss, false);
14314 if (ret == -EAGAIN) {
14315 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
14316 return true;
14317 }
14318 if (ret < 0)
14319 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
14320 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
14321 pending_inc.epoch));
14322 return true;
14323}
14324
14325void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
14326 int ret, epoch_t epoch, bufferlist *blp)
14327{
14328 op->mark_osdmon_event(__func__);
9f95a23c 14329 auto m = op->get_req<MPoolOp>();
7c673cae
FG
14330 dout(20) << "_pool_op_reply " << ret << dendl;
14331 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
14332 ret, epoch, get_last_committed(), blp);
f67539c2 14333 mon.send_reply(op, reply);
7c673cae 14334}
81eedcae
TL
14335
// Rescale all pools' recovery_priority values into the supported
// [OSD_POOL_PRIORITY_MIN, OSD_POOL_PRIORITY_MAX] range, preserving their
// relative ordering. Pools whose scaled value rounds to 0 get the option
// unset. Changed pools are staged in pending_inc.
void OSDMonitor::convert_pool_priorities(void)
{
  pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key;
  int64_t max_prio = 0;
  int64_t min_prio = 0;
  // first pass: find the extremes currently in use
  for (const auto &i : osdmap.get_pools()) {
    const auto &pool = i.second;

    if (pool.opts.is_set(key)) {
      int64_t prio = 0;
      pool.opts.get(key, &prio);
      if (prio > max_prio)
        max_prio = prio;
      if (prio < min_prio)
        min_prio = prio;
    }
  }
  if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) {
    dout(20) << __func__ << " nothing to fix" << dendl;
    return;
  }
  // Current pool priorities exceeds new maximum
  // second pass: rescale each out-of-range priority proportionally
  for (const auto &i : osdmap.get_pools()) {
    const auto pool_id = i.first;
    pg_pool_t pool = i.second;

    int64_t prio = 0;
    pool.opts.get(key, &prio);
    int64_t n;

    if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario
      // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
      n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX;
    } else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) {
      // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
      n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN;
    } else {
      // priority already in range (or zero/unset); leave it alone
      continue;
    }
    if (n == 0) {
      pool.opts.unset(key);
    } else {
      pool.opts.set(key, static_cast<int64_t>(n));
    }
    dout(10) << __func__ << " pool " << pool_id
             << " recovery_priority adjusted "
             << prio << " to " << n << dendl;
    pool.last_change = pending_inc.epoch;
    pending_inc.new_pools[pool_id] = pool;
  }
}
f67539c2
TL
14387
14388void OSDMonitor::try_enable_stretch_mode_pools(stringstream& ss, bool *okay,
14389 int *errcode,
14390 set<pg_pool_t*>* pools,
14391 const string& new_crush_rule)
14392{
14393 dout(20) << __func__ << dendl;
14394 *okay = false;
14395 int new_crush_rule_result = osdmap.crush->get_rule_id(new_crush_rule);
14396 if (new_crush_rule_result < 0) {
14397 ss << "unrecognized crush rule " << new_crush_rule_result;
14398 *errcode = new_crush_rule_result;
14399 return;
14400 }
14401 __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
14402 for (const auto& pooli : osdmap.pools) {
14403 int64_t poolid = pooli.first;
14404 const pg_pool_t *p = &pooli.second;
14405 if (!p->is_replicated()) {
14406 ss << "stretched pools must be replicated; '" << osdmap.pool_name[poolid] << "' is erasure-coded";
14407 *errcode = -EINVAL;
14408 return;
14409 }
14410 uint8_t default_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
14411 if ((p->get_size() != default_size ||
14412 (p->get_min_size() != g_conf().get_osd_pool_default_min_size(default_size))) &&
14413 (p->get_crush_rule() != new_rule)) {
14414 ss << "we currently require stretch mode pools start out with the"
14415 " default size/min_size, which '" << osdmap.pool_name[poolid] << "' does not";
14416 *errcode = -EINVAL;
14417 return;
14418 }
14419 pg_pool_t *pp = pending_inc.get_new_pool(poolid, p);
14420 // TODO: The part where we unconditionally copy the pools into pending_inc is bad
14421 // the attempt may fail and then we have these pool updates...but they won't do anything
14422 // if there is a failure, so if it's hard to change the interface, no need to bother
14423 pools->insert(pp);
14424 }
14425 *okay = true;
14426 return;
14427}
14428
14429void OSDMonitor::try_enable_stretch_mode(stringstream& ss, bool *okay,
14430 int *errcode, bool commit,
14431 const string& dividing_bucket,
14432 uint32_t bucket_count,
14433 const set<pg_pool_t*>& pools,
14434 const string& new_crush_rule)
14435{
14436 dout(20) << __func__ << dendl;
14437 *okay = false;
14438 CrushWrapper crush;
14439 _get_pending_crush(crush);
14440 int dividing_id;
14441 int retval = crush.get_validated_type_id(dividing_bucket, &dividing_id);
14442 if (retval == -1) {
14443 ss << dividing_bucket << " is not a valid crush bucket type";
14444 *errcode = -ENOENT;
14445 ceph_assert(!commit || retval != -1);
14446 return;
14447 }
14448 vector<int> subtrees;
14449 crush.get_subtree_of_type(dividing_id, &subtrees);
14450 if (subtrees.size() != 2) {
14451 ss << "there are " << subtrees.size() << dividing_bucket
14452 << "'s in the cluster but stretch mode currently only works with 2!";
14453 *errcode = -EINVAL;
14454 ceph_assert(!commit || subtrees.size() == 2);
14455 return;
14456 }
14457
14458 int new_crush_rule_result = crush.get_rule_id(new_crush_rule);
14459 if (new_crush_rule_result < 0) {
14460 ss << "unrecognized crush rule " << new_crush_rule;
14461 *errcode = new_crush_rule_result;
14462 ceph_assert(!commit || (new_crush_rule_result > 0));
14463 return;
14464 }
14465 __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
14466
14467 int weight1 = crush.get_item_weight(subtrees[0]);
14468 int weight2 = crush.get_item_weight(subtrees[1]);
14469 if (weight1 != weight2) {
14470 // TODO: I'm really not sure this is a good idea?
14471 ss << "the 2 " << dividing_bucket
14472 << "instances in the cluster have differing weights "
14473 << weight1 << " and " << weight2
14474 <<" but stretch mode currently requires they be the same!";
14475 *errcode = -EINVAL;
14476 ceph_assert(!commit || (weight1 == weight2));
14477 return;
14478 }
14479 if (bucket_count != 2) {
14480 ss << "currently we only support 2-site stretch clusters!";
14481 *errcode = -EINVAL;
14482 ceph_assert(!commit || bucket_count == 2);
14483 return;
14484 }
14485 // TODO: check CRUSH rules for pools so that we are appropriately divided
14486 if (commit) {
14487 for (auto pool : pools) {
14488 pool->crush_rule = new_rule;
14489 pool->peering_crush_bucket_count = bucket_count;
14490 pool->peering_crush_bucket_target = bucket_count;
14491 pool->peering_crush_bucket_barrier = dividing_id;
14492 pool->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
14493 pool->size = g_conf().get_val<uint64_t>("mon_stretch_pool_size");
14494 pool->min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
14495 }
14496 pending_inc.change_stretch_mode = true;
14497 pending_inc.stretch_mode_enabled = true;
14498 pending_inc.new_stretch_bucket_count = bucket_count;
14499 pending_inc.new_degraded_stretch_mode = 0;
14500 pending_inc.new_stretch_mode_bucket = dividing_id;
14501 }
14502 *okay = true;
14503 return;
14504}
14505
// Given the monitor's view of dead zones (bucket name -> mon names in it),
// determine which CRUSH buckets also have all their OSDs down.
// Fills *really_down_buckets (bucket ids) and *really_down_mons (mon names)
// and returns true if at least one whole subtree is down.
bool OSDMonitor::check_for_dead_crush_zones(const map<string,set<string>>& dead_buckets,
					    set<int> *really_down_buckets,
					    set<string> *really_down_mons)
{
  dout(20) << __func__ << " with dead mon zones " << dead_buckets << dendl;
  ceph_assert(is_readable());
  if (dead_buckets.empty()) return false;
  // memoizes subtree_is_down lookups across buckets
  set<int> down_cache;
  bool really_down = false;
  for (auto dbi : dead_buckets) {
    const string& bucket_name = dbi.first;
    ceph_assert(osdmap.crush->name_exists(bucket_name));
    int bucket_id = osdmap.crush->get_item_id(bucket_name);
    dout(20) << "Checking " << bucket_name << " id " << bucket_id
	     << " to see if OSDs are also down" << dendl;
    bool subtree_down = osdmap.subtree_is_down(bucket_id, &down_cache);
    if (subtree_down) {
      dout(20) << "subtree is down!" << dendl;
      really_down = true;
      really_down_buckets->insert(bucket_id);
      really_down_mons->insert(dbi.second.begin(), dbi.second.end());
    }
  }
  dout(10) << "We determined CRUSH buckets " << *really_down_buckets
	   << " and mons " << *really_down_mons << " are really down" << dendl;
  return really_down;
}
14533
// Enter degraded stretch mode after losing 'dead_buckets': stage the
// map-wide degraded flags and relax every stretch pool's peering
// constraints (bucket count, mandatory surviving member, halved min_size)
// so the surviving site can go active. Proposes the pending map.
// Currently hard-wired to 2-site clusters (exactly one live zone remains).
void OSDMonitor::trigger_degraded_stretch_mode(const set<int>& dead_buckets,
					       const set<string>& live_zones)
{
  dout(20) << __func__ << dendl;
  stretch_recovery_triggered.set_from_double(0); // reset this; we can't go clean now!
  // update the general OSDMap changes
  pending_inc.change_stretch_mode = true;
  pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
  pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
  int new_site_count = osdmap.stretch_bucket_count - dead_buckets.size();
  ceph_assert(new_site_count == 1); // stretch count 2!
  pending_inc.new_degraded_stretch_mode = new_site_count;
  pending_inc.new_recovering_stretch_mode = 0;
  pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;

  // and then apply them to all the pg_pool_ts
  ceph_assert(live_zones.size() == 1); // only support 2 zones now
  const string& remaining_site_name = *(live_zones.begin());
  ceph_assert(osdmap.crush->name_exists(remaining_site_name));
  int remaining_site = osdmap.crush->get_item_id(remaining_site_name);
  for (auto pgi : osdmap.pools) {
    // peering_crush_bucket_count != 0 marks a stretch pool
    if (pgi.second.peering_crush_bucket_count) {
      pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
      newp.peering_crush_bucket_count = new_site_count;
      newp.peering_crush_mandatory_member = remaining_site;
      newp.min_size = pgi.second.min_size / 2; // only support 2 zones now
      newp.last_force_op_resend = pending_inc.epoch;
    }
  }
  propose_pending();
}
14565
// Enter recovery stretch mode (the failed site is back and backfilling):
// stage the recovering flag map-wide and bump last_force_op_resend on every
// stretch pool so clients re-send in-flight ops. Proposes the pending map.
void OSDMonitor::trigger_recovery_stretch_mode()
{
  dout(20) << __func__ << dendl;
  stretch_recovery_triggered.set_from_double(0); // reset this so we don't go full-active prematurely
  pending_inc.change_stretch_mode = true;
  pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
  pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
  pending_inc.new_degraded_stretch_mode = osdmap.degraded_stretch_mode;
  pending_inc.new_recovering_stretch_mode = 1;
  pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;

  for (auto pgi : osdmap.pools) {
    // peering_crush_bucket_count != 0 marks a stretch pool
    if (pgi.second.peering_crush_bucket_count) {
      pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
      newp.last_force_op_resend = pending_inc.epoch;
    }
  }
  propose_pending();
}
14585
b3b6e05e
TL
// Reset the recovery timestamp on entering degraded stretch mode; we
// cannot be on a path to healthy while degraded.
void OSDMonitor::set_degraded_stretch_mode()
{
  stretch_recovery_triggered.set_from_double(0);
}
14590
14591void OSDMonitor::set_recovery_stretch_mode()
14592{
14593 if (stretch_recovery_triggered.is_zero()) {
14594 stretch_recovery_triggered = ceph_clock_now();
14595 }
14596}
14597
14598void OSDMonitor::set_healthy_stretch_mode()
14599{
14600 stretch_recovery_triggered.set_from_double(0);
14601}
14602
f67539c2
TL
14603void OSDMonitor::notify_new_pg_digest()
14604{
14605 dout(20) << __func__ << dendl;
14606 if (!stretch_recovery_triggered.is_zero()) {
14607 try_end_recovery_stretch_mode(false);
14608 }
14609}
14610
14611struct CMonExitRecovery : public Context {
14612 OSDMonitor *m;
14613 bool force;
14614 CMonExitRecovery(OSDMonitor *mon, bool f) : m(mon), force(f) {}
14615 void finish(int r) {
14616 m->try_end_recovery_stretch_mode(force);
14617 }
14618};
14619
// Attempt to leave recovering stretch mode and return to healthy.
// Only acts on the leader while in degraded+recovering stretch mode;
// if the required state isn't readable yet, queues itself to retry via
// CMonExitRecovery.  `force` skips both the minimum-wait and the
// PG-health checks.
void OSDMonitor::try_end_recovery_stretch_mode(bool force)
{
  dout(20) << __func__ << dendl;
  if (!mon.is_leader()) return;
  if (!mon.is_degraded_stretch_mode()) return;
  if (!mon.is_recovering_stretch_mode()) return;
  if (!is_readable()) {
    // Our own paxos state isn't readable; retry when it is.
    wait_for_readable_ctx(new CMonExitRecovery(this, force));
    return;
  }

  // Proceed if forced, or if recovery was stamped (nonzero timestamp) and
  // at least mon_stretch_recovery_min_wait seconds have elapsed since then
  // (now - min_wait > triggered  <=>  elapsed > min_wait).
  if (osdmap.recovering_stretch_mode &&
      ((!stretch_recovery_triggered.is_zero() &&
	ceph_clock_now() - g_conf().get_val<double>("mon_stretch_recovery_min_wait") >
	stretch_recovery_triggered) ||
       force)) {
    if (!mon.mgrstatmon()->is_readable()) {
      // PG stats not readable yet; retry once mgrstat becomes readable.
      mon.mgrstatmon()->wait_for_readable_ctx(new CMonExitRecovery(this, force));
      return;
    }
    const PGMapDigest& pgd = mon.mgrstatmon()->get_digest();
    double misplaced, degraded, inactive, unknown;
    pgd.get_recovery_stats(&misplaced, &degraded, &inactive, &unknown);
    // Note: misplaced PGs are tolerated; only degraded/inactive/unknown
    // block the transition.
    if (force || (degraded == 0.0 && inactive == 0.0 && unknown == 0.0)) {
      // we can exit degraded stretch mode!
      mon.trigger_healthy_stretch_mode();
    }
  }
}
14649
14650void OSDMonitor::trigger_healthy_stretch_mode()
14651{
14652 ceph_assert(is_writeable());
14653 stretch_recovery_triggered.set_from_double(0);
14654 pending_inc.change_stretch_mode = true;
14655 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
14656 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
14657 pending_inc.new_degraded_stretch_mode = 0; // turn off degraded mode...
14658 pending_inc.new_recovering_stretch_mode = 0; //...and recovering mode!
14659 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
14660 for (auto pgi : osdmap.pools) {
14661 if (pgi.second.peering_crush_bucket_count) {
14662 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
14663 newp.peering_crush_bucket_count = osdmap.stretch_bucket_count;
14664 newp.peering_crush_mandatory_member = CRUSH_ITEM_NONE;
14665 newp.min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
14666 newp.last_force_op_resend = pending_inc.epoch;
14667 }
14668 }
14669 propose_pending();
14670}