]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/OSDMonitor.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / mon / OSDMonitor.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19#include <algorithm>
224ce89b 20#include <boost/algorithm/string.hpp>
11fdf7f2 21#include <experimental/iterator>
224ce89b 22#include <locale>
7c673cae
FG
23#include <sstream>
24
31f18b77
FG
25#include "mon/OSDMonitor.h"
26#include "mon/Monitor.h"
27#include "mon/MDSMonitor.h"
31f18b77
FG
28#include "mon/MgrStatMonitor.h"
29#include "mon/AuthMonitor.h"
f67539c2 30#include "mon/KVMonitor.h"
7c673cae 31
31f18b77
FG
32#include "mon/MonitorDBStore.h"
33#include "mon/Session.h"
7c673cae
FG
34
35#include "crush/CrushWrapper.h"
36#include "crush/CrushTester.h"
37#include "crush/CrushTreeDumper.h"
38
39#include "messages/MOSDBeacon.h"
40#include "messages/MOSDFailure.h"
41#include "messages/MOSDMarkMeDown.h"
9f95a23c 42#include "messages/MOSDMarkMeDead.h"
7c673cae
FG
43#include "messages/MOSDFull.h"
44#include "messages/MOSDMap.h"
45#include "messages/MMonGetOSDMap.h"
46#include "messages/MOSDBoot.h"
47#include "messages/MOSDAlive.h"
48#include "messages/MPoolOp.h"
49#include "messages/MPoolOpReply.h"
11fdf7f2 50#include "messages/MOSDPGCreate2.h"
7c673cae
FG
51#include "messages/MOSDPGCreated.h"
52#include "messages/MOSDPGTemp.h"
11fdf7f2 53#include "messages/MOSDPGReadyToMerge.h"
7c673cae
FG
54#include "messages/MMonCommand.h"
55#include "messages/MRemoveSnaps.h"
7c673cae 56#include "messages/MRoute.h"
9f95a23c
TL
57#include "messages/MMonGetPurgedSnaps.h"
58#include "messages/MMonGetPurgedSnapsReply.h"
7c673cae
FG
59
60#include "common/TextTable.h"
61#include "common/Timer.h"
62#include "common/ceph_argparse.h"
63#include "common/perf_counters.h"
eafe8130 64#include "common/PriorityCache.h"
7c673cae 65#include "common/strtol.h"
11fdf7f2 66#include "common/numa.h"
7c673cae
FG
67
68#include "common/config.h"
69#include "common/errno.h"
70
71#include "erasure-code/ErasureCodePlugin.h"
72#include "compressor/Compressor.h"
73#include "common/Checksummer.h"
74
75#include "include/compat.h"
11fdf7f2 76#include "include/ceph_assert.h"
7c673cae
FG
77#include "include/stringify.h"
78#include "include/util.h"
79#include "common/cmdparse.h"
80#include "include/str_list.h"
81#include "include/str_map.h"
224ce89b 82#include "include/scope_guard.h"
eafe8130 83#include "perfglue/heap_profiler.h"
7c673cae 84
28e407b8
AA
85#include "auth/cephx/CephxKeyServer.h"
86#include "osd/OSDCap.h"
87
7c673cae
FG
88#include "json_spirit/json_spirit_reader.h"
89
c07f9fc5
FG
90#include <boost/algorithm/string/predicate.hpp>
91
f67539c2
TL
92using std::dec;
93using std::hex;
94using std::list;
95using std::map;
96using std::make_pair;
97using std::ostringstream;
98using std::pair;
99using std::set;
100using std::string;
101using std::stringstream;
102using std::to_string;
103using std::vector;
104
105using ceph::bufferlist;
106using ceph::decode;
107using ceph::encode;
108using ceph::ErasureCodeInterfaceRef;
109using ceph::ErasureCodePluginRegistry;
110using ceph::ErasureCodeProfile;
111using ceph::Formatter;
112using ceph::JSONFormatter;
113using ceph::make_message;
114
7c673cae 115#define dout_subsys ceph_subsys_mon
3efd9988
FG
116static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
117static const string OSD_METADATA_PREFIX("osd_metadata");
11fdf7f2 118static const string OSD_SNAP_PREFIX("osd_snap");
7c673cae 119
9f95a23c
TL
120/*
121
122 OSD snapshot metadata
123 ---------------------
124
125 -- starting with mimic, removed in octopus --
126
127 "removed_epoch_%llu_%08lx" % (pool, epoch)
128 -> interval_set<snapid_t>
129
130 "removed_snap_%llu_%016llx" % (pool, last_snap)
131 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
132
133
134 -- starting with mimic --
135
136 "purged_snap_%llu_%016llx" % (pool, last_snap)
137 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
138
139 - note that the {removed,purged}_snap put the last snap in they key so
140 that we can use forward iteration only to search for an epoch in an
141 interval. e.g., to test if epoch N is removed/purged, we'll find a key
142 >= N that either does or doesn't contain the given snap.
143
144
145 -- starting with octopus --
146
147 "purged_epoch_%08lx" % epoch
148 -> map<int64_t,interval_set<snapid_t>>
149
150 */
151using namespace TOPNSPC::common;
c07f9fc5
FG
152namespace {
153
eafe8130
TL
154struct OSDMemCache : public PriorityCache::PriCache {
155 OSDMonitor *osdmon;
156 int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
157 int64_t committed_bytes = 0;
158 double cache_ratio = 0;
159
160 OSDMemCache(OSDMonitor *m) : osdmon(m) {};
161
162 virtual uint64_t _get_used_bytes() const = 0;
163
164 virtual int64_t request_cache_bytes(
165 PriorityCache::Priority pri, uint64_t total_cache) const {
166 int64_t assigned = get_cache_bytes(pri);
167
168 switch (pri) {
169 // All cache items are currently set to have PRI1 priority
170 case PriorityCache::Priority::PRI1:
171 {
172 int64_t request = _get_used_bytes();
173 return (request > assigned) ? request - assigned : 0;
174 }
175 default:
176 break;
177 }
178 return -EOPNOTSUPP;
179 }
180
181 virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
182 return cache_bytes[pri];
183 }
184
185 virtual int64_t get_cache_bytes() const {
186 int64_t total = 0;
187
188 for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
189 PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
190 total += get_cache_bytes(pri);
191 }
192 return total;
193 }
194
195 virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
196 cache_bytes[pri] = bytes;
197 }
198 virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
199 cache_bytes[pri] += bytes;
200 }
201 virtual int64_t commit_cache_size(uint64_t total_cache) {
202 committed_bytes = PriorityCache::get_chunk(
203 get_cache_bytes(), total_cache);
204 return committed_bytes;
205 }
206 virtual int64_t get_committed_size() const {
207 return committed_bytes;
208 }
209 virtual double get_cache_ratio() const {
210 return cache_ratio;
211 }
212 virtual void set_cache_ratio(double ratio) {
213 cache_ratio = ratio;
214 }
20effc67
TL
215 virtual void shift_bins() {
216 }
217 virtual void import_bins(const std::vector<uint64_t> &bins) {
218 }
219 virtual void set_bins(PriorityCache::Priority pri, uint64_t end_bin) {
220 }
221 virtual uint64_t get_bins(PriorityCache::Priority pri) const {
222 return 0;
223 }
224
eafe8130
TL
225 virtual string get_cache_name() const = 0;
226};
227
228struct IncCache : public OSDMemCache {
229 IncCache(OSDMonitor *m) : OSDMemCache(m) {};
230
231 virtual uint64_t _get_used_bytes() const {
232 return osdmon->inc_osd_cache.get_bytes();
233 }
234
235 virtual string get_cache_name() const {
236 return "OSDMap Inc Cache";
237 }
238
239 uint64_t _get_num_osdmaps() const {
240 return osdmon->inc_osd_cache.get_size();
241 }
242};
243
244struct FullCache : public OSDMemCache {
245 FullCache(OSDMonitor *m) : OSDMemCache(m) {};
246
247 virtual uint64_t _get_used_bytes() const {
248 return osdmon->full_osd_cache.get_bytes();
249 }
250
251 virtual string get_cache_name() const {
252 return "OSDMap Full Cache";
253 }
254
255 uint64_t _get_num_osdmaps() const {
256 return osdmon->full_osd_cache.get_size();
257 }
258};
259
260std::shared_ptr<IncCache> inc_cache;
261std::shared_ptr<FullCache> full_cache;
262
c07f9fc5
FG
263const uint32_t MAX_POOL_APPLICATIONS = 4;
264const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
265const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
266
28e407b8
AA
267bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
268 // Note: this doesn't include support for the application tag match
269 if ((grant.spec.allow & OSD_CAP_W) != 0) {
270 auto& match = grant.match;
271 if (match.is_match_all()) {
272 return true;
11fdf7f2 273 } else if (pool_name != nullptr &&
28e407b8
AA
274 !match.pool_namespace.pool_name.empty() &&
275 match.pool_namespace.pool_name == *pool_name) {
276 return true;
277 }
278 }
279 return false;
280}
281
282bool is_unmanaged_snap_op_permitted(CephContext* cct,
283 const KeyServer& key_server,
284 const EntityName& entity_name,
285 const MonCap& mon_caps,
11fdf7f2 286 const entity_addr_t& peer_socket_addr,
28e407b8
AA
287 const std::string* pool_name)
288{
289 typedef std::map<std::string, std::string> CommandArgs;
290
11fdf7f2 291 if (mon_caps.is_capable(
92f5a8d4 292 cct, entity_name, "osd",
11fdf7f2
TL
293 "osd pool op unmanaged-snap",
294 (pool_name == nullptr ?
295 CommandArgs{} /* pool DNE, require unrestricted cap */ :
296 CommandArgs{{"poolname", *pool_name}}),
297 false, true, false,
298 peer_socket_addr)) {
28e407b8
AA
299 return true;
300 }
301
302 AuthCapsInfo caps_info;
303 if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
304 caps_info)) {
305 dout(10) << "unable to locate OSD cap data for " << entity_name
306 << " in auth db" << dendl;
307 return false;
308 }
309
310 string caps_str;
311 if (caps_info.caps.length() > 0) {
11fdf7f2 312 auto p = caps_info.caps.cbegin();
28e407b8
AA
313 try {
314 decode(caps_str, p);
f67539c2 315 } catch (const ceph::buffer::error &err) {
28e407b8
AA
316 derr << "corrupt OSD cap data for " << entity_name << " in auth db"
317 << dendl;
318 return false;
319 }
320 }
321
322 OSDCap osd_cap;
323 if (!osd_cap.parse(caps_str, nullptr)) {
324 dout(10) << "unable to parse OSD cap data for " << entity_name
325 << " in auth db" << dendl;
326 return false;
327 }
328
329 // if the entity has write permissions in one or all pools, permit
330 // usage of unmanaged-snapshots
331 if (osd_cap.allow_all()) {
332 return true;
333 }
334
335 for (auto& grant : osd_cap.grants) {
336 if (grant.profile.is_valid()) {
337 for (auto& profile_grant : grant.profile_grants) {
338 if (is_osd_writable(profile_grant, pool_name)) {
339 return true;
340 }
341 }
342 } else if (is_osd_writable(grant, pool_name)) {
343 return true;
344 }
345 }
346
347 return false;
348}
349
c07f9fc5
FG
350} // anonymous namespace
351
522d829b
TL
352void LastEpochClean::Lec::report(unsigned pg_num, ps_t ps,
353 epoch_t last_epoch_clean)
7c673cae 354{
522d829b
TL
355 if (ps >= pg_num) {
356 // removed PG
357 return;
7c673cae 358 }
522d829b 359 epoch_by_pg.resize(pg_num, 0);
7c673cae
FG
360 const auto old_lec = epoch_by_pg[ps];
361 if (old_lec >= last_epoch_clean) {
362 // stale lec
363 return;
364 }
365 epoch_by_pg[ps] = last_epoch_clean;
366 if (last_epoch_clean < floor) {
367 floor = last_epoch_clean;
368 } else if (last_epoch_clean > floor) {
369 if (old_lec == floor) {
370 // probably should increase floor?
371 auto new_floor = std::min_element(std::begin(epoch_by_pg),
372 std::end(epoch_by_pg));
373 floor = *new_floor;
374 }
375 }
376 if (ps != next_missing) {
377 return;
378 }
379 for (; next_missing < epoch_by_pg.size(); next_missing++) {
380 if (epoch_by_pg[next_missing] == 0) {
381 break;
382 }
383 }
384}
385
386void LastEpochClean::remove_pool(uint64_t pool)
387{
388 report_by_pool.erase(pool);
389}
390
522d829b
TL
391void LastEpochClean::report(unsigned pg_num, const pg_t& pg,
392 epoch_t last_epoch_clean)
7c673cae
FG
393{
394 auto& lec = report_by_pool[pg.pool()];
522d829b 395 return lec.report(pg_num, pg.ps(), last_epoch_clean);
7c673cae
FG
396}
397
398epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
399{
400 auto floor = latest.get_epoch();
401 for (auto& pool : latest.get_pools()) {
402 auto reported = report_by_pool.find(pool.first);
403 if (reported == report_by_pool.end()) {
404 return 0;
405 }
406 if (reported->second.next_missing < pool.second.get_pg_num()) {
407 return 0;
408 }
409 if (reported->second.floor < floor) {
410 floor = reported->second.floor;
411 }
412 }
413 return floor;
414}
415
1911f103
TL
416void LastEpochClean::dump(Formatter *f) const
417{
418 f->open_array_section("per_pool");
419
f67539c2 420 for (auto& [pool, lec] : report_by_pool) {
1911f103 421 f->open_object_section("pool");
f67539c2
TL
422 f->dump_unsigned("poolid", pool);
423 f->dump_unsigned("floor", lec.floor);
1911f103
TL
424 f->close_section();
425 }
426
427 f->close_section();
428}
7c673cae 429
11fdf7f2
TL
430class C_UpdateCreatingPGs : public Context {
431public:
7c673cae
FG
432 OSDMonitor *osdmon;
433 utime_t start;
434 epoch_t epoch;
435 C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
436 osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
437 void finish(int r) override {
438 if (r >= 0) {
439 utime_t end = ceph_clock_now();
440 dout(10) << "osdmap epoch " << epoch << " mapping took "
441 << (end - start) << " seconds" << dendl;
442 osdmon->update_creating_pgs();
443 osdmon->check_pg_creates_subs();
444 }
445 }
446};
447
448#undef dout_prefix
449#define dout_prefix _prefix(_dout, mon, osdmap)
f67539c2
TL
450static ostream& _prefix(std::ostream *_dout, Monitor &mon, const OSDMap& osdmap) {
451 return *_dout << "mon." << mon.name << "@" << mon.rank
452 << "(" << mon.get_state_name()
7c673cae
FG
453 << ").osd e" << osdmap.get_epoch() << " ";
454}
455
456OSDMonitor::OSDMonitor(
457 CephContext *cct,
f67539c2
TL
458 Monitor &mn,
459 Paxos &p,
7c673cae
FG
460 const string& service_name)
461 : PaxosService(mn, p, service_name),
462 cct(cct),
11fdf7f2
TL
463 inc_osd_cache(g_conf()->mon_osd_cache_size),
464 full_osd_cache(g_conf()->mon_osd_cache_size),
465 has_osdmap_manifest(false),
f67539c2 466 mapper(mn.cct, &mn.cpu_tp)
eafe8130
TL
467{
468 inc_cache = std::make_shared<IncCache>(this);
469 full_cache = std::make_shared<FullCache>(this);
470 cct->_conf.add_observer(this);
471 int r = _set_cache_sizes();
472 if (r < 0) {
473 derr << __func__ << " using default osd cache size - mon_osd_cache_size ("
474 << g_conf()->mon_osd_cache_size
475 << ") without priority cache management"
476 << dendl;
477 }
478}
479
480const char **OSDMonitor::get_tracked_conf_keys() const
481{
482 static const char* KEYS[] = {
483 "mon_memory_target",
484 "mon_memory_autotune",
485 "rocksdb_cache_size",
486 NULL
487 };
488 return KEYS;
489}
490
491void OSDMonitor::handle_conf_change(const ConfigProxy& conf,
492 const std::set<std::string> &changed)
493{
494 dout(10) << __func__ << " " << changed << dendl;
495
496 if (changed.count("mon_memory_autotune")) {
497 _set_cache_autotuning();
498 }
499 if (changed.count("mon_memory_target") ||
500 changed.count("rocksdb_cache_size")) {
501 int r = _update_mon_cache_settings();
502 if (r < 0) {
503 derr << __func__ << " mon_memory_target:"
504 << g_conf()->mon_memory_target
505 << " rocksdb_cache_size:"
506 << g_conf()->rocksdb_cache_size
92f5a8d4 507 << ". Unable to update cache size."
eafe8130
TL
508 << dendl;
509 }
510 }
511}
512
513void OSDMonitor::_set_cache_autotuning()
514{
515 if (!g_conf()->mon_memory_autotune && pcm != nullptr) {
516 // Disable cache autotuning
517 std::lock_guard l(balancer_lock);
518 pcm = nullptr;
519 }
520
521 if (g_conf()->mon_memory_autotune && pcm == nullptr) {
522 int r = register_cache_with_pcm();
523 if (r < 0) {
524 dout(10) << __func__
525 << " Error while registering osdmon caches with pcm."
526 << " Cache auto tuning not enabled."
527 << dendl;
528 mon_memory_autotune = false;
529 } else {
530 mon_memory_autotune = true;
531 }
532 }
533}
534
535int OSDMonitor::_update_mon_cache_settings()
536{
537 if (g_conf()->mon_memory_target <= 0 ||
538 g_conf()->mon_memory_target < mon_memory_min ||
539 g_conf()->rocksdb_cache_size <= 0) {
540 return -EINVAL;
541 }
542
92f5a8d4
TL
543 if (pcm == nullptr && rocksdb_binned_kv_cache == nullptr) {
544 derr << __func__ << " not using pcm and rocksdb" << dendl;
545 return -EINVAL;
546 }
547
eafe8130
TL
548 uint64_t old_mon_memory_target = mon_memory_target;
549 uint64_t old_rocksdb_cache_size = rocksdb_cache_size;
550
551 // Set the new pcm memory cache sizes
552 mon_memory_target = g_conf()->mon_memory_target;
553 rocksdb_cache_size = g_conf()->rocksdb_cache_size;
554
555 uint64_t base = mon_memory_base;
556 double fragmentation = mon_memory_fragmentation;
557 uint64_t target = mon_memory_target;
558 uint64_t min = mon_memory_min;
559 uint64_t max = min;
560
561 uint64_t ltarget = (1.0 - fragmentation) * target;
562 if (ltarget > base + min) {
563 max = ltarget - base;
564 }
565
566 int r = _set_cache_ratios();
567 if (r < 0) {
568 derr << __func__ << " Cache ratios for pcm could not be set."
569 << " Review the kv (rocksdb) and mon_memory_target sizes."
570 << dendl;
571 mon_memory_target = old_mon_memory_target;
572 rocksdb_cache_size = old_rocksdb_cache_size;
573 return -EINVAL;
574 }
575
576 if (mon_memory_autotune && pcm != nullptr) {
577 std::lock_guard l(balancer_lock);
578 // set pcm cache levels
579 pcm->set_target_memory(target);
580 pcm->set_min_memory(min);
581 pcm->set_max_memory(max);
582 // tune memory based on new values
583 pcm->tune_memory();
584 pcm->balance();
585 _set_new_cache_sizes();
92f5a8d4 586 dout(1) << __func__ << " Updated mon cache setting."
eafe8130
TL
587 << " target: " << target
588 << " min: " << min
589 << " max: " << max
590 << dendl;
591 }
592 return 0;
593}
594
595int OSDMonitor::_set_cache_sizes()
596{
597 if (g_conf()->mon_memory_autotune) {
598 // set the new osdmon cache targets to be managed by pcm
599 mon_osd_cache_size = g_conf()->mon_osd_cache_size;
600 rocksdb_cache_size = g_conf()->rocksdb_cache_size;
601 mon_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
602 mon_memory_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
603 mon_memory_target = g_conf()->mon_memory_target;
604 mon_memory_min = g_conf()->mon_osd_cache_size_min;
605 if (mon_memory_target <= 0 || mon_memory_min <= 0) {
606 derr << __func__ << " mon_memory_target:" << mon_memory_target
607 << " mon_memory_min:" << mon_memory_min
608 << ". Invalid size option(s) provided."
609 << dendl;
610 return -EINVAL;
611 }
612 // Set the initial inc and full LRU cache sizes
613 inc_osd_cache.set_bytes(mon_memory_min);
614 full_osd_cache.set_bytes(mon_memory_min);
615 mon_memory_autotune = g_conf()->mon_memory_autotune;
616 }
617 return 0;
618}
7c673cae
FG
619
620bool OSDMonitor::_have_pending_crush()
621{
622 return pending_inc.crush.length() > 0;
623}
624
625CrushWrapper &OSDMonitor::_get_stable_crush()
626{
627 return *osdmap.crush;
628}
629
20effc67 630CrushWrapper OSDMonitor::_get_pending_crush()
7c673cae
FG
631{
632 bufferlist bl;
633 if (pending_inc.crush.length())
634 bl = pending_inc.crush;
635 else
636 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
637
11fdf7f2 638 auto p = bl.cbegin();
20effc67
TL
639 CrushWrapper crush;
640 crush.decode(p);
641 return crush;
7c673cae
FG
642}
643
644void OSDMonitor::create_initial()
645{
f67539c2 646 dout(10) << "create_initial for " << mon.monmap->fsid << dendl;
7c673cae
FG
647
648 OSDMap newmap;
649
650 bufferlist bl;
f67539c2 651 mon.store->get("mkfs", "osdmap", bl);
7c673cae
FG
652
653 if (bl.length()) {
654 newmap.decode(bl);
f67539c2 655 newmap.set_fsid(mon.monmap->fsid);
7c673cae 656 } else {
f67539c2 657 newmap.build_simple(cct, 0, mon.monmap->fsid, 0);
7c673cae
FG
658 }
659 newmap.set_epoch(1);
660 newmap.created = newmap.modified = ceph_clock_now();
661
662 // new clusters should sort bitwise by default.
663 newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);
664
11fdf7f2
TL
665 newmap.flags |=
666 CEPH_OSDMAP_RECOVERY_DELETES |
667 CEPH_OSDMAP_PURGED_SNAPDIRS |
668 CEPH_OSDMAP_PGLOG_HARDLIMIT;
669 newmap.full_ratio = g_conf()->mon_osd_full_ratio;
670 if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
671 newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
672 if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
673 newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
674 if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;
675
7c673cae 676 // new cluster should require latest by default
1e59de90
TL
677 if (g_conf().get_val<bool>("mon_debug_no_require_reef")) {
678 if (g_conf().get_val<bool>("mon_debug_no_require_quincy")) {
679 derr << __func__ << " mon_debug_no_require_reef and quincy=true" << dendl;
20effc67 680 newmap.require_osd_release = ceph_release_t::pacific;
1e59de90
TL
681 } else {
682 derr << __func__ << " mon_debug_no_require_reef=true" << dendl;
683 newmap.require_osd_release = ceph_release_t::quincy;
11fdf7f2 684 }
31f18b77 685 } else {
1e59de90 686 newmap.require_osd_release = ceph_release_t::reef;
f67539c2
TL
687 }
688
20effc67
TL
689 ceph_release_t r = ceph_release_from_name(g_conf()->mon_osd_initial_require_min_compat_client);
690 if (!r) {
691 ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
7c673cae 692 }
20effc67 693 newmap.require_min_compat_client = r;
7c673cae
FG
694
695 // encode into pending incremental
28e407b8 696 uint64_t features = newmap.get_encoding_features();
7c673cae 697 newmap.encode(pending_inc.fullmap,
28e407b8 698 features | CEPH_FEATURE_RESERVED);
7c673cae
FG
699 pending_inc.full_crc = newmap.get_crc();
700 dout(20) << " full crc " << pending_inc.full_crc << dendl;
701}
702
11fdf7f2 703void OSDMonitor::get_store_prefixes(std::set<string>& s) const
7c673cae
FG
704{
705 s.insert(service_name);
706 s.insert(OSD_PG_CREATING_PREFIX);
3efd9988 707 s.insert(OSD_METADATA_PREFIX);
11fdf7f2 708 s.insert(OSD_SNAP_PREFIX);
7c673cae
FG
709}
710
711void OSDMonitor::update_from_paxos(bool *need_bootstrap)
712{
11fdf7f2
TL
713 // we really don't care if the version has been updated, because we may
714 // have trimmed without having increased the last committed; yet, we may
715 // need to update the in-memory manifest.
716 load_osdmap_manifest();
717
7c673cae
FG
718 version_t version = get_last_committed();
719 if (version == osdmap.epoch)
720 return;
11fdf7f2 721 ceph_assert(version > osdmap.epoch);
7c673cae
FG
722
723 dout(15) << "update_from_paxos paxos e " << version
724 << ", my e " << osdmap.epoch << dendl;
725
f67539c2
TL
726 int prev_num_up_osd = osdmap.num_up_osd;
727
31f18b77
FG
728 if (mapping_job) {
729 if (!mapping_job->is_done()) {
730 dout(1) << __func__ << " mapping job "
731 << mapping_job.get() << " did not complete, "
732 << mapping_job->shards << " left, canceling" << dendl;
733 mapping_job->abort();
734 }
735 mapping_job.reset();
736 }
7c673cae 737
224ce89b
WB
738 load_health();
739
7c673cae
FG
740 /*
741 * We will possibly have a stashed latest that *we* wrote, and we will
742 * always be sure to have the oldest full map in the first..last range
743 * due to encode_trim_extra(), which includes the oldest full map in the trim
744 * transaction.
745 *
746 * encode_trim_extra() does not however write the full map's
747 * version to 'full_latest'. This is only done when we are building the
748 * full maps from the incremental versions. But don't panic! We make sure
749 * that the following conditions find whichever full map version is newer.
750 */
751 version_t latest_full = get_version_latest_full();
752 if (latest_full == 0 && get_first_committed() > 1)
753 latest_full = get_first_committed();
754
755 if (get_first_committed() > 1 &&
756 latest_full < get_first_committed()) {
757 // the monitor could be just sync'ed with its peer, and the latest_full key
758 // is not encoded in the paxos commits in encode_pending(), so we need to
759 // make sure we get it pointing to a proper version.
760 version_t lc = get_last_committed();
761 version_t fc = get_first_committed();
762
763 dout(10) << __func__ << " looking for valid full map in interval"
764 << " [" << fc << ", " << lc << "]" << dendl;
765
766 latest_full = 0;
767 for (version_t v = lc; v >= fc; v--) {
768 string full_key = "full_" + stringify(v);
f67539c2 769 if (mon.store->exists(get_service_name(), full_key)) {
7c673cae
FG
770 dout(10) << __func__ << " found latest full map v " << v << dendl;
771 latest_full = v;
772 break;
773 }
774 }
775
11fdf7f2 776 ceph_assert(latest_full > 0);
7c673cae
FG
777 auto t(std::make_shared<MonitorDBStore::Transaction>());
778 put_version_latest_full(t, latest_full);
f67539c2 779 mon.store->apply_transaction(t);
7c673cae
FG
780 dout(10) << __func__ << " updated the on-disk full map version to "
781 << latest_full << dendl;
782 }
783
784 if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
785 bufferlist latest_bl;
786 get_version_full(latest_full, latest_bl);
11fdf7f2 787 ceph_assert(latest_bl.length() != 0);
7c673cae 788 dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
11fdf7f2 789 osdmap = OSDMap();
7c673cae
FG
790 osdmap.decode(latest_bl);
791 }
792
11fdf7f2 793 bufferlist bl;
f67539c2 794 if (!mon.store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
11fdf7f2
TL
795 auto p = bl.cbegin();
796 std::lock_guard<std::mutex> l(creating_pgs_lock);
797 creating_pgs.decode(p);
798 dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
799 << creating_pgs.last_scan_epoch
800 << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
31f18b77 801 } else {
11fdf7f2
TL
802 dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
803 << dendl;
31f18b77
FG
804 }
805
7c673cae
FG
806 // walk through incrementals
807 MonitorDBStore::TransactionRef t;
808 size_t tx_size = 0;
809 while (version > osdmap.epoch) {
810 bufferlist inc_bl;
811 int err = get_version(osdmap.epoch+1, inc_bl);
11fdf7f2
TL
812 ceph_assert(err == 0);
813 ceph_assert(inc_bl.length());
eafe8130
TL
814 // set priority cache manager levels if the osdmap is
815 // being populated for the first time.
816 if (mon_memory_autotune && pcm == nullptr) {
817 int r = register_cache_with_pcm();
818 if (r < 0) {
819 dout(10) << __func__
820 << " Error while registering osdmon caches with pcm."
821 << " Proceeding without cache auto tuning."
822 << dendl;
823 }
824 }
7c673cae
FG
825
826 dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
827 << dendl;
828 OSDMap::Incremental inc(inc_bl);
829 err = osdmap.apply_incremental(inc);
11fdf7f2 830 ceph_assert(err == 0);
7c673cae
FG
831
832 if (!t)
833 t.reset(new MonitorDBStore::Transaction);
834
835 // Write out the full map for all past epochs. Encode the full
836 // map with the same features as the incremental. If we don't
837 // know, use the quorum features. If we don't know those either,
838 // encode with all features.
839 uint64_t f = inc.encode_features;
840 if (!f)
f67539c2 841 f = mon.get_quorum_con_features();
7c673cae
FG
842 if (!f)
843 f = -1;
844 bufferlist full_bl;
845 osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
846 tx_size += full_bl.length();
847
848 bufferlist orig_full_bl;
849 get_version_full(osdmap.epoch, orig_full_bl);
850 if (orig_full_bl.length()) {
851 // the primary provided the full map
11fdf7f2 852 ceph_assert(inc.have_crc);
7c673cae
FG
853 if (inc.full_crc != osdmap.crc) {
854 // This will happen if the mons were running mixed versions in
855 // the past or some other circumstance made the full encoded
856 // maps divergent. Reloading here will bring us back into
857 // sync with the primary for this and all future maps. OSDs
858 // will also be brought back into sync when they discover the
859 // crc mismatch and request a full map from a mon.
860 derr << __func__ << " full map CRC mismatch, resetting to canonical"
861 << dendl;
11fdf7f2
TL
862
863 dout(20) << __func__ << " my (bad) full osdmap:\n";
864 JSONFormatter jf(true);
865 jf.dump_object("osdmap", osdmap);
866 jf.flush(*_dout);
867 *_dout << "\nhexdump:\n";
868 full_bl.hexdump(*_dout);
869 *_dout << dendl;
870
7c673cae
FG
871 osdmap = OSDMap();
872 osdmap.decode(orig_full_bl);
11fdf7f2
TL
873
874 dout(20) << __func__ << " canonical full osdmap:\n";
875 JSONFormatter jf(true);
876 jf.dump_object("osdmap", osdmap);
877 jf.flush(*_dout);
878 *_dout << "\nhexdump:\n";
879 orig_full_bl.hexdump(*_dout);
880 *_dout << dendl;
7c673cae
FG
881 }
882 } else {
11fdf7f2 883 ceph_assert(!inc.have_crc);
7c673cae
FG
884 put_version_full(t, osdmap.epoch, full_bl);
885 }
886 put_version_latest_full(t, osdmap.epoch);
887
888 // share
889 dout(1) << osdmap << dendl;
890
891 if (osdmap.epoch == 1) {
892 t->erase("mkfs", "osdmap");
893 }
894
11fdf7f2 895 if (tx_size > g_conf()->mon_sync_max_payload_size*2) {
f67539c2 896 mon.store->apply_transaction(t);
7c673cae
FG
897 t = MonitorDBStore::TransactionRef();
898 tx_size = 0;
899 }
1e59de90 900 for (auto [osd, state] : inc.new_state) {
f67539c2 901 if (state & CEPH_OSD_UP) {
11fdf7f2 902 // could be marked up *or* down, but we're too lazy to check which
f67539c2 903 last_osd_report.erase(osd);
11fdf7f2 904 }
f67539c2 905 }
1e59de90 906 for (auto [osd, weight] : inc.new_weight) {
f67539c2
TL
907 if (weight == CEPH_OSD_OUT) {
908 // manually marked out, so drop it
909 osd_epochs.erase(osd);
7c673cae
FG
910 }
911 }
912 }
913
914 if (t) {
f67539c2 915 mon.store->apply_transaction(t);
7c673cae
FG
916 }
917
f67539c2 918 bool marked_osd_down = false;
7c673cae
FG
919 for (int o = 0; o < osdmap.get_max_osd(); o++) {
920 if (osdmap.is_out(o))
921 continue;
922 auto found = down_pending_out.find(o);
923 if (osdmap.is_down(o)) {
924 // populate down -> out map
925 if (found == down_pending_out.end()) {
926 dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
927 down_pending_out[o] = ceph_clock_now();
f67539c2 928 marked_osd_down = true;
7c673cae
FG
929 }
930 } else {
931 if (found != down_pending_out.end()) {
932 dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
933 down_pending_out.erase(found);
934 }
935 }
936 }
937 // XXX: need to trim MonSession connected with a osd whose id > max_osd?
938
7c673cae
FG
939 check_osdmap_subs();
940 check_pg_creates_subs();
941
942 share_map_with_random_osd();
943 update_logger();
7c673cae
FG
944 process_failures();
945
946 // make sure our feature bits reflect the latest map
947 update_msgr_features();
948
f67539c2 949 if (!mon.is_leader()) {
7c673cae
FG
950 // will be called by on_active() on the leader, avoid doing so twice
951 start_mapping();
952 }
f67539c2
TL
953 if (osdmap.stretch_mode_enabled) {
954 dout(20) << "Stretch mode enabled in this map" << dendl;
b3b6e05e 955 mon.try_engage_stretch_mode();
f67539c2
TL
956 if (osdmap.degraded_stretch_mode) {
957 dout(20) << "Degraded stretch mode set in this map" << dendl;
958 if (!osdmap.recovering_stretch_mode) {
959 mon.set_degraded_stretch_mode();
1e59de90
TL
960 dout(20) << "prev_num_up_osd: " << prev_num_up_osd << dendl;
961 dout(20) << "osdmap.num_up_osd: " << osdmap.num_up_osd << dendl;
962 dout(20) << "osdmap.num_osd: " << osdmap.num_osd << dendl;
963 dout(20) << "mon_stretch_cluster_recovery_ratio: " << cct->_conf.get_val<double>("mon_stretch_cluster_recovery_ratio") << dendl;
f67539c2
TL
964 if (prev_num_up_osd < osdmap.num_up_osd &&
965 (osdmap.num_up_osd / (double)osdmap.num_osd) >
1e59de90
TL
966 cct->_conf.get_val<double>("mon_stretch_cluster_recovery_ratio") &&
967 mon.dead_mon_buckets.size() == 0) {
f67539c2
TL
968 // TODO: This works for 2-site clusters when the OSD maps are appropriately
969 // trimmed and everything is "normal" but not if you have a lot of out OSDs
970 // you're ignoring or in some really degenerate failure cases
1e59de90 971
f67539c2
TL
972 dout(10) << "Enabling recovery stretch mode in this map" << dendl;
973 mon.go_recovery_stretch_mode();
974 }
b3b6e05e
TL
975 } else {
976 mon.set_recovery_stretch_mode();
f67539c2 977 }
b3b6e05e
TL
978 } else {
979 mon.set_healthy_stretch_mode();
f67539c2
TL
980 }
981 if (marked_osd_down &&
982 (!osdmap.degraded_stretch_mode || osdmap.recovering_stretch_mode)) {
983 dout(20) << "Checking degraded stretch mode due to osd changes" << dendl;
984 mon.maybe_go_degraded_stretch_mode();
985 }
f67539c2 986 }
7c673cae
FG
987}
988
eafe8130
TL
989int OSDMonitor::register_cache_with_pcm()
990{
991 if (mon_memory_target <= 0 || mon_memory_min <= 0) {
992 derr << __func__ << " Invalid memory size specified for mon caches."
993 << " Caches will not be auto-tuned."
994 << dendl;
995 return -EINVAL;
996 }
997 uint64_t base = mon_memory_base;
998 double fragmentation = mon_memory_fragmentation;
999 // For calculating total target memory, consider rocksdb cache size.
1000 uint64_t target = mon_memory_target;
1001 uint64_t min = mon_memory_min;
1002 uint64_t max = min;
1003
1004 // Apply the same logic as in bluestore to set the max amount
1005 // of memory to use for cache. Assume base memory for OSDMaps
1006 // and then add in some overhead for fragmentation.
1007 uint64_t ltarget = (1.0 - fragmentation) * target;
1008 if (ltarget > base + min) {
1009 max = ltarget - base;
1010 }
1011
f67539c2 1012 rocksdb_binned_kv_cache = mon.store->get_priority_cache();
eafe8130
TL
1013 if (!rocksdb_binned_kv_cache) {
1014 derr << __func__ << " not using rocksdb" << dendl;
1015 return -EINVAL;
1016 }
1017
1018 int r = _set_cache_ratios();
1019 if (r < 0) {
1020 derr << __func__ << " Cache ratios for pcm could not be set."
1021 << " Review the kv (rocksdb) and mon_memory_target sizes."
1022 << dendl;
1023 return -EINVAL;
1024 }
1025
1026 pcm = std::make_shared<PriorityCache::Manager>(
1027 cct, min, max, target, true);
1028 pcm->insert("kv", rocksdb_binned_kv_cache, true);
1029 pcm->insert("inc", inc_cache, true);
1030 pcm->insert("full", full_cache, true);
92f5a8d4 1031 dout(1) << __func__ << " pcm target: " << target
eafe8130
TL
1032 << " pcm max: " << max
1033 << " pcm min: " << min
1034 << " inc_osd_cache size: " << inc_osd_cache.get_size()
1035 << dendl;
1036 return 0;
1037}
1038
1039int OSDMonitor::_set_cache_ratios()
1040{
1041 double old_cache_kv_ratio = cache_kv_ratio;
1042
1043 // Set the cache ratios for kv(rocksdb), inc and full caches
1044 cache_kv_ratio = (double)rocksdb_cache_size / (double)mon_memory_target;
1045 if (cache_kv_ratio >= 1.0) {
1046 derr << __func__ << " Cache kv ratio (" << cache_kv_ratio
1047 << ") must be in range [0,<1.0]."
1048 << dendl;
1049 cache_kv_ratio = old_cache_kv_ratio;
1050 return -EINVAL;
1051 }
1052 rocksdb_binned_kv_cache->set_cache_ratio(cache_kv_ratio);
1053 cache_inc_ratio = cache_full_ratio = (1.0 - cache_kv_ratio) / 2;
1054 inc_cache->set_cache_ratio(cache_inc_ratio);
1055 full_cache->set_cache_ratio(cache_full_ratio);
1056
92f5a8d4 1057 dout(1) << __func__ << " kv ratio " << cache_kv_ratio
eafe8130
TL
1058 << " inc ratio " << cache_inc_ratio
1059 << " full ratio " << cache_full_ratio
1060 << dendl;
1061 return 0;
1062}
1063
7c673cae
FG
1064void OSDMonitor::start_mapping()
1065{
1066 // initiate mapping job
1067 if (mapping_job) {
1068 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
1069 << dendl;
1070 mapping_job->abort();
1071 }
224ce89b
WB
1072 if (!osdmap.get_pools().empty()) {
1073 auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
1074 mapping_job = mapping.start_update(osdmap, mapper,
11fdf7f2 1075 g_conf()->mon_osd_mapping_pgs_per_chunk);
224ce89b
WB
1076 dout(10) << __func__ << " started mapping job " << mapping_job.get()
1077 << " at " << fin->start << dendl;
1078 mapping_job->set_finish_event(fin);
1079 } else {
1080 dout(10) << __func__ << " no pools, no mapping job" << dendl;
1081 mapping_job = nullptr;
1082 }
7c673cae
FG
1083}
1084
1085void OSDMonitor::update_msgr_features()
1086{
f67539c2
TL
1087 const int types[] = {
1088 entity_name_t::TYPE_OSD,
1089 entity_name_t::TYPE_CLIENT,
1090 entity_name_t::TYPE_MDS,
1091 entity_name_t::TYPE_MON
1092 };
1093 for (int type : types) {
7c673cae 1094 uint64_t mask;
f67539c2
TL
1095 uint64_t features = osdmap.get_features(type, &mask);
1096 if ((mon.messenger->get_policy(type).features_required & mask) != features) {
7c673cae 1097 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
f67539c2 1098 ceph::net::Policy p = mon.messenger->get_policy(type);
7c673cae 1099 p.features_required = (p.features_required & ~mask) | features;
f67539c2 1100 mon.messenger->set_policy(type, p);
7c673cae
FG
1101 }
1102 }
1103}
1104
1105void OSDMonitor::on_active()
1106{
1107 update_logger();
1108
f67539c2
TL
1109 if (mon.is_leader()) {
1110 mon.clog->debug() << "osdmap " << osdmap;
81eedcae
TL
1111 if (!priority_convert) {
1112 // Only do this once at start-up
1113 convert_pool_priorities();
1114 priority_convert = true;
1115 }
7c673cae
FG
1116 } else {
1117 list<MonOpRequestRef> ls;
1118 take_all_failures(ls);
1119 while (!ls.empty()) {
1120 MonOpRequestRef op = ls.front();
1121 op->mark_osdmon_event(__func__);
1122 dispatch(op);
1123 ls.pop_front();
1124 }
1125 }
1126 start_mapping();
1127}
1128
void OSDMonitor::on_restart()
{
  // Discard the cached per-osd last-report times from before the restart.
  last_osd_report.clear();
}
1133
1134void OSDMonitor::on_shutdown()
1135{
1136 dout(10) << __func__ << dendl;
1137 if (mapping_job) {
1138 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
1139 << dendl;
1140 mapping_job->abort();
1141 }
1142
1143 // discard failure info, waiters
1144 list<MonOpRequestRef> ls;
1145 take_all_failures(ls);
1146 ls.clear();
1147}
1148
1149void OSDMonitor::update_logger()
1150{
1151 dout(10) << "update_logger" << dendl;
1152
f67539c2
TL
1153 mon.cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
1154 mon.cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
1155 mon.cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
1156 mon.cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
7c673cae
FG
1157}
1158
7c673cae
FG
1159void OSDMonitor::create_pending()
1160{
1161 pending_inc = OSDMap::Incremental(osdmap.epoch+1);
f67539c2 1162 pending_inc.fsid = mon.monmap->fsid;
11fdf7f2
TL
1163 pending_metadata.clear();
1164 pending_metadata_rm.clear();
9f95a23c 1165 pending_pseudo_purged_snaps.clear();
7c673cae
FG
1166
1167 dout(10) << "create_pending e " << pending_inc.epoch << dendl;
1168
11fdf7f2
TL
1169 // safety checks (this shouldn't really happen)
1170 {
1171 if (osdmap.backfillfull_ratio <= 0) {
1172 pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
1173 if (pending_inc.new_backfillfull_ratio > 1.0)
1174 pending_inc.new_backfillfull_ratio /= 100;
1175 dout(1) << __func__ << " setting backfillfull_ratio = "
1176 << pending_inc.new_backfillfull_ratio << dendl;
7c673cae 1177 }
7c673cae 1178 if (osdmap.full_ratio <= 0) {
11fdf7f2 1179 pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio;
7c673cae
FG
1180 if (pending_inc.new_full_ratio > 1.0)
1181 pending_inc.new_full_ratio /= 100;
1182 dout(1) << __func__ << " setting full_ratio = "
1183 << pending_inc.new_full_ratio << dendl;
1184 }
1185 if (osdmap.nearfull_ratio <= 0) {
11fdf7f2 1186 pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
7c673cae
FG
1187 if (pending_inc.new_nearfull_ratio > 1.0)
1188 pending_inc.new_nearfull_ratio /= 100;
1189 dout(1) << __func__ << " setting nearfull_ratio = "
1190 << pending_inc.new_nearfull_ratio << dendl;
1191 }
1192 }
1193}
1194
// Compute the next creating_pgs_t state for the pending incremental:
// queue newly-created pools, drop deleted/created pgs, admit up to
// mon_osd_max_creating_pgs pgs from the queue, and (octopus+) advance
// each creating pg's history/past_intervals to match the next map.
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
			       const OSDMap& nextmap)
{
  dout(10) << __func__ << dendl;
  creating_pgs_t pending_creatings;
  {
    // take a snapshot under the lock; we work on the copy below
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    unsigned queued = 0;
    // scan both the existing pools and those added by this incremental
    queued += scan_for_creating_pgs(osdmap.get_pools(),
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    queued += scan_for_creating_pgs(inc.new_pools,
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
               << " pg removed because containing pool deleted: "
               << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
	     << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // filter out any pgs that shouldn't exist.
  {
    auto i = pending_creatings.pgs.begin();
    while (i != pending_creatings.pgs.end()) {
      if (!nextmap.pg_exists(i->first)) {
	dout(10) << __func__ << " removing pg " << i->first
		 << " which should not exist" << dendl;
	i = pending_creatings.pgs.erase(i);
      } else {
	++i;
      }
    }
  }

  // process queue: admit pgs from queued pools until we hit the
  // mon_osd_max_creating_pgs cap (at least 1) or the queue drains.
  unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  while (pending_creatings.pgs.size() < max &&
	 !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
	     << " created " << p->second.created
	     << " modified " << p->second.modified
	     << " [" << p->second.start << "-" << p->second.end << ")"
	     << dendl;
    // take as many of this pool's remaining pgs as fit under the cap
    int64_t n = std::min<int64_t>(max - pending_creatings.pgs.size(),
				  p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(
	pgid,
	creating_pgs_t::pg_create_info(inc.epoch,
				       p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
	       << " now [" << p->second.start << "-" << p->second.end << ")"
	       << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
	   << " pools" << dendl;

  if (mon.monmap->min_mon_release >= ceph_release_t::octopus) {
    // walk creating pgs' history and past_intervals forward
    for (auto& i : pending_creatings.pgs) {
      // this mirrors PG::start_peering_interval()
      pg_t pgid = i.first;

      // this is a bit imprecise, but sufficient?
      struct min_size_predicate_t : public IsPGRecoverablePredicate {
	const pg_pool_t *pi;
	bool operator()(const set<pg_shard_t> &have) const {
	  return have.size() >= pi->min_size;
	}
	explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
      } min_size_predicate(nextmap.get_pg_pool(pgid.pool()));

      vector<int> up, acting;
      int up_primary, acting_primary;
      nextmap.pg_to_up_acting_osds(
	pgid, &up, &up_primary, &acting, &acting_primary);
      if (i.second.history.epoch_created == 0) {
	// new pg entry, set it up
	i.second.up = up;
	i.second.acting = acting;
	i.second.up_primary = up_primary;
	i.second.acting_primary = acting_primary;
	i.second.history = pg_history_t(i.second.create_epoch,
					i.second.create_stamp);
	dout(10) << __func__ << " pg " << pgid << " just added, "
		 << " up " << i.second.up
		 << " p " << i.second.up_primary
		 << " acting " << i.second.acting
		 << " p " << i.second.acting_primary
		 << " history " << i.second.history
		 << " past_intervals " << i.second.past_intervals
		 << dendl;
      } else {
	std::stringstream debug;
	// if the mapping changed enough to start a new interval, record it
	// in past_intervals and stamp the history epochs accordingly
	if (PastIntervals::check_new_interval(
	      i.second.acting_primary, acting_primary,
	      i.second.acting, acting,
	      i.second.up_primary, up_primary,
	      i.second.up, up,
	      i.second.history.same_interval_since,
	      i.second.history.last_epoch_clean,
	      &nextmap,
	      &osdmap,
	      pgid,
	      min_size_predicate,
	      &i.second.past_intervals,
	      &debug)) {
	  epoch_t e = inc.epoch;
	  i.second.history.same_interval_since = e;
	  if (i.second.up != up) {
	    i.second.history.same_up_since = e;
	  }
	  if (i.second.acting_primary != acting_primary) {
	    i.second.history.same_primary_since = e;
	  }
	  if (pgid.is_split(
		osdmap.get_pg_num(pgid.pool()),
		nextmap.get_pg_num(pgid.pool()),
		nullptr)) {
	    i.second.history.last_epoch_split = e;
	  }
	  dout(10) << __func__ << " pg " << pgid << " new interval,"
		   << " up " << i.second.up << " -> " << up
		   << " p " << i.second.up_primary << " -> " << up_primary
		   << " acting " << i.second.acting << " -> " << acting
		   << " p " << i.second.acting_primary << " -> "
		   << acting_primary
		   << " history " << i.second.history
		   << " past_intervals " << i.second.past_intervals
		   << dendl;
	  dout(20) << " debug: " << debug.str() << dendl;
	  i.second.up = up;
	  i.second.acting = acting;
	  i.second.up_primary = up_primary;
	  i.second.acting_primary = acting_primary;
	}
      }
    }
  }
  dout(10) << __func__
	   << " " << (pending_creatings.pgs.size() - total)
	   << "/" << pending_creatings.pgs.size()
	   << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
1381
// Pre-populate pending_inc.new_pg_temp for pgs whose mapping changes in this
// epoch, so their previous acting set is preserved as pg_temp.  Either primes
// the whole map (new crush map / newly-up osds / large estimate) via a
// parallel PrimeTempJob, or walks only the pgs of the "interesting" osds.
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  if (pending_inc.crush.length()) {
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    // an up osd whose UP bit is being toggled (i.e. marked down) is interesting
    if ((p->second & CEPH_OSD_UP) &&
	osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  for (auto p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (osdmap.exists(p->first) && p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      // weight increases (or new osds) can pull pgs from anywhere: do all
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
	       << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;

  if (!all) {
    // estimate the per-osd walk cost from the first osd's pg count; if it
    // approaches a configured fraction of all pgs, priming the whole map
    // via the parallel job is cheaper
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
	g_conf()->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds >= "
	       << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total "
	       << mapping.get_num_pgs() << " pgs, all"
	       << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds" << dendl;
    }
  }

  // build the map as it will look once the pending incremental commits
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    // prime every pg in parallel, bounded by a wall-clock budget
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk, {});
    if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
	       << g_conf()->mon_osd_prime_pg_temp_max_time
	       << ", stopping" << dendl;
      job.abort();
    }
  } else {
    // walk just the pgs mapped to the interesting osds, deduplicating pgs
    // and checking the time budget every `chunk` pgs
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf()->mon_osd_prime_pg_temp_max_time;
    const int chunk = 1000;
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
	if (!did_pgs.insert(pgid).second) {
	  continue;
	}
	prime_pg_temp(next, pgid);
	if (--n <= 0) {
	  n = chunk;
	  if (ceph_clock_now() > stop) {
	    dout(10) << __func__ << " consumed more than "
		     << g_conf()->mon_osd_prime_pg_temp_max_time
		     << " seconds, stopping"
		     << dendl;
	    return;
	  }
	}
      }
    }
  }
}
1484
// For a single pg, record its current acting set as pg_temp in pending_inc
// if the acting mapping changes in `next`, so the pg keeps serving from the
// old acting set until the new one is ready.  Skips pgs where priming cannot
// help (creating, nonexistent, previously empty, or below min_size).
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  // TODO: remove this creating_pgs direct access?
  if (creating_pgs.pgs.count(pgid)) {
    return;
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  // current mapping (from the committed map's mapping job)
  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  // mapping under the pending map
  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
			    &next_acting, &next_acting_primary);
  if (acting == next_acting &&
      !(up != acting && next_up == next_acting))
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  if (next_up == next_acting) {
    // an empty pg_temp entry clears any existing mapping
    acting.clear();
    dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
	     << dendl;
  }

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
	   << " -> " << next_up << "/" << next_acting
	   << ", priming " << acting
	   << dendl;
  {
    // serialize with other threads of the parallel priming job
    std::lock_guard l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
1532
1533/**
1534 * @note receiving a transaction in this function gives a fair amount of
1535 * freedom to the service implementation if it does need it. It shouldn't.
1536 */
1537void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
1538{
1539 dout(10) << "encode_pending e " << pending_inc.epoch
1540 << dendl;
1541
11fdf7f2
TL
1542 if (do_prune(t)) {
1543 dout(1) << __func__ << " osdmap full prune encoded e"
1544 << pending_inc.epoch << dendl;
1545 }
1546
7c673cae
FG
1547 // finalize up pending_inc
1548 pending_inc.modified = ceph_clock_now();
1549
f67539c2 1550 int r = pending_inc.propagate_base_properties_to_tiers(cct, osdmap);
11fdf7f2 1551 ceph_assert(r == 0);
7c673cae
FG
1552
1553 if (mapping_job) {
1554 if (!mapping_job->is_done()) {
1555 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1556 << mapping_job.get() << " did not complete, "
1557 << mapping_job->shards << " left" << dendl;
1558 mapping_job->abort();
1559 } else if (mapping.get_epoch() < osdmap.get_epoch()) {
1560 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1561 << mapping_job.get() << " is prior epoch "
1562 << mapping.get_epoch() << dendl;
1563 } else {
11fdf7f2 1564 if (g_conf()->mon_osd_prime_pg_temp) {
7c673cae
FG
1565 maybe_prime_pg_temp();
1566 }
1567 }
11fdf7f2 1568 } else if (g_conf()->mon_osd_prime_pg_temp) {
7c673cae
FG
1569 dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
1570 << dendl;
1571 }
1572 mapping_job.reset();
1573
c07f9fc5
FG
1574 // ensure we don't have blank new_state updates. these are interrpeted as
1575 // CEPH_OSD_UP (and almost certainly not what we want!).
1576 auto p = pending_inc.new_state.begin();
1577 while (p != pending_inc.new_state.end()) {
1578 if (p->second == 0) {
1579 dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
1580 p = pending_inc.new_state.erase(p);
1581 } else {
11fdf7f2
TL
1582 if (p->second & CEPH_OSD_UP) {
1583 pending_inc.new_last_up_change = pending_inc.modified;
1584 }
c07f9fc5
FG
1585 ++p;
1586 }
1587 }
11fdf7f2
TL
1588 if (!pending_inc.new_up_client.empty()) {
1589 pending_inc.new_last_up_change = pending_inc.modified;
1590 }
1591 for (auto& i : pending_inc.new_weight) {
9f95a23c 1592 if (i.first >= osdmap.max_osd) {
11fdf7f2
TL
1593 if (i.second) {
1594 // new osd is already marked in
1595 pending_inc.new_last_in_change = pending_inc.modified;
9f95a23c 1596 break;
11fdf7f2
TL
1597 }
1598 } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
1599 // existing osd marked in or out
1600 pending_inc.new_last_in_change = pending_inc.modified;
9f95a23c 1601 break;
11fdf7f2
TL
1602 }
1603 }
7c673cae
FG
1604
1605 {
1606 OSDMap tmp;
1607 tmp.deepish_copy_from(osdmap);
1608 tmp.apply_incremental(pending_inc);
1609
11fdf7f2
TL
1610 // clean pg_temp mappings
1611 OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);
1612
1613 // clean inappropriate pg_upmap/pg_upmap_items (if any)
494da23a
TL
1614 {
1615 // check every upmapped pg for now
1616 // until we could reliably identify certain cases to ignore,
1617 // which is obviously the hard part TBD..
1618 vector<pg_t> pgs_to_check;
1619 tmp.get_upmap_pgs(&pgs_to_check);
9f95a23c
TL
1620 if (pgs_to_check.size() <
1621 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk * 2)) {
494da23a
TL
1622 // not enough pgs, do it inline
1623 tmp.clean_pg_upmaps(cct, &pending_inc);
1624 } else {
1625 CleanUpmapJob job(cct, tmp, pending_inc);
1626 mapper.queue(&job, g_conf()->mon_clean_pg_upmaps_per_chunk, pgs_to_check);
1627 job.wait();
1628 }
1629 }
11fdf7f2
TL
1630
1631 // update creating pgs first so that we can remove the created pgid and
1632 // process the pool flag removal below in the same osdmap epoch.
1633 auto pending_creatings = update_pending_pgs(pending_inc, tmp);
1634 bufferlist creatings_bl;
9f95a23c 1635 uint64_t features = CEPH_FEATURES_ALL;
f67539c2 1636 if (mon.monmap->min_mon_release < ceph_release_t::octopus) {
9f95a23c
TL
1637 dout(20) << __func__ << " encoding pending pgs without octopus features"
1638 << dendl;
1639 features &= ~CEPH_FEATURE_SERVER_OCTOPUS;
1640 }
1641 encode(pending_creatings, creatings_bl, features);
11fdf7f2
TL
1642 t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
1643
1644 // remove any old (or incompat) POOL_CREATING flags
1645 for (auto& i : tmp.get_pools()) {
9f95a23c 1646 if (tmp.require_osd_release < ceph_release_t::nautilus) {
11fdf7f2
TL
1647 // pre-nautilus OSDMaps shouldn't get this flag.
1648 if (pending_inc.new_pools.count(i.first)) {
1649 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1650 }
1651 }
1652 if (i.second.has_flag(pg_pool_t::FLAG_CREATING) &&
1653 !pending_creatings.still_creating_pool(i.first)) {
1654 dout(10) << __func__ << " done creating pool " << i.first
1655 << ", clearing CREATING flag" << dendl;
1656 if (pending_inc.new_pools.count(i.first) == 0) {
1657 pending_inc.new_pools[i.first] = i.second;
1658 }
1659 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
3efd9988 1660 }
11fdf7f2
TL
1661 }
1662
11fdf7f2
TL
1663 // collect which pools are currently affected by
1664 // the near/backfill/full osd(s),
1665 // and set per-pool near/backfill/full flag instead
1666 set<int64_t> full_pool_ids;
1667 set<int64_t> backfillfull_pool_ids;
1668 set<int64_t> nearfull_pool_ids;
1669 tmp.get_full_pools(cct,
1670 &full_pool_ids,
1671 &backfillfull_pool_ids,
3efd9988 1672 &nearfull_pool_ids);
11fdf7f2
TL
1673 if (full_pool_ids.empty() ||
1674 backfillfull_pool_ids.empty() ||
1675 nearfull_pool_ids.empty()) {
1676 // normal case - no nearfull, backfillfull or full osds
3efd9988
FG
1677 // try cancel any improper nearfull/backfillfull/full pool
1678 // flags first
11fdf7f2
TL
1679 for (auto &pool: tmp.get_pools()) {
1680 auto p = pool.first;
1681 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
1682 nearfull_pool_ids.empty()) {
1683 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1684 << "'s nearfull flag" << dendl;
1685 if (pending_inc.new_pools.count(p) == 0) {
1686 // load original pool info first!
1687 pending_inc.new_pools[p] = pool.second;
1688 }
1689 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1690 }
1691 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
1692 backfillfull_pool_ids.empty()) {
1693 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1694 << "'s backfillfull flag" << dendl;
1695 if (pending_inc.new_pools.count(p) == 0) {
1696 pending_inc.new_pools[p] = pool.second;
1697 }
1698 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1699 }
1700 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
1701 full_pool_ids.empty()) {
1702 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1703 // set by EQUOTA, skipping
1704 continue;
1705 }
1706 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1707 << "'s full flag" << dendl;
1708 if (pending_inc.new_pools.count(p) == 0) {
1709 pending_inc.new_pools[p] = pool.second;
1710 }
1711 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1712 }
3efd9988 1713 }
11fdf7f2
TL
1714 }
1715 if (!full_pool_ids.empty()) {
1716 dout(10) << __func__ << " marking pool(s) " << full_pool_ids
1717 << " as full" << dendl;
1718 for (auto &p: full_pool_ids) {
1719 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
1720 continue;
1721 }
1722 if (pending_inc.new_pools.count(p) == 0) {
1723 pending_inc.new_pools[p] = tmp.pools[p];
1724 }
1725 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
1726 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1727 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1728 }
1729 // cancel FLAG_FULL for pools which are no longer full too
1730 for (auto &pool: tmp.get_pools()) {
1731 auto p = pool.first;
1732 if (full_pool_ids.count(p)) {
1733 // skip pools we have just marked as full above
1734 continue;
1735 }
1736 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
1737 tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1738 // don't touch if currently is not full
1739 // or is running out of quota (and hence considered as full)
1740 continue;
1741 }
1742 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1743 << "'s full flag" << dendl;
1744 if (pending_inc.new_pools.count(p) == 0) {
1745 pending_inc.new_pools[p] = pool.second;
1746 }
1747 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
3efd9988 1748 }
11fdf7f2
TL
1749 }
1750 if (!backfillfull_pool_ids.empty()) {
1751 for (auto &p: backfillfull_pool_ids) {
1752 if (full_pool_ids.count(p)) {
1753 // skip pools we have already considered as full above
1754 continue;
1755 }
1756 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1757 // make sure FLAG_FULL is truly set, so we are safe not
1758 // to set a extra (redundant) FLAG_BACKFILLFULL flag
1759 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1760 continue;
1761 }
1762 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1763 // don't bother if pool is already marked as backfillfull
1764 continue;
1765 }
1766 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1767 << "'s as backfillfull" << dendl;
1768 if (pending_inc.new_pools.count(p) == 0) {
1769 pending_inc.new_pools[p] = tmp.pools[p];
1770 }
1771 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
1772 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1773 }
1774 // cancel FLAG_BACKFILLFULL for pools
1775 // which are no longer backfillfull too
1776 for (auto &pool: tmp.get_pools()) {
1777 auto p = pool.first;
1778 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1779 // skip pools we have just marked as backfillfull/full above
1780 continue;
1781 }
1782 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1783 // and don't touch if currently is not backfillfull
1784 continue;
1785 }
1786 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1787 << "'s backfillfull flag" << dendl;
1788 if (pending_inc.new_pools.count(p) == 0) {
1789 pending_inc.new_pools[p] = pool.second;
1790 }
1791 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
3efd9988 1792 }
11fdf7f2
TL
1793 }
1794 if (!nearfull_pool_ids.empty()) {
1795 for (auto &p: nearfull_pool_ids) {
1796 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1797 continue;
1798 }
1799 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1800 // make sure FLAG_FULL is truly set, so we are safe not
1801 // to set a extra (redundant) FLAG_NEARFULL flag
1802 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1803 continue;
1804 }
1805 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1806 // don't bother if pool is already marked as nearfull
1807 continue;
1808 }
1809 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1810 << "'s as nearfull" << dendl;
1811 if (pending_inc.new_pools.count(p) == 0) {
1812 pending_inc.new_pools[p] = tmp.pools[p];
1813 }
1814 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
1815 }
1816 // cancel FLAG_NEARFULL for pools
1817 // which are no longer nearfull too
1818 for (auto &pool: tmp.get_pools()) {
1819 auto p = pool.first;
1820 if (full_pool_ids.count(p) ||
1821 backfillfull_pool_ids.count(p) ||
1822 nearfull_pool_ids.count(p)) {
1823 // skip pools we have just marked as
1824 // nearfull/backfillfull/full above
1825 continue;
1826 }
1827 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1828 // and don't touch if currently is not nearfull
1829 continue;
1830 }
1831 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1832 << "'s nearfull flag" << dendl;
1833 if (pending_inc.new_pools.count(p) == 0) {
1834 pending_inc.new_pools[p] = pool.second;
1835 }
1836 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
7c673cae 1837 }
11fdf7f2 1838 }
7c673cae 1839
11fdf7f2 1840 // min_compat_client?
9f95a23c 1841 if (!tmp.require_min_compat_client) {
11fdf7f2
TL
1842 auto mv = tmp.get_min_compat_client();
1843 dout(1) << __func__ << " setting require_min_compat_client to currently "
9f95a23c 1844 << "required " << mv << dendl;
f67539c2 1845 mon.clog->info() << "setting require_min_compat_client to currently "
9f95a23c 1846 << "required " << mv;
11fdf7f2
TL
1847 pending_inc.new_require_min_compat_client = mv;
1848 }
1849
9f95a23c
TL
1850 if (osdmap.require_osd_release < ceph_release_t::nautilus &&
1851 tmp.require_osd_release >= ceph_release_t::nautilus) {
11fdf7f2
TL
1852 dout(10) << __func__ << " first nautilus+ epoch" << dendl;
1853 // add creating flags?
1854 for (auto& i : tmp.get_pools()) {
1855 if (pending_creatings.still_creating_pool(i.first)) {
1856 dout(10) << __func__ << " adding CREATING flag to pool " << i.first
1857 << dendl;
1858 if (pending_inc.new_pools.count(i.first) == 0) {
1859 pending_inc.new_pools[i.first] = i.second;
224ce89b 1860 }
11fdf7f2 1861 pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING;
224ce89b 1862 }
11fdf7f2 1863 }
f67539c2
TL
1864 // adjust blocklist items to all be TYPE_ANY
1865 for (auto& i : tmp.blocklist) {
11fdf7f2
TL
1866 auto a = i.first;
1867 a.set_type(entity_addr_t::TYPE_ANY);
f67539c2
TL
1868 pending_inc.new_blocklist[a] = i.second;
1869 pending_inc.old_blocklist.push_back(i.first);
224ce89b 1870 }
7c673cae 1871 }
9f95a23c
TL
1872
1873 if (osdmap.require_osd_release < ceph_release_t::octopus &&
1874 tmp.require_osd_release >= ceph_release_t::octopus) {
1875 dout(10) << __func__ << " first octopus+ epoch" << dendl;
1876
1877 // adjust obsoleted cache modes
1878 for (auto& [poolid, pi] : tmp.pools) {
1879 if (pi.cache_mode == pg_pool_t::CACHEMODE_FORWARD) {
1880 if (pending_inc.new_pools.count(poolid) == 0) {
1881 pending_inc.new_pools[poolid] = pi;
1882 }
1883 dout(10) << __func__ << " switching pool " << poolid
1884 << " cachemode from forward -> proxy" << dendl;
1885 pending_inc.new_pools[poolid].cache_mode = pg_pool_t::CACHEMODE_PROXY;
1886 }
1887 if (pi.cache_mode == pg_pool_t::CACHEMODE_READFORWARD) {
1888 if (pending_inc.new_pools.count(poolid) == 0) {
1889 pending_inc.new_pools[poolid] = pi;
1890 }
1891 dout(10) << __func__ << " switching pool " << poolid
1892 << " cachemode from readforward -> readproxy" << dendl;
1893 pending_inc.new_pools[poolid].cache_mode =
1894 pg_pool_t::CACHEMODE_READPROXY;
1895 }
1896 }
1897
1898 // clear removed_snaps for every pool
1899 for (auto& [poolid, pi] : tmp.pools) {
1900 if (pi.removed_snaps.empty()) {
1901 continue;
1902 }
1903 if (pending_inc.new_pools.count(poolid) == 0) {
1904 pending_inc.new_pools[poolid] = pi;
1905 }
1906 dout(10) << __func__ << " clearing pool " << poolid << " removed_snaps"
1907 << dendl;
1908 pending_inc.new_pools[poolid].removed_snaps.clear();
1909 }
1910
1911 // create a combined purged snap epoch key for all purged snaps
1912 // prior to this epoch, and store it in the current epoch (i.e.,
1913 // the last pre-octopus epoch, just prior to the one we're
1914 // encoding now).
f67539c2 1915 auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
9f95a23c
TL
1916 it->lower_bound("purged_snap_");
1917 map<int64_t,snap_interval_set_t> combined;
1918 while (it->valid()) {
1919 if (it->key().find("purged_snap_") != 0) {
1920 break;
1921 }
1922 string k = it->key();
1923 long long unsigned pool;
1924 int n = sscanf(k.c_str(), "purged_snap_%llu_", &pool);
1925 if (n != 1) {
1926 derr << __func__ << " invalid purged_snaps key '" << k << "'" << dendl;
1927 } else {
1928 bufferlist v = it->value();
1929 auto p = v.cbegin();
1930 snapid_t begin, end;
1931 ceph::decode(begin, p);
1932 ceph::decode(end, p);
1933 combined[pool].insert(begin, end - begin);
1934 }
1935 it->next();
1936 }
1937 if (!combined.empty()) {
1938 string k = make_purged_snap_epoch_key(pending_inc.epoch - 1);
1939 bufferlist v;
1940 ceph::encode(combined, v);
1941 t->put(OSD_SNAP_PREFIX, k, v);
1942 dout(10) << __func__ << " recording pre-octopus purged_snaps in epoch "
1943 << (pending_inc.epoch - 1) << ", " << v.length() << " bytes"
1944 << dendl;
1945 } else {
1946 dout(10) << __func__ << " there were no pre-octopus purged snaps"
1947 << dendl;
1948 }
1949
1950 // clean out the old removed_snap_ and removed_epoch keys
1951 // ('`' is ASCII '_' + 1)
1952 t->erase_range(OSD_SNAP_PREFIX, "removed_snap_", "removed_snap`");
1953 t->erase_range(OSD_SNAP_PREFIX, "removed_epoch_", "removed_epoch`");
1954 }
7c673cae
FG
1955 }
1956
1957 // tell me about it
31f18b77 1958 for (auto i = pending_inc.new_state.begin();
7c673cae
FG
1959 i != pending_inc.new_state.end();
1960 ++i) {
1961 int s = i->second ? i->second : CEPH_OSD_UP;
f6b5b4d7 1962 if (s & CEPH_OSD_UP) {
7c673cae 1963 dout(2) << " osd." << i->first << " DOWN" << dendl;
f6b5b4d7
TL
1964 // Reset laggy parameters if failure interval exceeds a threshold.
1965 const osd_xinfo_t& xi = osdmap.get_xinfo(i->first);
1966 if ((xi.laggy_probability || xi.laggy_interval) && xi.down_stamp.sec()) {
1967 int last_failure_interval = pending_inc.modified.sec() - xi.down_stamp.sec();
1968 if (grace_interval_threshold_exceeded(last_failure_interval)) {
1969 set_default_laggy_params(i->first);
1970 }
1971 }
1972 }
7c673cae
FG
1973 if (s & CEPH_OSD_EXISTS)
1974 dout(2) << " osd." << i->first << " DNE" << dendl;
1975 }
11fdf7f2 1976 for (auto i = pending_inc.new_up_client.begin();
7c673cae
FG
1977 i != pending_inc.new_up_client.end();
1978 ++i) {
1979 //FIXME: insert cluster addresses too
1980 dout(2) << " osd." << i->first << " UP " << i->second << dendl;
1981 }
1982 for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
1983 i != pending_inc.new_weight.end();
1984 ++i) {
1985 if (i->second == CEPH_OSD_OUT) {
1986 dout(2) << " osd." << i->first << " OUT" << dendl;
1987 } else if (i->second == CEPH_OSD_IN) {
1988 dout(2) << " osd." << i->first << " IN" << dendl;
1989 } else {
1990 dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
1991 }
1992 }
1993
1994 // features for osdmap and its incremental
28e407b8 1995 uint64_t features;
7c673cae
FG
1996
1997 // encode full map and determine its crc
1998 OSDMap tmp;
1999 {
2000 tmp.deepish_copy_from(osdmap);
2001 tmp.apply_incremental(pending_inc);
2002
2003 // determine appropriate features
28e407b8
AA
2004 features = tmp.get_encoding_features();
2005 dout(10) << __func__ << " encoding full map with "
9f95a23c 2006 << tmp.require_osd_release
28e407b8
AA
2007 << " features " << features << dendl;
2008
2009 // the features should be a subset of the mon quorum's features!
f67539c2 2010 ceph_assert((features & ~mon.get_quorum_con_features()) == 0);
7c673cae
FG
2011
2012 bufferlist fullbl;
11fdf7f2 2013 encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
7c673cae
FG
2014 pending_inc.full_crc = tmp.get_crc();
2015
2016 // include full map in the txn. note that old monitors will
2017 // overwrite this. new ones will now skip the local full map
2018 // encode and reload from this.
2019 put_version_full(t, pending_inc.epoch, fullbl);
2020 }
2021
2022 // encode
11fdf7f2
TL
2023 ceph_assert(get_last_committed() + 1 == pending_inc.epoch);
2024 bufferlist bl;
2025 encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);
7c673cae
FG
2026
2027 dout(20) << " full_crc " << tmp.get_crc()
2028 << " inc_crc " << pending_inc.inc_crc << dendl;
2029
2030 /* put everything in the transaction */
2031 put_version(t, pending_inc.epoch, bl);
2032 put_last_committed(t, pending_inc.epoch);
2033
2034 // metadata, too!
2035 for (map<int,bufferlist>::iterator p = pending_metadata.begin();
2036 p != pending_metadata.end();
20effc67
TL
2037 ++p) {
2038 Metadata m;
2039 auto mp = p->second.cbegin();
2040 decode(m, mp);
2041 auto it = m.find("osd_objectstore");
2042 if (it != m.end()) {
2043 if (it->second == "filestore") {
2044 filestore_osds.insert(p->first);
2045 } else {
2046 filestore_osds.erase(p->first);
2047 }
2048 }
7c673cae 2049 t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
20effc67 2050 }
7c673cae
FG
2051 for (set<int>::iterator p = pending_metadata_rm.begin();
2052 p != pending_metadata_rm.end();
20effc67
TL
2053 ++p) {
2054 filestore_osds.erase(*p);
7c673cae 2055 t->erase(OSD_METADATA_PREFIX, stringify(*p));
20effc67 2056 }
7c673cae
FG
2057 pending_metadata.clear();
2058 pending_metadata_rm.clear();
2059
9f95a23c
TL
2060 // purged_snaps
2061 if (tmp.require_osd_release >= ceph_release_t::octopus &&
2062 !pending_inc.new_purged_snaps.empty()) {
2063 // all snaps purged this epoch (across all pools)
2064 string k = make_purged_snap_epoch_key(pending_inc.epoch);
2065 bufferlist v;
2066 encode(pending_inc.new_purged_snaps, v);
2067 t->put(OSD_SNAP_PREFIX, k, v);
2068 }
2069 for (auto& i : pending_inc.new_purged_snaps) {
2070 for (auto q = i.second.begin();
2071 q != i.second.end();
2072 ++q) {
2073 insert_purged_snap_update(i.first, q.get_start(), q.get_end(),
2074 pending_inc.epoch,
2075 t);
11fdf7f2 2076 }
9f95a23c
TL
2077 }
2078 for (auto& [pool, snaps] : pending_pseudo_purged_snaps) {
2079 for (auto snap : snaps) {
2080 insert_purged_snap_update(pool, snap, snap + 1,
2081 pending_inc.epoch,
2082 t);
7c673cae 2083 }
7c673cae 2084 }
224ce89b
WB
2085
2086 // health
2087 health_check_map_t next;
92f5a8d4 2088 tmp.check_health(cct, &next);
20effc67
TL
2089 // OSD_FILESTORE
2090 check_for_filestore_osds(&next);
224ce89b 2091 encode_health(next, t);
7c673cae
FG
2092}
2093
7c673cae
FG
2094int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
2095{
2096 bufferlist bl;
f67539c2 2097 int r = mon.store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
7c673cae
FG
2098 if (r < 0)
2099 return r;
2100 try {
11fdf7f2
TL
2101 auto p = bl.cbegin();
2102 decode(m, p);
7c673cae 2103 }
f67539c2 2104 catch (ceph::buffer::error& e) {
7c673cae
FG
2105 if (err)
2106 *err << "osd." << osd << " metadata is corrupt";
2107 return -EIO;
2108 }
2109 return 0;
2110}
2111
c07f9fc5 2112void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
31f18b77 2113{
31f18b77
FG
2114 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2115 if (osdmap.is_up(osd)) {
2116 map<string,string> meta;
2117 load_metadata(osd, meta, nullptr);
2118 auto p = meta.find(field);
2119 if (p == meta.end()) {
c07f9fc5 2120 (*out)["unknown"]++;
31f18b77 2121 } else {
c07f9fc5 2122 (*out)[p->second]++;
31f18b77
FG
2123 }
2124 }
2125 }
c07f9fc5
FG
2126}
2127
2128void OSDMonitor::count_metadata(const string& field, Formatter *f)
2129{
2130 map<string,int> by_val;
2131 count_metadata(field, &by_val);
31f18b77
FG
2132 f->open_object_section(field.c_str());
2133 for (auto& p : by_val) {
2134 f->dump_int(p.first.c_str(), p.second);
2135 }
2136 f->close_section();
2137}
2138
f67539c2
TL
2139void OSDMonitor::get_versions(std::map<string, list<string>> &versions)
2140{
2141 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2142 if (osdmap.is_up(osd)) {
2143 map<string,string> meta;
2144 load_metadata(osd, meta, nullptr);
2145 auto p = meta.find("ceph_version_short");
2146 if (p == meta.end()) continue;
2147 versions[p->second].push_back(string("osd.") + stringify(osd));
2148 }
2149 }
2150}
2151
7c673cae
FG
2152int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
2153{
2154 map<string, string> metadata;
2155 int r = load_metadata(osd, metadata, nullptr);
2156 if (r < 0)
2157 return r;
2158
2159 auto it = metadata.find("osd_objectstore");
2160 if (it == metadata.end())
2161 return -ENOENT;
2162 *type = it->second;
2163 return 0;
2164}
2165
20effc67
TL
2166void OSDMonitor::get_filestore_osd_list()
2167{
2168 for (unsigned osd = 0; osd < osdmap.get_num_osds(); ++osd) {
2169 string objectstore_type;
2170 int r = get_osd_objectstore_type(osd, &objectstore_type);
2171 if (r == 0 && objectstore_type == "filestore") {
2172 filestore_osds.insert(osd);
2173 }
2174 }
2175}
2176
2177void OSDMonitor::check_for_filestore_osds(health_check_map_t *checks)
2178{
2179 if (g_conf()->mon_warn_on_filestore_osds &&
2180 filestore_osds.size() > 0) {
2181 ostringstream ss, deprecated_tip;
2182 list<string> detail;
2183 ss << filestore_osds.size()
2184 << " osd(s) "
2185 << (filestore_osds.size() == 1 ? "is" : "are")
2186 << " running Filestore";
2187 deprecated_tip << ss.str();
2188 ss << " [Deprecated]";
2189 auto& d = checks->add("OSD_FILESTORE", HEALTH_WARN, ss.str(),
2190 filestore_osds.size());
2191 deprecated_tip << ", which has been deprecated and"
2192 << " not been optimized for QoS"
2193 << " (Filestore OSDs will use 'osd_op_queue = wpq' strictly)";
2194 detail.push_back(deprecated_tip.str());
2195 d.detail.swap(detail);
2196 }
2197}
2198
7c673cae
FG
2199bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
2200 const pg_pool_t &pool,
2201 ostream *err)
2202{
2203 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2204 // since filestore osds could always join the pool later
2205 set<int> checked_osds;
11fdf7f2 2206 for (unsigned ps = 0; ps < std::min(8u, pool.get_pg_num()); ++ps) {
7c673cae 2207 vector<int> up, acting;
11fdf7f2 2208 pg_t pgid(ps, pool_id);
7c673cae
FG
2209 osdmap.pg_to_up_acting_osds(pgid, up, acting);
2210 for (int osd : up) {
2211 if (checked_osds.find(osd) != checked_osds.end())
2212 continue;
2213 string objectstore_type;
2214 int r = get_osd_objectstore_type(osd, &objectstore_type);
2215 // allow with missing metadata, e.g. due to an osd never booting yet
2216 if (r < 0 || objectstore_type == "bluestore") {
2217 checked_osds.insert(osd);
2218 continue;
2219 }
2220 *err << "osd." << osd << " uses " << objectstore_type;
2221 return false;
2222 }
2223 }
2224 return true;
2225}
2226
2227int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
2228{
2229 map<string,string> m;
2230 if (int r = load_metadata(osd, m, err))
2231 return r;
2232 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
2233 f->dump_string(p->first.c_str(), p->second);
2234 return 0;
2235}
2236
2237void OSDMonitor::print_nodes(Formatter *f)
2238{
2239 // group OSDs by their hosts
2240 map<string, list<int> > osds; // hostname => osd
2241 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
2242 map<string, string> m;
2243 if (load_metadata(osd, m, NULL)) {
2244 continue;
2245 }
2246 map<string, string>::iterator hostname = m.find("hostname");
2247 if (hostname == m.end()) {
2248 // not likely though
2249 continue;
2250 }
2251 osds[hostname->second].push_back(osd);
2252 }
2253
2254 dump_services(f, osds, "osd");
2255}
2256
void OSDMonitor::share_map_with_random_osd()
{
  // Push the latest committed map to one randomly chosen up osd so it
  // can spread it to its peers; a no-op if no osd is up or none has a
  // session with this monitor.
  if (osdmap.get_num_up_osds() == 0) {
    dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
    return;
  }

  MonSession *s = mon.session_map.get_random_osd_session(&osdmap);
  if (!s) {
    dout(10) << __func__ << " no up osd on our session map" << dendl;
    return;
  }

  dout(10) << "committed, telling random " << s->name
	   << " all about it" << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = s->con_features ? s->con_features :
    mon.get_quorum_con_features();
  // whatev, they'll request more if they need it
  // (send only the single newest incremental, encoded for the peer's
  // feature set)
  MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
  s->con->send_message(m);
  // NOTE: do *not* record osd has up to this epoch (as we do
  // elsewhere) as they may still need to request older values.
}
2283
11fdf7f2 2284version_t OSDMonitor::get_trim_to() const
7c673cae 2285{
f67539c2
TL
2286 if (mon.get_quorum().empty()) {
2287 dout(10) << __func__ << " quorum not formed, trim_to = 0" << dendl;
31f18b77
FG
2288 return 0;
2289 }
7c673cae 2290
11fdf7f2
TL
2291 {
2292 std::lock_guard<std::mutex> l(creating_pgs_lock);
2293 if (!creating_pgs.pgs.empty()) {
f67539c2 2294 dout(10) << __func__ << " pgs creating, trim_to = 0" << dendl;
7c673cae
FG
2295 return 0;
2296 }
7c673cae 2297 }
11fdf7f2
TL
2298
2299 if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) {
2300 dout(0) << __func__
2301 << " blocking osdmap trim"
f67539c2
TL
2302 << " ('mon_debug_block_osdmap_trim' set to 'true')"
2303 << " trim_to = 0" << dendl;
11fdf7f2
TL
2304 return 0;
2305 }
2306
7c673cae 2307 {
11fdf7f2 2308 epoch_t floor = get_min_last_epoch_clean();
7c673cae 2309 dout(10) << " min_last_epoch_clean " << floor << dendl;
11fdf7f2
TL
2310 if (g_conf()->mon_osd_force_trim_to > 0 &&
2311 g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) {
2312 floor = g_conf()->mon_osd_force_trim_to;
f67539c2
TL
2313 dout(10) << __func__
2314 << " explicit mon_osd_force_trim_to = " << floor << dendl;
7c673cae 2315 }
11fdf7f2 2316 unsigned min = g_conf()->mon_min_osdmap_epochs;
7c673cae
FG
2317 if (floor + min > get_last_committed()) {
2318 if (min < get_last_committed())
2319 floor = get_last_committed() - min;
2320 else
2321 floor = 0;
2322 }
f67539c2
TL
2323 if (floor > get_first_committed()) {
2324 dout(10) << __func__ << " trim_to = " << floor << dendl;
7c673cae 2325 return floor;
f67539c2 2326 }
7c673cae 2327 }
f67539c2 2328 dout(10) << __func__ << " trim_to = 0" << dendl;
7c673cae
FG
2329 return 0;
2330}
2331
2332epoch_t OSDMonitor::get_min_last_epoch_clean() const
2333{
2334 auto floor = last_epoch_clean.get_lower_bound(osdmap);
2335 // also scan osd epochs
2336 // don't trim past the oldest reported osd epoch
f67539c2
TL
2337 for (auto [osd, epoch] : osd_epochs) {
2338 if (epoch < floor) {
2339 floor = epoch;
7c673cae
FG
2340 }
2341 }
2342 return floor;
2343}
2344
void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
				   version_t first)
{
  // When trimming up to 'first', write the full map for that epoch into
  // the transaction so the store always retains a full map at its new
  // first_committed version.
  dout(10) << __func__ << " including full map for e " << first << dendl;
  bufferlist bl;
  get_version_full(first, bl);
  put_version_full(tx, first, bl);

  // if a prune manifest exists and this trim advances past its first
  // pinned map, drop the now-trimmed pins from the manifest too
  if (has_osdmap_manifest &&
      first > osdmap_manifest.get_first_pinned()) {
    _prune_update_trimmed(tx, first);
  }
}
2358
11fdf7f2
TL
2359
2360/* full osdmap prune
2361 *
2362 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2363 */
2364
2365void OSDMonitor::load_osdmap_manifest()
2366{
2367 bool store_has_manifest =
f67539c2 2368 mon.store->exists(get_service_name(), "osdmap_manifest");
11fdf7f2
TL
2369
2370 if (!store_has_manifest) {
2371 if (!has_osdmap_manifest) {
2372 return;
2373 }
2374
2375 dout(20) << __func__
2376 << " dropping osdmap manifest from memory." << dendl;
2377 osdmap_manifest = osdmap_manifest_t();
2378 has_osdmap_manifest = false;
2379 return;
2380 }
2381
2382 dout(20) << __func__
2383 << " osdmap manifest detected in store; reload." << dendl;
2384
2385 bufferlist manifest_bl;
2386 int r = get_value("osdmap_manifest", manifest_bl);
2387 if (r < 0) {
2388 derr << __func__ << " unable to read osdmap version manifest" << dendl;
2389 ceph_abort_msg("error reading manifest");
2390 }
2391 osdmap_manifest.decode(manifest_bl);
2392 has_osdmap_manifest = true;
2393
2394 dout(10) << __func__ << " store osdmap manifest pinned ("
2395 << osdmap_manifest.get_first_pinned()
2396 << " .. "
2397 << osdmap_manifest.get_last_pinned()
2398 << ")"
2399 << dendl;
2400}
2401
bool OSDMonitor::should_prune() const
{
  // Decide whether a full-osdmap prune pass is warranted, based on how
  // many epochs we hold, how far the manifest has already pinned, and
  // the configured minimums.  Purely advisory -- no state is modified.
  version_t first = get_first_committed();
  version_t last = get_last_committed();
  version_t min_osdmap_epochs =
    g_conf().get_val<int64_t>("mon_min_osdmap_epochs");
  version_t prune_min =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
  version_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  version_t last_pinned = osdmap_manifest.get_last_pinned();
  // newest epoch we would be allowed to pin (keep min_osdmap_epochs
  // untouched at the tail)
  version_t last_to_pin = last - min_osdmap_epochs;

  // Make it or break it constraints.
  //
  // If any of these conditions fails, we will not prune, regardless of
  // whether we have an on-disk manifest with an on-going pruning state.
  //
  if ((last - first) <= min_osdmap_epochs) {
    // between the first and last committed epochs, we don't have
    // enough epochs to trim, much less to prune.
    dout(10) << __func__
	     << " currently holding only " << (last - first)
	     << " epochs (min osdmap epochs: " << min_osdmap_epochs
	     << "); do not prune."
	     << dendl;
    return false;

  } else if ((last_to_pin - first) < prune_min) {
    // between the first committed epoch and the last epoch we would prune,
    // we simply don't have enough versions over the minimum to prune maps.
    dout(10) << __func__
	     << " could only prune " << (last_to_pin - first)
	     << " epochs (" << first << ".." << last_to_pin << "), which"
	        " is less than the required minimum (" << prune_min << ")"
	     << dendl;
    return false;

  } else if (has_osdmap_manifest && last_pinned >= last_to_pin) {
    dout(10) << __func__
	     << " we have pruned as far as we can; do not prune."
	     << dendl;
    return false;

  } else if (last_pinned + prune_interval > last_to_pin) {
    dout(10) << __func__
	     << " not enough epochs to form an interval (last pinned: "
	     << last_pinned << ", last to pin: "
	     << last_to_pin << ", interval: " << prune_interval << ")"
	     << dendl;
    return false;
  }

  dout(15) << __func__
	   << " should prune (" << last_pinned << ".." << last_to_pin << ")"
	   << " lc (" << first << ".." << last << ")"
	   << dendl;
  return true;
}
2461
void OSDMonitor::_prune_update_trimmed(
    MonitorDBStore::TransactionRef tx,
    version_t first)
{
  // A trim just advanced first_committed to 'first'; bring the pinned-map
  // manifest in line by pinning 'first' and discarding every pin below
  // it.  If that leaves nothing meaningful to resume from, erase the
  // stored manifest entirely.
  dout(10) << __func__
	   << " first " << first
	   << " last_pinned " << osdmap_manifest.get_last_pinned()
	   << dendl;

  osdmap_manifest_t manifest = osdmap_manifest;

  if (!manifest.is_pinned(first)) {
    manifest.pin(first);
  }

  // drop all pins strictly below 'first'
  set<version_t>::iterator p_end = manifest.pinned.find(first);
  set<version_t>::iterator p = manifest.pinned.begin();
  manifest.pinned.erase(p, p_end);
  ceph_assert(manifest.get_first_pinned() == first);

  if (manifest.get_last_pinned() == first+1 ||
      manifest.pinned.size() == 1) {
    // we reached the end of the line, as pinned maps go; clean up our
    // manifest, and let `should_prune()` decide whether we should prune
    // again.
    tx->erase(get_service_name(), "osdmap_manifest");
    return;
  }

  bufferlist bl;
  manifest.encode(bl);
  tx->put(get_service_name(), "osdmap_manifest", bl);
}
2495
void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
{
  // Seed 'manifest' with the first map to pin for this prune pass:
  // first_committed when we have never pruned (or prior prune state was
  // fully trimmed away), otherwise the last pin of the existing
  // manifest so we resume where the previous pass stopped.
  dout(1) << __func__ << dendl;

  version_t pin_first;

  // verify constraints on stable in-memory state
  if (!has_osdmap_manifest) {
    // we must have never pruned, OR if we pruned the state must no longer
    // be relevant (i.e., the state must have been removed alongside with
    // the trim that *must* have removed past the last pinned map in a
    // previous prune).
    ceph_assert(osdmap_manifest.pinned.empty());
    ceph_assert(!mon.store->exists(get_service_name(), "osdmap_manifest"));
    pin_first = get_first_committed();

  } else {
    // we must have pruned in the past AND its state is still relevant
    // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
    // and thus we still hold a manifest in the store).
    ceph_assert(!osdmap_manifest.pinned.empty());
    ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
    ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());

    dout(10) << __func__
	     << " first_pinned " << osdmap_manifest.get_first_pinned()
	     << " last_pinned " << osdmap_manifest.get_last_pinned()
	     << dendl;

    pin_first = osdmap_manifest.get_last_pinned();
  }

  manifest.pin(pin_first);
}
2530
2531bool OSDMonitor::_prune_sanitize_options() const
2532{
2533 uint64_t prune_interval =
2534 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2535 uint64_t prune_min =
2536 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
2537 uint64_t txsize =
2538 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2539
2540 bool r = true;
2541
2542 if (prune_interval == 0) {
2543 derr << __func__
2544 << " prune is enabled BUT prune interval is zero; abort."
2545 << dendl;
2546 r = false;
2547 } else if (prune_interval == 1) {
2548 derr << __func__
2549 << " prune interval is equal to one, which essentially means"
2550 " no pruning; abort."
2551 << dendl;
2552 r = false;
2553 }
2554 if (prune_min == 0) {
2555 derr << __func__
2556 << " prune is enabled BUT prune min is zero; abort."
2557 << dendl;
2558 r = false;
2559 }
2560 if (prune_interval > prune_min) {
2561 derr << __func__
2562 << " impossible to ascertain proper prune interval because"
2563 << " it is greater than the minimum prune epochs"
2564 << " (min: " << prune_min << ", interval: " << prune_interval << ")"
2565 << dendl;
2566 r = false;
2567 }
2568
2569 if (txsize < prune_interval - 1) {
2570 derr << __func__
f67539c2 2571 << " 'mon_osdmap_full_prune_txsize' (" << txsize
11fdf7f2
TL
2572 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
2573 << "); abort." << dendl;
2574 r = false;
2575 }
2576 return r;
2577}
2578
2579bool OSDMonitor::is_prune_enabled() const {
2580 return g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
2581}
2582
2583bool OSDMonitor::is_prune_supported() const {
f67539c2 2584 return mon.get_required_mon_features().contains_any(
11fdf7f2
TL
2585 ceph::features::mon::FEATURE_OSDMAP_PRUNE);
2586}
2587
/** do_prune
 *
 * Prune full osdmaps from the store: pin one map every
 * 'mon_osdmap_full_prune_interval' epochs and erase the maps in
 * between, bounded by roughly 'mon_osdmap_full_prune_txsize' removals
 * per call.  The updated pin manifest is written into @p tx.
 *
 * @returns true if has side-effects; false otherwise.
 */
bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
{
  bool enabled = is_prune_enabled();

  dout(1) << __func__ << " osdmap full prune "
	  << ( enabled ? "enabled" : "disabled")
	  << dendl;

  // bail out unless pruning is enabled, the tunables are sane, and
  // there is actually enough history to prune
  if (!enabled || !_prune_sanitize_options() || !should_prune()) {
    return false;
  }

  // we are beyond the minimum prune versions, we need to remove maps because
  // otherwise the store will grow unbounded and we may end up having issues
  // with available disk space or store hangs.

  // we will not pin all versions. We will leave a buffer number of versions.
  // this allows us the monitor to trim maps without caring too much about
  // pinned maps, and then allow us to use another ceph-mon without these
  // capabilities, without having to repair the store.

  osdmap_manifest_t manifest = osdmap_manifest;

  version_t first = get_first_committed();
  version_t last = get_last_committed();

  version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
  version_t last_pinned = manifest.get_last_pinned();
  uint64_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  uint64_t txsize =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");

  prune_init(manifest);

  // we need to get rid of some osdmaps

  dout(5) << __func__
	  << " lc (" << first << " .. " << last << ")"
	  << " last_pinned " << last_pinned
	  << " interval " << prune_interval
	  << " last_to_pin " << last_to_pin
	  << dendl;

  // We will be erasing maps as we go.
  //
  // We will erase all maps between `last_pinned` and the `next_to_pin`.
  //
  // If `next_to_pin` happens to be greater than `last_to_pin`, then
  // we stop pruning. We could prune the maps between `next_to_pin` and
  // `last_to_pin`, but by not doing it we end up with neater pruned
  // intervals, aligned with `prune_interval`. Besides, this should not be a
  // problem as long as `prune_interval` is set to a sane value, instead of
  // hundreds or thousands of maps.

  auto map_exists = [this](version_t v) {
    string k = mon.store->combine_strings("full", v);
    return mon.store->exists(get_service_name(), k);
  };

  // 'interval' represents the number of maps from the last pinned
  // i.e., if we pinned version 1 and have an interval of 10, we're pinning
  // version 11 next; all intermediate versions will be removed.
  //
  // 'txsize' represents the maximum number of versions we'll be removing in
  // this iteration. If 'txsize' is large enough to perform multiple passes
  // pinning and removing maps, we will do so; if not, we'll do at least one
  // pass. We are quite relaxed about honouring 'txsize', but we'll always
  // ensure that we never go *over* the maximum.

  // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
  uint64_t removal_interval = prune_interval - 1;

  if (txsize < removal_interval) {
    dout(5) << __func__
	    << " setting txsize to removal interval size ("
	    << removal_interval << " versions"
	    << dendl;
    txsize = removal_interval;
  }
  ceph_assert(removal_interval > 0);

  uint64_t num_pruned = 0;
  while (num_pruned + removal_interval <= txsize) {
    last_pinned = manifest.get_last_pinned();

    // stop once the next pin would land past the allowed range
    if (last_pinned + prune_interval > last_to_pin) {
      break;
    }
    ceph_assert(last_pinned < last_to_pin);

    version_t next_pinned = last_pinned + prune_interval;
    ceph_assert(next_pinned <= last_to_pin);
    manifest.pin(next_pinned);

    dout(20) << __func__
	     << " last_pinned " << last_pinned
	     << " next_pinned " << next_pinned
	     << " num_pruned " << num_pruned
	     << " removal interval (" << (last_pinned+1)
	     << ".." << (next_pinned-1) << ")"
	     << " txsize " << txsize << dendl;

    ceph_assert(map_exists(last_pinned));
    ceph_assert(map_exists(next_pinned));

    // erase every full map strictly between the two pins
    for (version_t v = last_pinned+1; v < next_pinned; ++v) {
      ceph_assert(!manifest.is_pinned(v));

      dout(20) << __func__ << " pruning full osdmap e" << v << dendl;
      string full_key = mon.store->combine_strings("full", v);
      tx->erase(get_service_name(), full_key);
      ++num_pruned;
    }
  }

  // should_prune() said yes, so at least one interval must have fit
  ceph_assert(num_pruned > 0);

  bufferlist bl;
  manifest.encode(bl);
  tx->put(get_service_name(), "osdmap_manifest", bl);

  return true;
}
2716
2717
7c673cae
FG
2718// -------------
2719
bool OSDMonitor::preprocess_query(MonOpRequestRef op)
{
  // First-pass dispatch of an incoming message: handle reads (and
  // updates that turn out to be no-ops) against the current map without
  // starting a paxos proposal.  Returns true if the op was fully
  // handled here; false means it must proceed to prepare_update().
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // READs
  case MSG_MON_COMMAND:
    try {
      return preprocess_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: reply with EINVAL rather than crash
      bufferlist bl;
      mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }
  case CEPH_MSG_MON_GET_OSDMAP:
    return preprocess_get_osdmap(op);

    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return preprocess_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return preprocess_mark_me_dead(op);
  case MSG_OSD_FULL:
    return preprocess_full(op);
  case MSG_OSD_FAILURE:
    return preprocess_failure(op);
  case MSG_OSD_BOOT:
    return preprocess_boot(op);
  case MSG_OSD_ALIVE:
    return preprocess_alive(op);
  case MSG_OSD_PG_CREATED:
    return preprocess_pg_created(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return preprocess_pg_ready_to_merge(op);
  case MSG_OSD_PGTEMP:
    return preprocess_pgtemp(op);
  case MSG_OSD_BEACON:
    return preprocess_beacon(op);

  case CEPH_MSG_POOLOP:
    return preprocess_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return preprocess_remove_snaps(op);

  case MSG_MON_GET_PURGED_SNAPS:
    return preprocess_get_purged_snaps(op);

  default:
    // every message type routed to this service must be listed above
    ceph_abort();
    return true;
  }
}
2775
bool OSDMonitor::prepare_update(MonOpRequestRef op)
{
  // Second-pass dispatch for messages that preprocess_query() did not
  // fully handle: these may mutate pending_inc.  The return value is
  // the PaxosService convention (true to request a proposal).
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return prepare_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return prepare_mark_me_dead(op);
  case MSG_OSD_FULL:
    return prepare_full(op);
  case MSG_OSD_FAILURE:
    return prepare_failure(op);
  case MSG_OSD_BOOT:
    return prepare_boot(op);
  case MSG_OSD_ALIVE:
    return prepare_alive(op);
  case MSG_OSD_PG_CREATED:
    return prepare_pg_created(op);
  case MSG_OSD_PGTEMP:
    return prepare_pgtemp(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return prepare_pg_ready_to_merge(op);
  case MSG_OSD_BEACON:
    return prepare_beacon(op);

  case MSG_MON_COMMAND:
    try {
      return prepare_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: reply with EINVAL rather than crash
      bufferlist bl;
      mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }

  case CEPH_MSG_POOLOP:
    return prepare_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return prepare_remove_snaps(op);


  default:
    // every message type routed to this service must be listed above
    ceph_abort();
  }

  return false;
}
2827
/**
 * Decide whether the pending incremental should be proposed to paxos now.
 *
 * @param[out] delay  set to 0.0 when an immediate proposal is wanted
 * @return true to propose now, otherwise defers to the PaxosService
 *         default policy.
 *
 * NOTE: this is not a pure predicate — when a full set of osd weights
 * has accumulated it folds them into pending_inc and clears osd_weight
 * as a side effect.
 */
bool OSDMonitor::should_propose(double& delay)
{
  dout(10) << "should_propose" << dendl;

  // if full map, propose immediately! any subsequent changes will be clobbered.
  if (pending_inc.fullmap.length())
    return true;

  // adjust osd weights?  only once we have a weight for every osd.
  if (!osd_weight.empty() &&
      osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
    dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
    osdmap.adjust_osd_weights(osd_weight, pending_inc);
    delay = 0.0;
    osd_weight.clear();
    return true;
  }

  return PaxosService::should_propose(delay);
}
2848
2849
2850
2851// ---------------------------
2852// READs
2853
/**
 * Answer an MMonGetOSDMap request with a batch of full and incremental
 * maps.
 *
 * Maps are encoded with the requester's connection features when known
 * (falling back to the quorum feature set) so the client can decode
 * them.  The batch is bounded both by osd_map_message_max entries and
 * osd_map_message_max_bytes total payload; the requested ranges are
 * clamped to [first_committed, current_epoch].  Always fully handles the
 * request (returns true).
 */
bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MMonGetOSDMap>();

  uint64_t features = mon.get_quorum_con_features();
  if (op->get_session() && op->get_session()->con_features)
    features = op->get_session()->con_features;

  dout(10) << __func__ << " " << *m << dendl;
  MOSDMap *reply = new MOSDMap(mon.monmap->fsid, features);
  epoch_t first = get_first_committed();
  epoch_t last = osdmap.get_epoch();
  int max = g_conf()->osd_map_message_max;
  ssize_t max_bytes = g_conf()->osd_map_message_max_bytes;
  // full maps first; both count and byte budgets are shared with the
  // incremental loop below
  for (epoch_t e = std::max(first, m->get_full_first());
       e <= std::min(last, m->get_full_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->maps[e];
    int r = get_version_full(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  for (epoch_t e = std::max(first, m->get_inc_first());
       e <= std::min(last, m->get_inc_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->incremental_maps[e];
    int r = get_version(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  // tell the client how far back our history goes and how new we are
  reply->cluster_osdmap_trim_lower_bound = first;
  reply->newest_map = last;
  mon.send_reply(op, reply);
  return true;
}
2890
2891
2892// ---------------------------
2893// UPDATEs
2894
2895// failure --
2896
11fdf7f2 2897bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
7c673cae 2898 // check permissions
11fdf7f2 2899 MonSession *session = op->get_session();
7c673cae
FG
2900 if (!session)
2901 return true;
2902 if (!session->is_capable("osd", MON_CAP_X)) {
2903 dout(0) << "got MOSDFailure from entity with insufficient caps "
2904 << session->caps << dendl;
2905 return true;
2906 }
f67539c2 2907 if (fsid != mon.monmap->fsid) {
7c673cae 2908 dout(0) << "check_source: on fsid " << fsid
f67539c2 2909 << " != " << mon.monmap->fsid << dendl;
7c673cae
FG
2910 return true;
2911 }
2912 return false;
2913}
2914
2915
/**
 * Sanity-check an MOSDFailure report before it reaches prepare_failure().
 *
 * Returns true (and swallows the op without reply) when the report is
 * stale, duplicate, from an invalid reporter, or cannot be acted on;
 * returns false to let prepare_failure() record it.  Where the reporter
 * is simply behind, we push incremental maps so it can catch up.
 */
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  // who is target_osd
  int badboy = m->get_target_osd();

  // check permissions
  if (check_source(op, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
	!osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
	(osdmap.is_down(from) && m->if_osd_failed())) {
      // reporter unknown, readdressed, or itself down while claiming a
      // failure: ignore, but help it catch up on the map
      dout(5) << "preprocess_failure from dead osd." << from
	      << ", ignoring" << dendl;
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?
  // NOTE(review): the log text says "dne(/dup?)" but the test is
  // is_down(), which also makes the is_down() half of the "already
  // reported?" check below unreachable — confirm the intended condition
  // against upstream.
  if (osdmap.is_down(badboy)) {
    dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  if (osdmap.get_addrs(badboy) != m->get_target_addrs()) {
    // report is about an old incarnation of the target
    dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << " != map's " << osdmap.get_addrs(badboy)
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  if (!can_mark_down(badboy)) {
    // nodown flag or up-ratio floor prevents acting on this report
    dout(5) << "preprocess_failure ignoring report of osd."
	    << m->get_target_osd() << " " << m->get_target_addrs()
	    << " from " << m->get_orig_source() << dendl;
    goto didit;
  }

  dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
	   << " " << m->get_target_addrs()
	   << ", from " << m->get_orig_source() << dendl;
  return false;

 didit:
  mon.no_reply(op);
  return true;
}
2987
/**
 * Completion context that acknowledges an MOSDMarkMeDown request.
 *
 * On success (r == 0) it echoes an MOSDMarkMeDown back to the requester
 * (with request_ack=false, so the ACK itself is not re-acked); on
 * -EAGAIN the op is re-dispatched through the monitor; any other result
 * is a fatal logic error.
 */
class C_AckMarkedDown : public C_MonOp {
  OSDMonitor *osdmon;  // owning service; used to reach mon and re-dispatch
public:
  C_AckMarkedDown(
    OSDMonitor *osdmon,
    MonOpRequestRef op)
    : C_MonOp(op), osdmon(osdmon) {}

  void _finish(int r) override {
    if (r == 0) {
      auto m = op->get_req<MOSDMarkMeDown>();
      osdmon->mon.send_reply(
	op,
	new MOSDMarkMeDown(
	  m->fsid,
	  m->target_osd,
	  m->target_addrs,
	  m->get_epoch(),
	  false));   // ACK itself does not request an ack
    } else if (r == -EAGAIN) {
      osdmon->dispatch(op);
    } else {
      ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r);
    }
  }
  ~C_AckMarkedDown() override {
  }
};
3016
/**
 * Validate an osd's own request to be marked down.
 *
 * Returns true when the request is rejected or already moot — in that
 * case a requested ACK is still synthesized immediately (the sender only
 * waits for the ACK, not for the map change).  Returns false to let
 * prepare_mark_me_down() stage the state change.
 */
bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDown>();
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid))
    goto reply;

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd())
    goto reply;

  // sender must still be this exact up incarnation of the osd
  if (!osdmap.exists(from) ||
      osdmap.is_down(from) ||
      osdmap.get_addrs(from) != m->target_addrs) {
    dout(5) << "preprocess_mark_me_down from dead osd."
	    << from << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    goto reply;
  }

  // no down might be set
  if (!can_mark_down(from))
    goto reply;

  dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
	   << " " << m->target_addrs << dendl;
  return false;

 reply:
  if (m->request_ack) {
    // complete the ack path right away; op is answered by C_AckMarkedDown
    Context *c(new C_AckMarkedDown(this, op));
    c->complete(0);
  }
  return true;
}
3055
/**
 * Stage an osd's self-requested down (and optionally dead) state in the
 * pending incremental.
 *
 * preprocess_mark_me_down() has already validated the sender, hence the
 * asserts.  When the osd also declared itself dead, its dead_epoch is
 * recorded in the staged xinfo.  A requested ACK is deferred until the
 * proposal commits.  Always returns true (a proposal is needed).
 */
bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDown>();
  int target_osd = m->target_osd;

  // preprocess guarantees these; if not, the dispatch logic is broken
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);

  mon.clog->info() << "osd." << target_osd << " marked itself " << ((m->down_and_dead) ? "down and dead" : "down");
  // XOR-style state flag: setting CEPH_OSD_UP here flips the osd down
  pending_inc.new_state[target_osd] = CEPH_OSD_UP;
  if (m->down_and_dead) {
    // seed staged xinfo from the current map before recording dead_epoch
    if (!pending_inc.new_xinfo.count(target_osd)) {
      pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
    }
    pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
  }
  if (m->request_ack)
    wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
  return true;
}
3077
9f95a23c
TL
/**
 * Validate an osd's declaration that it is dead (not merely down).
 *
 * Returns true (no reply) when the sender is unauthorized, not an osd,
 * unknown, or still up in the map; returns false to let
 * prepare_mark_me_dead() record the dead_epoch.
 */
bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDead>();
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid)) {
    mon.no_reply(op);
    return true;
  }

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd()) {
    mon.no_reply(op);
    return true;
  }

  // only a mapped, already-down osd can be declared dead
  if (!osdmap.exists(from) ||
      !osdmap.is_down(from)) {
    dout(5) << __func__ << " from nonexistent or up osd." << from
	    << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    mon.no_reply(op);
    return true;
  }

  return false;
}
3107
/**
 * Record an osd's self-declared death in the pending incremental.
 *
 * Stages dead_epoch in the osd's xinfo (seeding it from the current map
 * if no update is pending yet).  No reply is sent on success; the
 * LambdaContext merely marks the op as not needing one once the
 * proposal commits.  Always returns true (a proposal is needed).
 */
bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDead>();
  int target_osd = m->target_osd;

  // preprocess_mark_me_dead() guarantees the osd is down
  ceph_assert(osdmap.is_down(target_osd));

  mon.clog->info() << "osd." << target_osd << " marked itself dead as of e"
		   << m->get_epoch();
  if (!pending_inc.new_xinfo.count(target_osd)) {
    pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
  }
  pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
  wait_for_finished_proposal(
    op,
    new LambdaContext(
      [op, this] (int r) {
	if (r >= 0) {
	  mon.no_reply(op);	  // ignore on success
	}
      }
      ));
  return true;
}
3133
7c673cae
FG
3134bool OSDMonitor::can_mark_down(int i)
3135{
31f18b77
FG
3136 if (osdmap.is_nodown(i)) {
3137 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
3138 << "will not mark it down" << dendl;
7c673cae
FG
3139 return false;
3140 }
31f18b77 3141
7c673cae
FG
3142 int num_osds = osdmap.get_num_osds();
3143 if (num_osds == 0) {
31f18b77 3144 dout(5) << __func__ << " no osds" << dendl;
7c673cae
FG
3145 return false;
3146 }
3147 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
3148 float up_ratio = (float)up / (float)num_osds;
11fdf7f2 3149 if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
31f18b77 3150 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
11fdf7f2 3151 << g_conf()->mon_osd_min_up_ratio
7c673cae
FG
3152 << ", will not mark osd." << i << " down" << dendl;
3153 return false;
3154 }
3155 return true;
3156}
3157
3158bool OSDMonitor::can_mark_up(int i)
3159{
31f18b77
FG
3160 if (osdmap.is_noup(i)) {
3161 dout(5) << __func__ << " osd." << i << " is marked as noup, "
3162 << "will not mark it up" << dendl;
7c673cae
FG
3163 return false;
3164 }
31f18b77 3165
7c673cae
FG
3166 return true;
3167}
3168
3169/**
3170 * @note the parameter @p i apparently only exists here so we can output the
3171 * osd's id on messages.
3172 */
3173bool OSDMonitor::can_mark_out(int i)
3174{
31f18b77
FG
3175 if (osdmap.is_noout(i)) {
3176 dout(5) << __func__ << " osd." << i << " is marked as noout, "
3177 << "will not mark it out" << dendl;
3178 return false;
3179 }
3180
7c673cae
FG
3181 int num_osds = osdmap.get_num_osds();
3182 if (num_osds == 0) {
3183 dout(5) << __func__ << " no osds" << dendl;
3184 return false;
3185 }
3186 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
3187 float in_ratio = (float)in / (float)num_osds;
11fdf7f2 3188 if (in_ratio < g_conf()->mon_osd_min_in_ratio) {
7c673cae
FG
3189 if (i >= 0)
3190 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
11fdf7f2 3191 << g_conf()->mon_osd_min_in_ratio
7c673cae
FG
3192 << ", will not mark osd." << i << " out" << dendl;
3193 else
3194 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
11fdf7f2 3195 << g_conf()->mon_osd_min_in_ratio
7c673cae
FG
3196 << ", will not mark osds out" << dendl;
3197 return false;
3198 }
3199
3200 return true;
3201}
3202
3203bool OSDMonitor::can_mark_in(int i)
3204{
31f18b77
FG
3205 if (osdmap.is_noin(i)) {
3206 dout(5) << __func__ << " osd." << i << " is marked as noin, "
3207 << "will not mark it in" << dendl;
7c673cae
FG
3208 return false;
3209 }
31f18b77 3210
7c673cae
FG
3211 return true;
3212}
3213
/**
 * Sweep all outstanding failure reports.
 *
 * For each reported osd, attempt to conclude the failure via
 * check_failure() (which may stage a down mark in pending_inc); records
 * whose reports have gone stale are dropped instead.  Returns true if
 * at least one failure was concluded, i.e. a proposal is warranted.
 */
bool OSDMonitor::check_failures(utime_t now)
{
  bool found_failure = false;
  auto p = failure_info.begin();
  while (p != failure_info.end()) {
    auto& [target_osd, fi] = *p;
    if (can_mark_down(target_osd) &&
	check_failure(now, target_osd, fi)) {
      found_failure = true;
      ++p;
    } else if (is_failure_stale(now, fi)) {
      // reporters never cancelled and the grace window is long past:
      // forget the record (erase returns the next valid iterator)
      dout(10) << " dropping stale failure_info for osd." << target_osd
	       << " from " << fi.reporters.size() << " reporters"
	       << dendl;
      p = failure_info.erase(p);
    } else {
      ++p;
    }
  }
  return found_failure;
}
3235
b3b6e05e
TL
/**
 * Compute the heartbeat grace period to apply before concluding that
 * target_osd has failed.
 *
 * Starts from osd_heartbeat_grace and, when
 * mon_osd_adjust_heartbeat_grace is enabled, inflates it by an
 * exponentially-decayed estimate of the target's historical
 * "lagginess" plus the average lagginess of its reporters (a proxy for
 * a laggy subcluster, e.g. a rack behind a bad switch).
 */
utime_t OSDMonitor::get_grace_time(utime_t now,
				   int target_osd,
				   failure_info_t& fi) const
{
  utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
  if (!g_conf()->mon_osd_adjust_heartbeat_grace) {
    return orig_grace;
  }
  utime_t grace = orig_grace;
  double halflife = (double)g_conf()->mon_osd_laggy_halflife;
  double decay_k = ::log(.5) / halflife;

  // scale grace period based on historical probability of 'lagginess'
  // (false positive failures due to slowness).
  const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
  const utime_t failed_for = now - fi.get_failed_since();
  double decay = exp((double)failed_for * decay_k);
  dout(20) << " halflife " << halflife << " decay_k " << decay_k
	   << " failed_for " << failed_for << " decay " << decay << dendl;
  double my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
  grace += my_grace;

  // consider the peers reporting a failure a proxy for a potential
  // 'subcluster' over the overall cluster that is similarly
  // laggy. this is clearly not true in all cases, but will sometimes
  // help us localize the grace correction to a subset of the system
  // (say, a rack with a bad switch) that is unhappy.
  double peer_grace = 0;
  for (auto& [reporter, report] : fi.reporters) {
    if (osdmap.exists(reporter)) {
      const osd_xinfo_t& xi = osdmap.get_xinfo(reporter);
      utime_t elapsed = now - xi.down_stamp;
      double decay = exp((double)elapsed * decay_k);
      peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
    }
  }
  // average over all reporters (callers guarantee at least one)
  peer_grace /= (double)fi.reporters.size();
  grace += peer_grace;
  dout(10) << " osd." << target_osd << " has "
	   << fi.reporters.size() << " reporters, "
	   << grace << " grace (" << orig_grace << " + " << my_grace
	   << " + " << peer_grace << "), max_failed_since " << fi.get_failed_since()
	   << dendl;

  return grace;
}
3282
7c673cae
FG
3283bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
3284{
3285 // already pending failure?
3286 if (pending_inc.new_state.count(target_osd) &&
3287 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3288 dout(10) << " already pending failure" << dendl;
3289 return true;
3290 }
3291
3292 set<string> reporters_by_subtree;
11fdf7f2 3293 auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
11fdf7f2 3294 ceph_assert(fi.reporters.size());
eafe8130 3295 for (auto p = fi.reporters.begin(); p != fi.reporters.end();) {
7c673cae
FG
3296 // get the parent bucket whose type matches with "reporter_subtree_level".
3297 // fall back to OSD if the level doesn't exist.
eafe8130
TL
3298 if (osdmap.exists(p->first)) {
3299 auto reporter_loc = osdmap.crush->get_full_location(p->first);
3300 if (auto iter = reporter_loc.find(reporter_subtree_level);
3301 iter == reporter_loc.end()) {
3302 reporters_by_subtree.insert("osd." + to_string(p->first));
3303 } else {
3304 reporters_by_subtree.insert(iter->second);
3305 }
eafe8130 3306 ++p;
7c673cae 3307 } else {
eafe8130
TL
3308 fi.cancel_report(p->first);;
3309 p = fi.reporters.erase(p);
7c673cae
FG
3310 }
3311 }
b3b6e05e
TL
3312 if (reporters_by_subtree.size() < g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
3313 return false;
7c673cae 3314 }
b3b6e05e
TL
3315 const utime_t failed_for = now - fi.get_failed_since();
3316 const utime_t grace = get_grace_time(now, target_osd, fi);
3317 if (failed_for >= grace) {
7c673cae
FG
3318 dout(1) << " we have enough reporters to mark osd." << target_osd
3319 << " down" << dendl;
3320 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3321
f67539c2 3322 mon.clog->info() << "osd." << target_osd << " failed ("
31f18b77
FG
3323 << osdmap.crush->get_full_location_ordered_string(
3324 target_osd)
3325 << ") ("
3326 << (int)reporters_by_subtree.size()
3327 << " reporters from different "
7c673cae
FG
3328 << reporter_subtree_level << " after "
3329 << failed_for << " >= grace " << grace << ")";
3330 return true;
3331 }
3332 return false;
3333}
3334
b3b6e05e
TL
3335bool OSDMonitor::is_failure_stale(utime_t now, failure_info_t& fi) const
3336{
3337 // if it takes too long to either cancel the report to mark the osd down,
3338 // some reporters must have failed to cancel their reports. let's just
3339 // forget these reports.
3340 const utime_t failed_for = now - fi.get_failed_since();
3341 auto heartbeat_grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
3342 auto heartbeat_stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3343 return failed_for >= (heartbeat_grace + heartbeat_stale);
3344}
3345
/**
 * Immediately stage a down+dead mark for target_osd, bypassing the
 * reporter-quorum and grace-period machinery.
 *
 * Used for "immediate" failure reports (e.g. connection refused, as the
 * clog text shows).  @p by is the reporting osd, used only for logging.
 * No-op when a down mark is already pending.
 */
void OSDMonitor::force_failure(int target_osd, int by)
{
  // already pending failure?
  if (pending_inc.new_state.count(target_osd) &&
      pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
    dout(10) << " already pending failure" << dendl;
    return;
  }

  dout(1) << " we're forcing failure of osd." << target_osd << dendl;
  // XOR-style state flag: setting CEPH_OSD_UP flips the osd down
  pending_inc.new_state[target_osd] = CEPH_OSD_UP;
  // also record it as dead as of the pending epoch
  if (!pending_inc.new_xinfo.count(target_osd)) {
    pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
  }
  pending_inc.new_xinfo[target_osd].dead_epoch = pending_inc.epoch;

  mon.clog->info() << "osd." << target_osd << " failed ("
		   << osdmap.crush->get_full_location_ordered_string(target_osd)
		   << ") (connection refused reported by osd." << by << ")";
  return;
}
3367
/**
 * Record or retract a failure report against an up osd.
 *
 * A positive report is either acted on immediately (is_immediate() →
 * force_failure()) or added to failure_info and evaluated via
 * check_failure().  A negative report retracts the reporter's earlier
 * claim, dropping the whole record once no reporters remain.
 *
 * @return true when a proposal should be triggered (a down mark was
 *         staged), false otherwise.
 */
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  dout(1) << "prepare_failure osd." << m->get_target_osd()
	  << " " << m->get_target_addrs()
	  << " from " << m->get_orig_source()
	  << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target_osd();
  int reporter = m->get_orig_source().num();
  // preprocess_failure() guarantees the target is this up incarnation
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());

  mon.no_reply(op);

  if (m->if_osd_failed()) {
    // calculate failure time: message receipt minus how long the
    // reporter says the osd has been unresponsive
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      mon.clog->debug() << "osd." << m->get_target_osd()
			<< " reported immediately failed by "
			<< m->get_orig_source();
      force_failure(target_osd, reporter);
      return true;
    }
    mon.clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
		      << m->get_orig_source();

    failure_info_t& fi = failure_info[target_osd];
    fi.add_report(reporter, failed_since, op);
    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon.clog->debug() << "osd." << m->get_target_osd()
		      << " failure report canceled by "
		      << m->get_orig_source();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      fi.cancel_report(reporter);
      if (fi.reporters.empty()) {
	dout(10) << " removing last failure_info for osd." << target_osd
		 << dendl;
	failure_info.erase(target_osd);
      } else {
	dout(10) << " failure_info for osd." << target_osd << " now "
		 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
  }

  return false;
}
3427
3428void OSDMonitor::process_failures()
3429{
3430 map<int,failure_info_t>::iterator p = failure_info.begin();
3431 while (p != failure_info.end()) {
3432 if (osdmap.is_up(p->first)) {
3433 ++p;
3434 } else {
3435 dout(10) << "process_failures osd." << p->first << dendl;
3436 list<MonOpRequestRef> ls;
3437 p->second.take_report_messages(ls);
3438 failure_info.erase(p++);
3439
3440 while (!ls.empty()) {
3441 MonOpRequestRef o = ls.front();
3442 if (o) {
3443 o->mark_event(__func__);
3444 MOSDFailure *m = o->get_req<MOSDFailure>();
3445 send_latest(o, m->get_epoch());
f67539c2 3446 mon.no_reply(o);
7c673cae
FG
3447 }
3448 ls.pop_front();
3449 }
3450 }
3451 }
3452}
3453
3454void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
3455{
3456 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
3457
3458 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3459 p != failure_info.end();
3460 ++p) {
3461 p->second.take_report_messages(ls);
3462 }
3463 failure_info.clear();
3464}
3465
f6b5b4d7
TL
3466int OSDMonitor::get_grace_interval_threshold()
3467{
3468 int halflife = g_conf()->mon_osd_laggy_halflife;
3469 // Scale the halflife period (default: 1_hr) by
3470 // a factor (48) to calculate the threshold.
3471 int grace_threshold_factor = 48;
3472 return halflife * grace_threshold_factor;
3473}
3474
3475bool OSDMonitor::grace_interval_threshold_exceeded(int last_failed_interval)
3476{
3477 int grace_interval_threshold_secs = get_grace_interval_threshold();
3478 if (last_failed_interval > grace_interval_threshold_secs) {
3479 dout(1) << " last_failed_interval " << last_failed_interval
3480 << " > grace_interval_threshold_secs " << grace_interval_threshold_secs
3481 << dendl;
3482 return true;
3483 }
3484 return false;
3485}
3486
3487void OSDMonitor::set_default_laggy_params(int target_osd)
3488{
3489 if (pending_inc.new_xinfo.count(target_osd) == 0) {
3490 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3491 }
3492 osd_xinfo_t& xi = pending_inc.new_xinfo[target_osd];
3493 xi.down_stamp = pending_inc.modified;
3494 xi.laggy_probability = 0.0;
3495 xi.laggy_interval = 0;
3496 dout(20) << __func__ << " reset laggy, now xi " << xi << dendl;
3497}
3498
7c673cae
FG
3499
3500// boot --
3501
/**
 * Validate an MOSDBoot request before prepare_boot() admits the osd.
 *
 * Rejection ladder: caps/fsid/address sanity, minimum feature level,
 * release-span limits (osds may not span more than 3 releases),
 * crimson-osd gating, stretch-mode feature support, duplicate boots,
 * uuid clashes, stale boots, and the NOUP flag.  Returns true when the
 * request was handled (answered or dropped) here; false to proceed to
 * prepare_boot().
 */
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
	    << session->caps << dendl;
    goto ignore;
  }

  if (m->sb.cluster_fsid != mon.monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
	    << " != " << mon.monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  ceph_assert(m->get_orig_source_inst().name.is_osd());

  // lower bound of N-2
  if (!HAVE_FEATURE(m->osd_features, SERVER_PACIFIC)) {
    mon.clog->info() << "disallowing boot of OSD "
		     << m->get_orig_source_inst()
		     << " because the osd lacks CEPH_FEATURE_SERVER_PACIFIC";
    goto ignore;
  }

  // make sure osd versions do not span more than 3 releases
  if (HAVE_FEATURE(m->osd_features, SERVER_QUINCY) &&
      osdmap.require_osd_release < ceph_release_t::octopus) {
    mon.clog->info() << "disallowing boot of quincy+ OSD "
		     << m->get_orig_source_inst()
		     << " because require_osd_release < octopus";
    goto ignore;
  }
  if (HAVE_FEATURE(m->osd_features, SERVER_REEF) &&
      osdmap.require_osd_release < ceph_release_t::pacific) {
    mon.clog->info() << "disallowing boot of reef+ OSD "
		     << m->get_orig_source_inst()
		     << " because require_osd_release < pacific";
    goto ignore;
  }

  // See crimson/osd/osd.cc: OSD::_send_boot
  if (auto type_iter = m->metadata.find("osd_type");
      type_iter != m->metadata.end()) {
    const auto &otype = type_iter->second;
    // m->metadata["osd_type"] must be "crimson", classic doesn't send osd_type
    if (otype == "crimson") {
      if (!osdmap.get_allow_crimson()) {
	mon.clog->info()
	  << "Disallowing boot of crimson-osd without allow_crimson "
	  << "OSDMap flag. Run ceph osd set_allow_crimson to set "
	  << "allow_crimson flag. Note that crimson-osd is "
	  << "considered unstable and may result in crashes or "
	  << "data loss. Its usage should be restricted to "
	  << "testing and development.";
	goto ignore;
      }
    } else {
      // unexpected osd_type value: warn loudly but do not reject
      derr << __func__ << ": osd " << m->get_orig_source_inst()
	   << " sent non-crimson osd_type field in MOSDBoot: "
	   << otype
	   << " -- booting anyway"
	   << dendl;
    }
  }

  if (osdmap.stretch_mode_enabled &&
      !(m->osd_features & CEPH_FEATUREMASK_STRETCH_MODE)) {
    mon.clog->info() << "disallowing boot of OSD "
		     << m->get_orig_source_inst()
		     << " because stretch mode is on and OSD lacks support";
    goto ignore;
  }

  // already booted?
  if (osdmap.is_up(from) &&
      osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
      osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
    // yup.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source()
	    << " " << m->get_orig_source_addrs()
	    << " =~ " << osdmap.get_addrs(from) << dendl;
    _booted(op, false);
    return true;
  }

  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
            << " clashes with existing osd: different fsid"
            << " (ours: " << osdmap.get_uuid(from)
            << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // boot message predating the osd's last up_from from the same
  // address: just send it the newer maps
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_addrs(from).legacy_equals(
	m->get_orig_source_addrs())) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3632
3633bool OSDMonitor::prepare_boot(MonOpRequestRef op)
3634{
3635 op->mark_osdmon_event(__func__);
9f95a23c 3636 auto m = op->get_req<MOSDBoot>();
11fdf7f2
TL
3637 dout(7) << __func__ << " from " << m->get_source()
3638 << " sb " << m->sb
3639 << " client_addrs" << m->get_connection()->get_peer_addrs()
3640 << " cluster_addrs " << m->cluster_addrs
3641 << " hb_back_addrs " << m->hb_back_addrs
3642 << " hb_front_addrs " << m->hb_front_addrs
7c673cae
FG
3643 << dendl;
3644
11fdf7f2 3645 ceph_assert(m->get_orig_source().is_osd());
7c673cae
FG
3646 int from = m->get_orig_source().num();
3647
3648 // does this osd exist?
3649 if (from >= osdmap.get_max_osd()) {
3650 dout(1) << "boot from osd." << from << " >= max_osd "
3651 << osdmap.get_max_osd() << dendl;
3652 return false;
3653 }
3654
3655 int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
3656 if (pending_inc.new_state.count(from))
3657 oldstate ^= pending_inc.new_state[from];
3658
3659 // already up? mark down first?
3660 if (osdmap.is_up(from)) {
11fdf7f2
TL
3661 dout(7) << __func__ << " was up, first marking down osd." << from << " "
3662 << osdmap.get_addrs(from) << dendl;
7c673cae 3663 // preprocess should have caught these; if not, assert.
11fdf7f2
TL
3664 ceph_assert(!osdmap.get_addrs(from).legacy_equals(
3665 m->get_orig_source_addrs()) ||
3666 !osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
3667 ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);
7c673cae
FG
3668
3669 if (pending_inc.new_state.count(from) == 0 ||
3670 (pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
3671 // mark previous guy down
3672 pending_inc.new_state[from] = CEPH_OSD_UP;
3673 }
3674 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
3675 } else if (pending_inc.new_up_client.count(from)) {
3676 // already prepared, just wait
3677 dout(7) << __func__ << " already prepared, waiting on "
3678 << m->get_orig_source_addr() << dendl;
3679 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
3680 } else {
3681 // mark new guy up.
11fdf7f2
TL
3682 pending_inc.new_up_client[from] = m->get_orig_source_addrs();
3683 pending_inc.new_up_cluster[from] = m->cluster_addrs;
3684 pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
3685 pending_inc.new_hb_front_up[from] = m->hb_front_addrs;
7c673cae
FG
3686
3687 down_pending_out.erase(from); // if any
3688
3689 if (m->sb.weight)
3690 osd_weight[from] = m->sb.weight;
3691
3692 // set uuid?
3693 dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
3694 << dendl;
3695 if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
3696 // preprocess should have caught this; if not, assert.
11fdf7f2 3697 ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
7c673cae
FG
3698 pending_inc.new_uuid[from] = m->sb.osd_fsid;
3699 }
3700
3701 // fresh osd?
3702 if (m->sb.newest_map == 0 && osdmap.exists(from)) {
3703 const osd_info_t& i = osdmap.get_info(from);
3704 if (i.up_from > i.lost_at) {
3705 dout(10) << " fresh osd; marking lost_at too" << dendl;
3706 pending_inc.new_lost[from] = osdmap.get_epoch();
3707 }
3708 }
3709
3710 // metadata
3711 bufferlist osd_metadata;
11fdf7f2 3712 encode(m->metadata, osd_metadata);
7c673cae 3713 pending_metadata[from] = osd_metadata;
31f18b77 3714 pending_metadata_rm.erase(from);
7c673cae
FG
3715
3716 // adjust last clean unmount epoch?
3717 const osd_info_t& info = osdmap.get_info(from);
3718 dout(10) << " old osd_info: " << info << dendl;
3719 if (m->sb.mounted > info.last_clean_begin ||
3720 (m->sb.mounted == info.last_clean_begin &&
3721 m->sb.clean_thru > info.last_clean_end)) {
3722 epoch_t begin = m->sb.mounted;
3723 epoch_t end = m->sb.clean_thru;
3724
3725 dout(10) << __func__ << " osd." << from << " last_clean_interval "
3726 << "[" << info.last_clean_begin << "," << info.last_clean_end
3727 << ") -> [" << begin << "-" << end << ")"
3728 << dendl;
3729 pending_inc.new_last_clean_interval[from] =
3730 pair<epoch_t,epoch_t>(begin, end);
3731 }
3732
9f95a23c
TL
3733 if (pending_inc.new_xinfo.count(from) == 0)
3734 pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
3735 osd_xinfo_t& xi = pending_inc.new_xinfo[from];
7c673cae 3736 if (m->boot_epoch == 0) {
11fdf7f2
TL
3737 xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
3738 xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
7c673cae
FG
3739 dout(10) << " not laggy, new xi " << xi << dendl;
3740 } else {
3741 if (xi.down_stamp.sec()) {
3742 int interval = ceph_clock_now().sec() -
3743 xi.down_stamp.sec();
11fdf7f2
TL
3744 if (g_conf()->mon_osd_laggy_max_interval &&
3745 (interval > g_conf()->mon_osd_laggy_max_interval)) {
3746 interval = g_conf()->mon_osd_laggy_max_interval;
7c673cae
FG
3747 }
3748 xi.laggy_interval =
11fdf7f2
TL
3749 interval * g_conf()->mon_osd_laggy_weight +
3750 xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
7c673cae
FG
3751 }
3752 xi.laggy_probability =
11fdf7f2
TL
3753 g_conf()->mon_osd_laggy_weight +
3754 xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
7c673cae
FG
3755 dout(10) << " laggy, now xi " << xi << dendl;
3756 }
3757
3758 // set features shared by the osd
3759 if (m->osd_features)
3760 xi.features = m->osd_features;
3761 else
3762 xi.features = m->get_connection()->get_features();
3763
3764 // mark in?
11fdf7f2 3765 if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
7c673cae 3766 (oldstate & CEPH_OSD_AUTOOUT)) ||
11fdf7f2
TL
3767 (g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
3768 (g_conf()->mon_osd_auto_mark_in)) {
7c673cae 3769 if (can_mark_in(from)) {
9f95a23c
TL
3770 if (xi.old_weight > 0) {
3771 pending_inc.new_weight[from] = xi.old_weight;
7c673cae
FG
3772 xi.old_weight = 0;
3773 } else {
3774 pending_inc.new_weight[from] = CEPH_OSD_IN;
3775 }
3776 } else {
3777 dout(7) << __func__ << " NOIN set, will not mark in "
3778 << m->get_orig_source_addr() << dendl;
3779 }
3780 }
3781
7c673cae
FG
3782 // wait
3783 wait_for_finished_proposal(op, new C_Booted(this, op));
3784 }
3785 return true;
3786}
3787
3788void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
3789{
3790 op->mark_osdmon_event(__func__);
9f95a23c 3791 auto m = op->get_req<MOSDBoot>();
7c673cae
FG
3792 dout(7) << "_booted " << m->get_orig_source_inst()
3793 << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
3794
3795 if (logit) {
f67539c2 3796 mon.clog->info() << m->get_source() << " " << m->get_orig_source_addrs()
11fdf7f2 3797 << " boot";
7c673cae
FG
3798 }
3799
3800 send_latest(op, m->sb.current_epoch+1);
3801}
3802
3803
3804// -------------
3805// full
3806
3807bool OSDMonitor::preprocess_full(MonOpRequestRef op)
3808{
3809 op->mark_osdmon_event(__func__);
9f95a23c 3810 auto m = op->get_req<MOSDFull>();
7c673cae
FG
3811 int from = m->get_orig_source().num();
3812 set<string> state;
3813 unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
3814
3815 // check permissions, ignore if failed
11fdf7f2 3816 MonSession *session = op->get_session();
7c673cae
FG
3817 if (!session)
3818 goto ignore;
3819 if (!session->is_capable("osd", MON_CAP_X)) {
3820 dout(0) << "MOSDFull from entity with insufficient privileges:"
3821 << session->caps << dendl;
3822 goto ignore;
3823 }
3824
3825 // ignore a full message from the osd instance that already went down
3826 if (!osdmap.exists(from)) {
3827 dout(7) << __func__ << " ignoring full message from nonexistent "
3828 << m->get_orig_source_inst() << dendl;
3829 goto ignore;
3830 }
3831 if ((!osdmap.is_up(from) &&
11fdf7f2
TL
3832 osdmap.get_most_recent_addrs(from).legacy_equals(
3833 m->get_orig_source_addrs())) ||
7c673cae 3834 (osdmap.is_up(from) &&
11fdf7f2 3835 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
7c673cae
FG
3836 dout(7) << __func__ << " ignoring full message from down "
3837 << m->get_orig_source_inst() << dendl;
3838 goto ignore;
3839 }
3840
3841 OSDMap::calc_state_set(osdmap.get_state(from), state);
3842
3843 if ((osdmap.get_state(from) & mask) == m->state) {
3844 dout(7) << __func__ << " state already " << state << " for osd." << from
3845 << " " << m->get_orig_source_inst() << dendl;
3846 _reply_map(op, m->version);
3847 goto ignore;
3848 }
3849
3850 dout(10) << __func__ << " want state " << state << " for osd." << from
3851 << " " << m->get_orig_source_inst() << dendl;
3852 return false;
3853
3854 ignore:
3855 return true;
3856}
3857
3858bool OSDMonitor::prepare_full(MonOpRequestRef op)
3859{
3860 op->mark_osdmon_event(__func__);
9f95a23c 3861 auto m = op->get_req<MOSDFull>();
7c673cae
FG
3862 const int from = m->get_orig_source().num();
3863
3864 const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
3865 const unsigned want_state = m->state & mask; // safety first
3866
3867 unsigned cur_state = osdmap.get_state(from);
3868 auto p = pending_inc.new_state.find(from);
3869 if (p != pending_inc.new_state.end()) {
3870 cur_state ^= p->second;
3871 }
3872 cur_state &= mask;
3873
3874 set<string> want_state_set, cur_state_set;
3875 OSDMap::calc_state_set(want_state, want_state_set);
3876 OSDMap::calc_state_set(cur_state, cur_state_set);
3877
3878 if (cur_state != want_state) {
3879 if (p != pending_inc.new_state.end()) {
3880 p->second &= ~mask;
3881 } else {
3882 pending_inc.new_state[from] = 0;
3883 }
3884 pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
3885 dout(7) << __func__ << " osd." << from << " " << cur_state_set
3886 << " -> " << want_state_set << dendl;
3887 } else {
3888 dout(7) << __func__ << " osd." << from << " " << cur_state_set
3889 << " = wanted " << want_state_set << ", just waiting" << dendl;
3890 }
3891
3892 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
3893 return true;
3894}
3895
3896// -------------
3897// alive
3898
3899bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
3900{
3901 op->mark_osdmon_event(__func__);
9f95a23c 3902 auto m = op->get_req<MOSDAlive>();
7c673cae
FG
3903 int from = m->get_orig_source().num();
3904
3905 // check permissions, ignore if failed
11fdf7f2 3906 MonSession *session = op->get_session();
7c673cae
FG
3907 if (!session)
3908 goto ignore;
3909 if (!session->is_capable("osd", MON_CAP_X)) {
3910 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
3911 << session->caps << dendl;
3912 goto ignore;
3913 }
3914
3915 if (!osdmap.is_up(from) ||
11fdf7f2
TL
3916 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
3917 dout(7) << "preprocess_alive ignoring alive message from down "
3918 << m->get_orig_source() << " " << m->get_orig_source_addrs()
3919 << dendl;
7c673cae
FG
3920 goto ignore;
3921 }
3922
3923 if (osdmap.get_up_thru(from) >= m->want) {
3924 // yup.
3925 dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
3926 _reply_map(op, m->version);
3927 return true;
3928 }
3929
3930 dout(10) << "preprocess_alive want up_thru " << m->want
3931 << " from " << m->get_orig_source_inst() << dendl;
3932 return false;
3933
3934 ignore:
3935 return true;
3936}
3937
3938bool OSDMonitor::prepare_alive(MonOpRequestRef op)
3939{
3940 op->mark_osdmon_event(__func__);
9f95a23c 3941 auto m = op->get_req<MOSDAlive>();
7c673cae
FG
3942 int from = m->get_orig_source().num();
3943
3944 if (0) { // we probably don't care much about these
f67539c2 3945 mon.clog->debug() << m->get_orig_source_inst() << " alive";
7c673cae
FG
3946 }
3947
3948 dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
3949 << " from " << m->get_orig_source_inst() << dendl;
3950
3951 update_up_thru(from, m->version); // set to the latest map the OSD has
3952 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
3953 return true;
3954}
3955
3956void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
3957{
3958 op->mark_osdmon_event(__func__);
3959 dout(7) << "_reply_map " << e
3960 << " from " << op->get_req()->get_orig_source_inst()
3961 << dendl;
3962 send_latest(op, e);
3963}
3964
3965// pg_created
3966bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
3967{
3968 op->mark_osdmon_event(__func__);
9f95a23c 3969 auto m = op->get_req<MOSDPGCreated>();
7c673cae 3970 dout(10) << __func__ << " " << *m << dendl;
11fdf7f2 3971 auto session = op->get_session();
f67539c2 3972 mon.no_reply(op);
7c673cae
FG
3973 if (!session) {
3974 dout(10) << __func__ << ": no monitor session!" << dendl;
3975 return true;
3976 }
3977 if (!session->is_capable("osd", MON_CAP_X)) {
3978 derr << __func__ << " received from entity "
3979 << "with insufficient privileges " << session->caps << dendl;
3980 return true;
3981 }
3982 // always forward the "created!" to the leader
3983 return false;
3984}
3985
3986bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
3987{
3988 op->mark_osdmon_event(__func__);
9f95a23c 3989 auto m = op->get_req<MOSDPGCreated>();
7c673cae
FG
3990 dout(10) << __func__ << " " << *m << dendl;
3991 auto src = m->get_orig_source();
3992 auto from = src.num();
3993 if (!src.is_osd() ||
f67539c2
TL
3994 !mon.osdmon()->osdmap.is_up(from) ||
3995 !mon.osdmon()->osdmap.get_addrs(from).legacy_equals(
11fdf7f2 3996 m->get_orig_source_addrs())) {
7c673cae
FG
3997 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
3998 return false;
3999 }
4000 pending_created_pgs.push_back(m->pgid);
4001 return true;
4002}
4003
11fdf7f2
TL
4004bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op)
4005{
4006 op->mark_osdmon_event(__func__);
9f95a23c 4007 auto m = op->get_req<MOSDPGReadyToMerge>();
11fdf7f2
TL
4008 dout(10) << __func__ << " " << *m << dendl;
4009 const pg_pool_t *pi;
4010 auto session = op->get_session();
4011 if (!session) {
4012 dout(10) << __func__ << ": no monitor session!" << dendl;
4013 goto ignore;
4014 }
4015 if (!session->is_capable("osd", MON_CAP_X)) {
4016 derr << __func__ << " received from entity "
4017 << "with insufficient privileges " << session->caps << dendl;
4018 goto ignore;
4019 }
4020 pi = osdmap.get_pg_pool(m->pgid.pool());
4021 if (!pi) {
4022 derr << __func__ << " pool for " << m->pgid << " dne" << dendl;
4023 goto ignore;
4024 }
4025 if (pi->get_pg_num() <= m->pgid.ps()) {
4026 dout(20) << " pg_num " << pi->get_pg_num() << " already < " << m->pgid << dendl;
4027 goto ignore;
4028 }
4029 if (pi->get_pg_num() != m->pgid.ps() + 1) {
4030 derr << " OSD trying to merge wrong pgid " << m->pgid << dendl;
4031 goto ignore;
4032 }
4033 if (pi->get_pg_num_pending() > m->pgid.ps()) {
4034 dout(20) << " pg_num_pending " << pi->get_pg_num_pending() << " > " << m->pgid << dendl;
4035 goto ignore;
4036 }
4037 return false;
4038
4039 ignore:
f67539c2 4040 mon.no_reply(op);
11fdf7f2
TL
4041 return true;
4042}
4043
4044bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
4045{
4046 op->mark_osdmon_event(__func__);
9f95a23c 4047 auto m = op->get_req<MOSDPGReadyToMerge>();
11fdf7f2
TL
4048 dout(10) << __func__ << " " << *m << dendl;
4049 pg_pool_t p;
4050 if (pending_inc.new_pools.count(m->pgid.pool()))
4051 p = pending_inc.new_pools[m->pgid.pool()];
4052 else
4053 p = *osdmap.get_pg_pool(m->pgid.pool());
4054 if (p.get_pg_num() != m->pgid.ps() + 1 ||
4055 p.get_pg_num_pending() > m->pgid.ps()) {
4056 dout(10) << __func__
4057 << " race with concurrent pg_num[_pending] update, will retry"
4058 << dendl;
4059 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
4060 return true;
4061 }
4062
4063 if (m->ready) {
4064 p.dec_pg_num(m->pgid,
4065 pending_inc.epoch,
4066 m->source_version,
4067 m->target_version,
4068 m->last_epoch_started,
4069 m->last_epoch_clean);
4070 p.last_change = pending_inc.epoch;
4071 } else {
4072 // back off the merge attempt!
4073 p.set_pg_num_pending(p.get_pg_num());
4074 }
4075
4076 // force pre-nautilus clients to resend their ops, since they
4077 // don't understand pg_num_pending changes form a new interval
4078 p.last_force_op_resend_prenautilus = pending_inc.epoch;
4079
4080 pending_inc.new_pools[m->pgid.pool()] = p;
4081
4082 auto prob = g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability");
4083 if (m->ready &&
4084 prob > 0 &&
4085 prob > (double)(rand() % 1000)/1000.0) {
4086 derr << __func__ << " injecting pg merge pg_num bounce" << dendl;
f67539c2 4087 auto n = new MMonCommand(mon.monmap->get_fsid());
11fdf7f2
TL
4088 n->set_connection(m->get_connection());
4089 n->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
4090 osdmap.get_pool_name(m->pgid.pool()) +
4091 "\", \"var\": \"pg_num_actual\", \"val\": \"" +
4092 stringify(m->pgid.ps() + 1) + "\"}" };
f67539c2 4093 MonOpRequestRef nop = mon.op_tracker.create_request<MonOpRequest>(n);
11fdf7f2
TL
4094 nop->set_type_service();
4095 wait_for_finished_proposal(op, new C_RetryMessage(this, nop));
4096 } else {
4097 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
4098 }
4099 return true;
4100}
4101
4102
7c673cae
FG
4103// -------------
4104// pg_temp changes
4105
4106bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
4107{
9f95a23c 4108 auto m = op->get_req<MOSDPGTemp>();
7c673cae
FG
4109 dout(10) << "preprocess_pgtemp " << *m << dendl;
4110 mempool::osdmap::vector<int> empty;
4111 int from = m->get_orig_source().num();
4112 size_t ignore_cnt = 0;
4113
4114 // check caps
11fdf7f2 4115 MonSession *session = op->get_session();
7c673cae
FG
4116 if (!session)
4117 goto ignore;
4118 if (!session->is_capable("osd", MON_CAP_X)) {
4119 dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
4120 << session->caps << dendl;
4121 goto ignore;
4122 }
4123
4124 if (!osdmap.is_up(from) ||
11fdf7f2
TL
4125 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
4126 dout(7) << "ignoring pgtemp message from down "
4127 << m->get_orig_source() << " " << m->get_orig_source_addrs()
4128 << dendl;
7c673cae
FG
4129 goto ignore;
4130 }
4131
3efd9988
FG
4132 if (m->forced) {
4133 return false;
4134 }
4135
7c673cae
FG
4136 for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
4137 dout(20) << " " << p->first
31f18b77 4138 << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
7c673cae
FG
4139 << " -> " << p->second << dendl;
4140
4141 // does the pool exist?
4142 if (!osdmap.have_pg_pool(p->first.pool())) {
4143 /*
4144 * 1. If the osdmap does not have the pool, it means the pool has been
4145 * removed in-between the osd sending this message and us handling it.
4146 * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
4147 * not exist in the pending either, as the osds would not send a
4148 * message about a pool they know nothing about (yet).
4149 * 3. However, if the pool does exist in the pending, then it must be a
4150 * new pool, and not relevant to this message (see 1).
4151 */
4152 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4153 << ": pool has been removed" << dendl;
4154 ignore_cnt++;
4155 continue;
4156 }
4157
4158 int acting_primary = -1;
4159 osdmap.pg_to_up_acting_osds(
4160 p->first, nullptr, nullptr, nullptr, &acting_primary);
4161 if (acting_primary != from) {
4162 /* If the source isn't the primary based on the current osdmap, we know
4163 * that the interval changed and that we can discard this message.
4164 * Indeed, we must do so to avoid 16127 since we can't otherwise determine
4165 * which of two pg temp mappings on the same pg is more recent.
4166 */
4167 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4168 << ": primary has changed" << dendl;
4169 ignore_cnt++;
4170 continue;
4171 }
4172
4173 // removal?
4174 if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
4175 osdmap.primary_temp->count(p->first)))
4176 return false;
4177 // change?
4178 // NOTE: we assume that this will clear pg_primary, so consider
4179 // an existing pg_primary field to imply a change
4180 if (p->second.size() &&
4181 (osdmap.pg_temp->count(p->first) == 0 ||
11fdf7f2 4182 osdmap.pg_temp->get(p->first) != p->second ||
7c673cae
FG
4183 osdmap.primary_temp->count(p->first)))
4184 return false;
4185 }
4186
4187 // should we ignore all the pgs?
4188 if (ignore_cnt == m->pg_temp.size())
4189 goto ignore;
4190
4191 dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
4192 _reply_map(op, m->map_epoch);
4193 return true;
4194
4195 ignore:
f67539c2 4196 mon.no_reply(op);
7c673cae
FG
4197 return true;
4198}
4199
4200void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
4201{
4202 epoch_t old_up_thru = osdmap.get_up_thru(from);
4203 auto ut = pending_inc.new_up_thru.find(from);
4204 if (ut != pending_inc.new_up_thru.end()) {
4205 old_up_thru = ut->second;
4206 }
4207 if (up_thru > old_up_thru) {
4208 // set up_thru too, so the osd doesn't have to ask again
4209 pending_inc.new_up_thru[from] = up_thru;
4210 }
4211}
4212
4213bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
4214{
4215 op->mark_osdmon_event(__func__);
9f95a23c 4216 auto m = op->get_req<MOSDPGTemp>();
7c673cae
FG
4217 int from = m->get_orig_source().num();
4218 dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
4219 for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
4220 uint64_t pool = p->first.pool();
4221 if (pending_inc.old_pools.count(pool)) {
4222 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4223 << ": pool pending removal" << dendl;
4224 continue;
4225 }
4226 if (!osdmap.have_pg_pool(pool)) {
4227 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4228 << ": pool has been removed" << dendl;
4229 continue;
4230 }
4231 pending_inc.new_pg_temp[p->first] =
4232 mempool::osdmap::vector<int>(p->second.begin(), p->second.end());
4233
4234 // unconditionally clear pg_primary (until this message can encode
4235 // a change for that, too.. at which point we need to also fix
4236 // preprocess_pg_temp)
4237 if (osdmap.primary_temp->count(p->first) ||
4238 pending_inc.new_primary_temp.count(p->first))
4239 pending_inc.new_primary_temp[p->first] = -1;
4240 }
4241
4242 // set up_thru too, so the osd doesn't have to ask again
4243 update_up_thru(from, m->map_epoch);
4244
4245 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
4246 return true;
4247}
4248
4249
4250// ---
4251
4252bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
4253{
4254 op->mark_osdmon_event(__func__);
9f95a23c 4255 auto m = op->get_req<MRemoveSnaps>();
7c673cae
FG
4256 dout(7) << "preprocess_remove_snaps " << *m << dendl;
4257
4258 // check privilege, ignore if failed
11fdf7f2 4259 MonSession *session = op->get_session();
f67539c2 4260 mon.no_reply(op);
7c673cae
FG
4261 if (!session)
4262 goto ignore;
4263 if (!session->caps.is_capable(
11fdf7f2 4264 cct,
7c673cae 4265 session->entity_name,
11fdf7f2
TL
4266 "osd", "osd pool rmsnap", {}, true, true, false,
4267 session->get_peer_socket_addr())) {
7c673cae
FG
4268 dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
4269 << session->caps << dendl;
4270 goto ignore;
4271 }
4272
4273 for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
4274 q != m->snaps.end();
4275 ++q) {
4276 if (!osdmap.have_pg_pool(q->first)) {
9f95a23c
TL
4277 dout(10) << " ignoring removed_snaps " << q->second
4278 << " on non-existent pool " << q->first << dendl;
7c673cae
FG
4279 continue;
4280 }
4281 const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
4282 for (vector<snapid_t>::iterator p = q->second.begin();
4283 p != q->second.end();
4284 ++p) {
4285 if (*p > pi->get_snap_seq() ||
9f95a23c 4286 !_is_removed_snap(q->first, *p)) {
7c673cae 4287 return false;
9f95a23c 4288 }
7c673cae
FG
4289 }
4290 }
4291
9f95a23c
TL
4292 if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
4293 auto reply = make_message<MRemoveSnaps>();
4294 reply->snaps = m->snaps;
f67539c2 4295 mon.send_reply(op, reply.detach());
9f95a23c
TL
4296 }
4297
7c673cae
FG
4298 ignore:
4299 return true;
4300}
4301
4302bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
4303{
4304 op->mark_osdmon_event(__func__);
9f95a23c 4305 auto m = op->get_req<MRemoveSnaps>();
7c673cae
FG
4306 dout(7) << "prepare_remove_snaps " << *m << dendl;
4307
9f95a23c
TL
4308 for (auto& [pool, snaps] : m->snaps) {
4309 if (!osdmap.have_pg_pool(pool)) {
4310 dout(10) << " ignoring removed_snaps " << snaps
4311 << " on non-existent pool " << pool << dendl;
7c673cae
FG
4312 continue;
4313 }
4314
9f95a23c
TL
4315 pg_pool_t& pi = osdmap.pools[pool];
4316 for (auto s : snaps) {
4317 if (!_is_removed_snap(pool, s) &&
4318 (!pending_inc.new_pools.count(pool) ||
4319 !pending_inc.new_pools[pool].removed_snaps.contains(s)) &&
4320 (!pending_inc.new_removed_snaps.count(pool) ||
4321 !pending_inc.new_removed_snaps[pool].contains(s))) {
4322 pg_pool_t *newpi = pending_inc.get_new_pool(pool, &pi);
4323 if (osdmap.require_osd_release < ceph_release_t::octopus) {
4324 newpi->removed_snaps.insert(s);
4325 dout(10) << " pool " << pool << " removed_snaps added " << s
4326 << " (now " << newpi->removed_snaps << ")" << dendl;
4327 }
11fdf7f2 4328 newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
9f95a23c
TL
4329 if (s > newpi->get_snap_seq()) {
4330 dout(10) << " pool " << pool << " snap_seq "
4331 << newpi->get_snap_seq() << " -> " << s << dendl;
4332 newpi->set_snap_seq(s);
7c673cae
FG
4333 }
4334 newpi->set_snap_epoch(pending_inc.epoch);
9f95a23c
TL
4335 dout(10) << " added pool " << pool << " snap " << s
4336 << " to removed_snaps queue" << dendl;
4337 pending_inc.new_removed_snaps[pool].insert(s);
7c673cae
FG
4338 }
4339 }
4340 }
9f95a23c
TL
4341
4342 if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
4343 auto reply = make_message<MRemoveSnaps>();
4344 reply->snaps = m->snaps;
4345 wait_for_finished_proposal(op, new C_ReplyOp(this, op, reply));
4346 }
4347
4348 return true;
4349}
4350
4351bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op)
4352{
4353 op->mark_osdmon_event(__func__);
4354 auto m = op->get_req<MMonGetPurgedSnaps>();
4355 dout(7) << __func__ << " " << *m << dendl;
4356
4357 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> r;
4358
4359 string k = make_purged_snap_epoch_key(m->start);
f67539c2 4360 auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
9f95a23c
TL
4361 it->upper_bound(k);
4362 unsigned long epoch = m->last;
4363 while (it->valid()) {
4364 if (it->key().find("purged_epoch_") != 0) {
4365 break;
4366 }
4367 string k = it->key();
4368 int n = sscanf(k.c_str(), "purged_epoch_%lx", &epoch);
4369 if (n != 1) {
4370 derr << __func__ << " unable to parse key '" << it->key() << "'" << dendl;
4371 } else if (epoch > m->last) {
4372 break;
4373 } else {
4374 bufferlist bl = it->value();
4375 auto p = bl.cbegin();
4376 auto &v = r[epoch];
4377 try {
4378 ceph::decode(v, p);
f67539c2 4379 } catch (ceph::buffer::error& e) {
9f95a23c
TL
4380 derr << __func__ << " unable to parse value for key '" << it->key()
4381 << "': \n";
4382 bl.hexdump(*_dout);
4383 *_dout << dendl;
4384 }
4385 n += 4 + v.size() * 16;
4386 }
4387 if (n > 1048576) {
4388 // impose a semi-arbitrary limit to message size
4389 break;
4390 }
4391 it->next();
4392 }
4393
4394 auto reply = make_message<MMonGetPurgedSnapsReply>(m->start, epoch);
4395 reply->purged_snaps.swap(r);
f67539c2 4396 mon.send_reply(op, reply.detach());
9f95a23c 4397
7c673cae
FG
4398 return true;
4399}
4400
4401// osd beacon
4402bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
4403{
4404 op->mark_osdmon_event(__func__);
7c673cae 4405 // check caps
11fdf7f2 4406 auto session = op->get_session();
f67539c2 4407 mon.no_reply(op);
7c673cae
FG
4408 if (!session) {
4409 dout(10) << __func__ << " no monitor session!" << dendl;
4410 return true;
4411 }
4412 if (!session->is_capable("osd", MON_CAP_X)) {
4413 derr << __func__ << " received from entity "
4414 << "with insufficient privileges " << session->caps << dendl;
4415 return true;
4416 }
4417 // Always forward the beacon to the leader, even if they are the same as
4418 // the old one. The leader will mark as down osds that haven't sent
4419 // beacon for a few minutes.
4420 return false;
4421}
4422
4423bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
4424{
4425 op->mark_osdmon_event(__func__);
9f95a23c 4426 const auto beacon = op->get_req<MOSDBeacon>();
7c673cae
FG
4427 const auto src = beacon->get_orig_source();
4428 dout(10) << __func__ << " " << *beacon
4429 << " from " << src << dendl;
4430 int from = src.num();
4431
4432 if (!src.is_osd() ||
4433 !osdmap.is_up(from) ||
11fdf7f2
TL
4434 !osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs())) {
4435 if (src.is_osd() && !osdmap.is_up(from)) {
4436 // share some new maps with this guy in case it may not be
4437 // aware of its own deadness...
4438 send_latest(op, beacon->version+1);
4439 }
4440 dout(1) << " ignoring beacon from non-active osd." << from << dendl;
7c673cae
FG
4441 return false;
4442 }
4443
f67539c2
TL
4444 last_osd_report[from].first = ceph_clock_now();
4445 last_osd_report[from].second = beacon->osd_beacon_report_interval;
7c673cae
FG
4446 osd_epochs[from] = beacon->version;
4447
4448 for (const auto& pg : beacon->pgs) {
522d829b
TL
4449 if (auto* pool = osdmap.get_pg_pool(pg.pool()); pool != nullptr) {
4450 unsigned pg_num = pool->get_pg_num();
4451 last_epoch_clean.report(pg_num, pg, beacon->min_last_epoch_clean);
4452 }
7c673cae 4453 }
9f95a23c
TL
4454
4455 if (osdmap.osd_xinfo[from].last_purged_snaps_scrub <
4456 beacon->last_purged_snaps_scrub) {
4457 if (pending_inc.new_xinfo.count(from) == 0) {
4458 pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
4459 }
4460 pending_inc.new_xinfo[from].last_purged_snaps_scrub =
4461 beacon->last_purged_snaps_scrub;
4462 return true;
4463 } else {
4464 return false;
4465 }
7c673cae
FG
4466}
4467
4468// ---------------
4469// map helpers
4470
4471void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
4472{
4473 op->mark_osdmon_event(__func__);
4474 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
4475 << " start " << start << dendl;
4476 if (start == 0)
4477 send_full(op);
4478 else
4479 send_incremental(op, start);
4480}
4481
4482
28e407b8 4483MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
7c673cae 4484{
f67539c2 4485 MOSDMap *r = new MOSDMap(mon.monmap->fsid, features);
28e407b8 4486 get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
1e59de90 4487 r->cluster_osdmap_trim_lower_bound = get_first_committed();
7c673cae
FG
4488 r->newest_map = osdmap.get_epoch();
4489 return r;
4490}
4491
28e407b8 4492MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
7c673cae 4493{
11fdf7f2
TL
4494 dout(10) << "build_incremental [" << from << ".." << to << "] with features "
4495 << std::hex << features << std::dec << dendl;
f67539c2 4496 MOSDMap *m = new MOSDMap(mon.monmap->fsid, features);
1e59de90 4497 m->cluster_osdmap_trim_lower_bound = get_first_committed();
7c673cae
FG
4498 m->newest_map = osdmap.get_epoch();
4499
4500 for (epoch_t e = to; e >= from && e > 0; e--) {
4501 bufferlist bl;
28e407b8 4502 int err = get_version(e, features, bl);
7c673cae 4503 if (err == 0) {
11fdf7f2 4504 ceph_assert(bl.length());
7c673cae
FG
4505 // if (get_version(e, bl) > 0) {
4506 dout(20) << "build_incremental inc " << e << " "
4507 << bl.length() << " bytes" << dendl;
4508 m->incremental_maps[e] = bl;
4509 } else {
11fdf7f2
TL
4510 ceph_assert(err == -ENOENT);
4511 ceph_assert(!bl.length());
28e407b8 4512 get_version_full(e, features, bl);
7c673cae
FG
4513 if (bl.length() > 0) {
4514 //else if (get_version("full", e, bl) > 0) {
4515 dout(20) << "build_incremental full " << e << " "
4516 << bl.length() << " bytes" << dendl;
4517 m->maps[e] = bl;
4518 } else {
4519 ceph_abort(); // we should have all maps.
4520 }
4521 }
4522 }
4523 return m;
4524}
4525
4526void OSDMonitor::send_full(MonOpRequestRef op)
4527{
4528 op->mark_osdmon_event(__func__);
4529 dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
f67539c2 4530 mon.send_reply(op, build_latest_full(op->get_session()->con_features));
7c673cae
FG
4531}
4532
4533void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
4534{
4535 op->mark_osdmon_event(__func__);
4536
4537 MonSession *s = op->get_session();
11fdf7f2 4538 ceph_assert(s);
7c673cae 4539
11fdf7f2 4540 if (s->proxy_con) {
7c673cae
FG
4541 // oh, we can tell the other mon to do it
4542 dout(10) << __func__ << " asking proxying mon to send_incremental from "
4543 << first << dendl;
4544 MRoute *r = new MRoute(s->proxy_tid, NULL);
4545 r->send_osdmap_first = first;
4546 s->proxy_con->send_message(r);
4547 op->mark_event("reply: send routed send_osdmap_first reply");
4548 } else {
4549 // do it ourselves
4550 send_incremental(first, s, false, op);
4551 }
4552}
4553
// Send osdmap updates [first..current] to a session, either as a routed
// reply (when ``req`` is set) or directly on the session's connection.
// Updates session->osd_epoch to track what the peer now has.  If ``first``
// predates our first committed map, a full map is sent as the new base
// before any incrementals.
void OSDMonitor::send_incremental(epoch_t first,
				  MonSession *session,
				  bool onetime,
				  MonOpRequestRef req)
{
  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
	  << " to " << session->name << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = session->con_features ? session->con_features :
    mon.get_quorum_con_features();

  if (first <= session->osd_epoch) {
    // peer already has maps through osd_epoch; skip ahead.
    dout(10) << __func__ << " " << session->name << " should already have epoch "
	     << session->osd_epoch << dendl;
    first = session->osd_epoch + 1;
  }

  if (first < get_first_committed()) {
    // requested epoch was trimmed; start over with our oldest full map.
    MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
    m->cluster_osdmap_trim_lower_bound = get_first_committed();
    m->newest_map = osdmap.get_epoch();

    first = get_first_committed();
    bufferlist bl;
    int err = get_version_full(first, features, bl);
    ceph_assert(err == 0);
    ceph_assert(bl.length());
    dout(20) << "send_incremental starting with base full "
	     << first << " " << bl.length() << " bytes" << dendl;
    m->maps[first] = bl;

    if (req) {
      // routed request: exactly one reply is allowed, so stop here.
      mon.send_reply(req, m);
      session->osd_epoch = first;
      return;
    } else {
      session->con->send_message(m);
      session->osd_epoch = first;
    }
    first++;
  }

  while (first <= osdmap.get_epoch()) {
    // batch at most osd_map_message_max incrementals per message.
    epoch_t last = std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1,
				     osdmap.get_epoch());
    MOSDMap *m = build_incremental(first, last, features);

    if (req) {
      // send some maps.  it may not be all of them, but it will get them
      // started.
      mon.send_reply(req, m);
    } else {
      session->con->send_message(m);
      first = last + 1;
    }
    session->osd_epoch = last;
    if (onetime || req)
      break;
  }
}
4616
4617int OSDMonitor::get_version(version_t ver, bufferlist& bl)
4618{
f67539c2 4619 return get_version(ver, mon.get_quorum_con_features(), bl);
28e407b8
AA
4620}
4621
// Re-encode an incremental osdmap blob in-place so it uses only features the
// peer understands, clamped to the incremental's own canonical
// encode_features.  Any embedded full map or crush map is re-encoded with the
// same reduced feature set.
void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
{
  OSDMap::Incremental inc;
  auto q = bl.cbegin();
  inc.decode(q);
  // always encode with subset of osdmap's canonical features
  uint64_t f = features & inc.encode_features;
  dout(20) << __func__ << " " << inc.epoch << " with features " << f
	   << dendl;
  bl.clear();
  if (inc.fullmap.length()) {
    // embedded full map?
    OSDMap m;
    m.decode(inc.fullmap);
    inc.fullmap.clear();
    m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
  }
  if (inc.crush.length()) {
    // embedded crush map
    CrushWrapper c;
    auto p = inc.crush.cbegin();
    c.decode(p);
    inc.crush.clear();
    c.encode(inc.crush, f);
  }
  inc.encode(bl, f | CEPH_FEATURE_RESERVED);
}
4649
// Re-encode a full osdmap blob in-place with the given peer features,
// clamped to the map's own canonical encoding features.
void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
{
  OSDMap m;
  auto q = bl.cbegin();
  m.decode(q);
  // always encode with subset of osdmap's canonical features
  uint64_t f = features & m.get_encoding_features();
  dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
	   << dendl;
  bl.clear();
  m.encode(bl, f | CEPH_FEATURE_RESERVED);
}
4662
// Fetch the incremental map for ``ver``, encoded for ``features``.
// Results are cached by (version, significant-features) so that repeated
// requests from peers with equivalent feature sets hit the cache.
int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
{
  uint64_t significant_features = OSDMap::get_significant_features(features);
  if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
    return 0;
  }
  int ret = PaxosService::get_version(ver, bl);
  if (ret < 0) {
    return ret;
  }
  // NOTE: this check is imprecise; the OSDMap encoding features may
  // be a subset of the latest mon quorum features, but worst case we
  // reencode once and then cache the (identical) result under both
  // feature masks.
  if (significant_features !=
      OSDMap::get_significant_features(mon.get_quorum_con_features())) {
    reencode_incremental_map(bl, features);
  }
  inc_osd_cache.add_bytes({ver, significant_features}, bl);
  return 0;
}
4684
11fdf7f2
TL
4685int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc)
4686{
4687 bufferlist inc_bl;
4688 int err = get_version(ver, inc_bl);
4689 ceph_assert(err == 0);
4690 ceph_assert(inc_bl.length());
4691
4692 auto p = inc_bl.cbegin();
4693 inc.decode(p);
4694 dout(10) << __func__ << " "
4695 << " epoch " << inc.epoch
4696 << " inc_crc " << inc.inc_crc
4697 << " full_crc " << inc.full_crc
4698 << " encode_features " << inc.encode_features << dendl;
4699 return 0;
4700}
4701
// Reconstruct full osdmap ``ver`` from the osdmap manifest: start from the
// closest pinned (or cached) full map at or below ``ver`` and replay
// incrementals on top of it.  Returns -ENOENT if no pinned map is available;
// otherwise encodes the rebuilt map into ``bl`` and returns 0.
int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl)
{
  dout(10) << __func__ << " ver " << ver << dendl;

  version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver);
  if (closest_pinned == 0) {
    return -ENOENT;
  }
  if (closest_pinned > ver) {
    dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl;
  }
  ceph_assert(closest_pinned <= ver);

  dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl;

  // get osdmap incremental maps and apply on top of this one.
  // Prefer a cached full map newer than the pinned one to shorten the replay.
  bufferlist osdm_bl;
  bool has_cached_osdmap = false;
  for (version_t v = ver-1; v >= closest_pinned; --v) {
    if (full_osd_cache.lookup({v, mon.get_quorum_con_features()},
			      &osdm_bl)) {
      dout(10) << __func__ << " found map in cache ver " << v << dendl;
      closest_pinned = v;
      has_cached_osdmap = true;
      break;
    }
  }

  if (!has_cached_osdmap) {
    int err = PaxosService::get_version_full(closest_pinned, osdm_bl);
    if (err != 0) {
      derr << __func__ << " closest pinned map ver " << closest_pinned
	   << " not available! error: " << cpp_strerror(err) << dendl;
    }
    ceph_assert(err == 0);
  }

  ceph_assert(osdm_bl.length());

  OSDMap osdm;
  osdm.decode(osdm_bl);

  dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned
	   << " e" << osdm.epoch
	   << " crc " << osdm.get_crc()
	   << " -- applying incremental maps." << dendl;

  uint64_t encode_features = 0;
  for (version_t v = closest_pinned + 1; v <= ver; ++v) {
    dout(20) << __func__ << " applying inc epoch " << v << dendl;

    OSDMap::Incremental inc;
    int err = get_inc(v, inc);
    ceph_assert(err == 0);

    // remember the last incremental's features for the final encode below.
    encode_features = inc.encode_features;

    err = osdm.apply_incremental(inc);
    ceph_assert(err == 0);

    // this block performs paranoid checks on map retrieval
    if (g_conf().get_val<bool>("mon_debug_extra_checks") &&
	inc.full_crc != 0) {

      uint64_t f = encode_features;
      if (!f) {
	f = (mon.quorum_con_features ? mon.quorum_con_features : -1);
      }

      // encode osdmap to force calculating crcs
      bufferlist tbl;
      osdm.encode(tbl, f | CEPH_FEATURE_RESERVED);
      // decode osdmap to compare crcs with what's expected by incremental
      OSDMap tosdm;
      tosdm.decode(tbl);

      if (tosdm.get_crc() != inc.full_crc) {
	derr << __func__
	     << " osdmap crc mismatch! (osdmap crc " << tosdm.get_crc()
	     << ", expected " << inc.full_crc << ")" << dendl;
	ceph_abort_msg("osdmap crc mismatch");
      }
    }

    // note: we cannot add the recently computed map to the cache, as is,
    // because we have not encoded the map into a bl.
  }

  if (!encode_features) {
    dout(10) << __func__
	     << " last incremental map didn't have features;"
	     << " defaulting to quorum's or all" << dendl;
    encode_features =
      (mon.quorum_con_features ? mon.quorum_con_features : -1);
  }
  osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED);

  return 0;
}
4801
7c673cae
FG
4802int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
4803{
f67539c2 4804 return get_version_full(ver, mon.get_quorum_con_features(), bl);
28e407b8
AA
4805}
4806
// Fetch the full map for ``ver`` encoded for ``features``, with a
// (version, significant-features) cache in front.  If the store has trimmed
// the version, fall back to rebuilding it from the pinned-map manifest.
int OSDMonitor::get_version_full(version_t ver, uint64_t features,
				 bufferlist& bl)
{
  uint64_t significant_features = OSDMap::get_significant_features(features);
  if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
    return 0;
  }
  int ret = PaxosService::get_version_full(ver, bl);
  if (ret == -ENOENT) {
    // build map?
    ret = get_full_from_pinned_map(ver, bl);
  }
  if (ret < 0) {
    return ret;
  }
  // NOTE: this check is imprecise; the OSDMap encoding features may
  // be a subset of the latest mon quorum features, but worst case we
  // reencode once and then cache the (identical) result under both
  // feature masks.
  if (significant_features !=
      OSDMap::get_significant_features(mon.get_quorum_con_features())) {
    reencode_full_map(bl, features);
  }
  full_osd_cache.add_bytes({ver, significant_features}, bl);
  return 0;
}
4833
f67539c2 4834epoch_t OSDMonitor::blocklist(const entity_addrvec_t& av, utime_t until)
11fdf7f2 4835{
f67539c2 4836 dout(10) << "blocklist " << av << " until " << until << dendl;
11fdf7f2 4837 for (auto a : av.v) {
9f95a23c 4838 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
11fdf7f2
TL
4839 a.set_type(entity_addr_t::TYPE_ANY);
4840 } else {
4841 a.set_type(entity_addr_t::TYPE_LEGACY);
4842 }
f67539c2 4843 pending_inc.new_blocklist[a] = until;
11fdf7f2
TL
4844 }
4845 return pending_inc.epoch;
4846}
4847
f67539c2 4848epoch_t OSDMonitor::blocklist(entity_addr_t a, utime_t until)
7c673cae 4849{
9f95a23c 4850 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
11fdf7f2
TL
4851 a.set_type(entity_addr_t::TYPE_ANY);
4852 } else {
4853 a.set_type(entity_addr_t::TYPE_LEGACY);
4854 }
f67539c2
TL
4855 dout(10) << "blocklist " << a << " until " << until << dendl;
4856 pending_inc.new_blocklist[a] = until;
7c673cae
FG
4857 return pending_inc.epoch;
4858}
4859
4860
// Walk all "osdmap" subscriptions and push any maps subscribers are missing.
void OSDMonitor::check_osdmap_subs()
{
  dout(10) << __func__ << dendl;
  if (!osdmap.get_epoch()) {
    // no committed map yet; nothing to send.
    return;
  }
  auto osdmap_subs = mon.session_map.subs.find("osdmap");
  if (osdmap_subs == mon.session_map.subs.end()) {
    return;
  }
  // advance the iterator before handling the sub, since check_osdmap_sub()
  // may remove a onetime subscription from the list.
  auto p = osdmap_subs->second->begin();
  while (!p.end()) {
    auto sub = *p;
    ++p;
    check_osdmap_sub(sub);
  }
}
4878
// Service a single "osdmap" subscription: send incrementals from sub->next,
// or the latest full map if the subscriber starts from epoch 0.  Onetime
// subs are removed once satisfied; ongoing subs advance their cursor.
void OSDMonitor::check_osdmap_sub(Subscription *sub)
{
  dout(10) << __func__ << " " << sub << " next " << sub->next
	   << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
  if (sub->next <= osdmap.get_epoch()) {
    if (sub->next >= 1)
      send_incremental(sub->next, sub->session, sub->incremental_onetime);
    else
      sub->session->con->send_message(build_latest_full(sub->session->con_features));
    if (sub->onetime)
      mon.session_map.remove_sub(sub);
    else
      sub->next = osdmap.get_epoch() + 1;
  }
}
4894
// Walk all "osd_pg_creates" subscriptions and send pending pg-create
// messages to subscribed OSDs.
void OSDMonitor::check_pg_creates_subs()
{
  if (!osdmap.get_num_up_osds()) {
    return;
  }
  ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
  mon.with_session_map([this](const MonSessionMap& session_map) {
      auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
      if (pg_creates_subs == session_map.subs.end()) {
	return;
      }
      for (auto sub : *pg_creates_subs->second) {
	check_pg_creates_sub(sub);
      }
    });
}
4911
4912void OSDMonitor::check_pg_creates_sub(Subscription *sub)
4913{
11fdf7f2
TL
4914 dout(20) << __func__ << " .. " << sub->session->name << dendl;
4915 ceph_assert(sub->type == "osd_pg_creates");
7c673cae
FG
4916 // only send these if the OSD is up. we will check_subs() when they do
4917 // come up so they will get the creates then.
11fdf7f2 4918 if (sub->session->name.is_osd() &&
f67539c2 4919 mon.osdmon()->osdmap.is_up(sub->session->name.num())) {
11fdf7f2 4920 sub->next = send_pg_creates(sub->session->name.num(),
7c673cae
FG
4921 sub->session->con.get(),
4922 sub->next);
4923 }
4924}
4925
c07f9fc5 4926void OSDMonitor::do_application_enable(int64_t pool_id,
11fdf7f2
TL
4927 const std::string &app_name,
4928 const std::string &app_key,
1911f103
TL
4929 const std::string &app_value,
4930 bool force)
c07f9fc5 4931{
f67539c2 4932 ceph_assert(paxos.is_plugged() && is_writeable());
c07f9fc5
FG
4933
4934 dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
4935 << dendl;
4936
9f95a23c 4937 ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);
35e4c445 4938
c07f9fc5 4939 auto pp = osdmap.get_pg_pool(pool_id);
11fdf7f2 4940 ceph_assert(pp != nullptr);
c07f9fc5
FG
4941
4942 pg_pool_t p = *pp;
4943 if (pending_inc.new_pools.count(pool_id)) {
4944 p = pending_inc.new_pools[pool_id];
4945 }
4946
11fdf7f2
TL
4947 if (app_key.empty()) {
4948 p.application_metadata.insert({app_name, {}});
4949 } else {
1911f103
TL
4950 if (force) {
4951 p.application_metadata[app_name][app_key] = app_value;
4952 } else {
4953 p.application_metadata.insert({app_name, {{app_key, app_value}}});
4954 }
11fdf7f2 4955 }
c07f9fc5
FG
4956 p.last_change = pending_inc.epoch;
4957 pending_inc.new_pools[pool_id] = p;
4958}
4959
494da23a
TL
// Stage a pool-option change in the pending incremental, copying the current
// pool definition into pending_inc.new_pools if it is not already there.
void OSDMonitor::do_set_pool_opt(int64_t pool_id,
				 pool_opts_t::key_t opt,
				 pool_opts_t::value_t val)
{
  dout(10) << __func__ << " pool: " << pool_id << " option: " << opt
	   << " val: " << val << dendl;
  // try_emplace only copies the current pool when no pending copy exists yet.
  auto p = pending_inc.new_pools.try_emplace(
    pool_id, *osdmap.get_pg_pool(pool_id));
  p.first->second.opts.set(opt, val);
}
4970
31f18b77 4971unsigned OSDMonitor::scan_for_creating_pgs(
7c673cae
FG
4972 const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
4973 const mempool::osdmap::set<int64_t>& removed_pools,
4974 utime_t modified,
4975 creating_pgs_t* creating_pgs) const
4976{
31f18b77 4977 unsigned queued = 0;
7c673cae
FG
4978 for (auto& p : pools) {
4979 int64_t poolid = p.first;
11fdf7f2
TL
4980 if (creating_pgs->created_pools.count(poolid)) {
4981 dout(10) << __func__ << " already created " << poolid << dendl;
4982 continue;
4983 }
7c673cae 4984 const pg_pool_t& pool = p.second;
20effc67 4985 int ruleno = pool.get_crush_rule();
7c673cae
FG
4986 if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
4987 continue;
4988
4989 const auto last_scan_epoch = creating_pgs->last_scan_epoch;
4990 const auto created = pool.get_last_change();
4991 if (last_scan_epoch && created <= last_scan_epoch) {
4992 dout(10) << __func__ << " no change in pool " << poolid
4993 << " " << pool << dendl;
4994 continue;
4995 }
4996 if (removed_pools.count(poolid)) {
4997 dout(10) << __func__ << " pool is being removed: " << poolid
4998 << " " << pool << dendl;
4999 continue;
5000 }
31f18b77 5001 dout(10) << __func__ << " queueing pool create for " << poolid
7c673cae 5002 << " " << pool << dendl;
11fdf7f2
TL
5003 creating_pgs->create_pool(poolid, pool.get_pg_num(),
5004 created, modified);
5005 queued++;
7c673cae 5006 }
31f18b77 5007 return queued;
7c673cae
FG
5008}
5009
// Rebuild creating_pgs_by_osd_epoch from the current creating_pgs set and
// the latest pg-to-osd mapping.  For each creating pg, pick the epoch to
// advertise: keep the previous epoch if the acting primary is unchanged,
// otherwise use the current mapping epoch so the new target gets the create.
void OSDMonitor::update_creating_pgs()
{
  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
	   << creating_pgs.queue.size() << " pools in queue" << dendl;
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (const auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    if (!osdmap.pg_exists(pgid)) {
      dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
	       << dendl;
      continue;
    }
    auto mapped = pg.second.create_epoch;
    dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
    spg_t spgid(pgid);
    mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
	if (pgs.second.count(spgid)) {
	  if (last_acting_primary == acting_primary) {
	    mapped = pgs.first;
	  } else {
	    dout(20) << __func__ << " " << pgid << " "
		     << " acting_primary:" << last_acting_primary
		     << " -> " << acting_primary << dendl;
	    // note epoch if the target of the create message changed.
	    mapped = mapping.get_epoch();
	  }
	  break;
	} else {
	  // newly creating
	  mapped = mapping.get_epoch();
	}
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
	     << " to create " << pgid << "@" << mapped << dendl;
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid);
  }
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
5057
c07f9fc5 5058epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
7c673cae
FG
5059{
5060 dout(30) << __func__ << " osd." << osd << " next=" << next
5061 << " " << creating_pgs_by_osd_epoch << dendl;
5062 std::lock_guard<std::mutex> l(creating_pgs_lock);
b5b8bbf5
FG
5063 if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
5064 dout(20) << __func__
5065 << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
5066 // the subscribers will be updated when the mapping is completed anyway
5067 return next;
5068 }
7c673cae
FG
5069 auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
5070 if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
5071 return next;
11fdf7f2
TL
5072 ceph_assert(!creating_pgs_by_epoch->second.empty());
5073
1e59de90 5074 auto m = make_message<MOSDPGCreate2>(creating_pgs_epoch);
7c673cae 5075
7c673cae
FG
5076 epoch_t last = 0;
5077 for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
5078 epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
5079 auto epoch = epoch_pgs->first;
5080 auto& pgs = epoch_pgs->second;
5081 dout(20) << __func__ << " osd." << osd << " from " << next
5082 << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
5083 last = epoch;
5084 for (auto& pg : pgs) {
7c673cae
FG
5085 // Need the create time from the monitor using its clock to set
5086 // last_scrub_stamp upon pg creation.
11fdf7f2
TL
5087 auto create = creating_pgs.pgs.find(pg.pgid);
5088 ceph_assert(create != creating_pgs.pgs.end());
1e59de90
TL
5089 m->pgs.emplace(pg, make_pair(create->second.create_epoch,
5090 create->second.create_stamp));
5091 if (create->second.history.epoch_created) {
5092 dout(20) << __func__ << " " << pg << " " << create->second.history
5093 << " " << create->second.past_intervals << dendl;
5094 m->pg_extra.emplace(pg, make_pair(create->second.history,
5095 create->second.past_intervals));
11fdf7f2 5096 }
7c673cae 5097 dout(20) << __func__ << " will create " << pg
1e59de90 5098 << " at " << create->second.create_epoch << dendl;
7c673cae
FG
5099 }
5100 }
1e59de90
TL
5101 if (!m->pgs.empty()) {
5102 con->send_message2(std::move(m));
11fdf7f2 5103 } else {
7c673cae
FG
5104 dout(20) << __func__ << " osd." << osd << " from " << next
5105 << " has nothing to send" << dendl;
5106 return next;
5107 }
11fdf7f2 5108
7c673cae
FG
5109 // sub is current through last + 1
5110 return last + 1;
5111}
5112
5113// TICK
5114
5115
// Periodic housekeeping.  On every monitor: reload the osdmap manifest and
// rebalance the priority cache.  On the leader only: time out silent OSDs,
// mark long-down OSDs out, expire blocklist entries, prune purged snaps,
// refresh pool status, and propose a new map epoch if anything changed.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // always update osdmap manifest, regardless of being the leader.
  load_osdmap_manifest();

  // always tune priority cache manager memory on leader and peons
  if (ceph_using_tcmalloc() && mon_memory_autotune) {
    std::lock_guard l(balancer_lock);
    if (pcm != nullptr) {
      pcm->tune_memory();
      pcm->balance();
      _set_new_cache_sizes();
      dout(10) << "tick balancer "
	       << " inc cache_bytes: " << inc_cache->get_cache_bytes()
	       << " inc comtd_bytes: " << inc_cache->get_committed_size()
	       << " inc used_bytes: " << inc_cache->_get_used_bytes()
	       << " inc num_osdmaps: " << inc_cache->_get_num_osdmaps()
	       << dendl;
      dout(10) << "tick balancer "
	       << " full cache_bytes: " << full_cache->get_cache_bytes()
	       << " full comtd_bytes: " << full_cache->get_committed_size()
	       << " full used_bytes: " << full_cache->_get_used_bytes()
	       << " full num_osdmaps: " << full_cache->_get_num_osdmaps()
	       << dendl;
    }
  }

  // everything below mutates pending state, which only the leader owns.
  if (!mon.is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  if (handle_osd_timeouts(now, last_osd_report)) {
    do_propose = true;
  }

  // mark osds down?
  if (check_failures(now)) {
    do_propose = true;
  }

  // Force a proposal if we need to prune; pruning is performed on
  // ``encode_pending()``, hence why we need to regularly trigger a proposal
  // even if there's nothing going on.
  if (is_prune_enabled() && should_prune()) {
    do_propose = true;
  }

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    string down_out_subtree_limit = g_conf().get_val<string>(
      "mon_osd_down_out_subtree_limit");
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      utime_t down = now;
      down -= i->second;  // how long this osd has been down
      ++i;

      if (osdmap.is_down(o) &&
	  osdmap.is_in(o) &&
	  can_mark_out(o)) {
	utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
	utime_t grace = orig_grace;
	double my_grace = 0.0;

	if (g_conf()->mon_osd_adjust_down_out_interval) {
	  // scale grace period the same way we do the heartbeat grace.
	  const osd_xinfo_t& xi = osdmap.get_xinfo(o);
	  double halflife = (double)g_conf()->mon_osd_laggy_halflife;
	  double decay_k = ::log(.5) / halflife;
	  double decay = exp((double)down * decay_k);
	  dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
		   << " down for " << down << " decay " << decay << dendl;
	  my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
	  grace += my_grace;
	}

	// is this an entire large subtree down?
	if (down_out_subtree_limit.length()) {
	  int type = osdmap.crush->get_type_id(down_out_subtree_limit);
	  if (type > 0) {
	    if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
	      dout(10) << "tick entire containing " << down_out_subtree_limit
		       << " subtree for osd." << o
		       << " is down; resetting timer" << dendl;
	      // reset timer, too.
	      down_pending_out[o] = now;
	      continue;
	    }
	  }
	}

	bool down_out = !osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace;
	bool destroyed_out = osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_destroyed_out_interval > 0 &&
	  // this is not precise enough as we did not make a note when this osd
	  // was marked as destroyed, but let's not bother with that
	  // complexity for now.
	  down.sec() >= g_conf()->mon_osd_destroyed_out_interval;
	if (down_out || destroyed_out) {
	  dout(10) << "tick marking osd." << o << " OUT after " << down
		   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
	  pending_inc.new_weight[o] = CEPH_OSD_OUT;

	  // set the AUTOOUT bit.
	  if (pending_inc.new_state.count(o) == 0)
	    pending_inc.new_state[o] = 0;
	  pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

	  // remember previous weight
	  if (pending_inc.new_xinfo.count(o) == 0)
	    pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

	  do_propose = true;

	  mon.clog->info() << "Marking osd." << o << " out (has been down for "
			   << int(down.sec()) << " seconds)";
	} else
	  continue;
      }

      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blocklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin();
       p != osdmap.blocklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blocklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blocklist.push_back(p->first);
      do_propose = true;
    }
  }
  // same for range-based blocklist entries.
  for (auto p = osdmap.range_blocklist.begin();
       p != osdmap.range_blocklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring range_blocklist item " << p->first
	       << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_range_blocklist.push_back(p->first);
      do_propose = true;
    }
  }

  if (try_prune_purged_snaps()) {
    do_propose = true;
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty())  // also propose if we adjusted pg_temp
    propose_pending();
}
5290
// Apply the byte budgets the priority cache manager committed to the inc/full
// osdmap caches.  With no pcm or kv cache, budgets fall back to zero.
void OSDMonitor::_set_new_cache_sizes()
{
  uint64_t cache_size = 0;
  int64_t inc_alloc = 0;
  int64_t full_alloc = 0;
  int64_t kv_alloc = 0;

  if (pcm != nullptr && rocksdb_binned_kv_cache != nullptr) {
    cache_size = pcm->get_tuned_mem();
    inc_alloc = inc_cache->get_committed_size();
    full_alloc = full_cache->get_committed_size();
    kv_alloc = rocksdb_binned_kv_cache->get_committed_size();
  }

  inc_osd_cache.set_bytes(inc_alloc);
  full_osd_cache.set_bytes(full_alloc);

  dout(1) << __func__ << " cache_size:" << cache_size
	  << " inc_alloc: " << inc_alloc
	  << " full_alloc: " << full_alloc
	  << " kv_alloc: " << kv_alloc
	  << dendl;
}
5314
// Mark OSDs down that have not sent a beacon recently.  ``last_osd_report``
// maps osd id -> (last beacon time, osd's beacon interval).  Returns true if
// any OSD was newly marked down in the pending incremental.
bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
				     std::map<int, std::pair<utime_t, int>> &last_osd_report)
{
  utime_t timeo(g_conf()->mon_osd_report_timeout, 0);
  if (now - mon.get_leader_since() < timeo) {
    // We haven't been the leader for long enough to consider OSD timeouts
    return false;
  }

  int max_osd = osdmap.get_max_osd();
  bool new_down = false;

  for (int i=0; i < max_osd; ++i) {
    dout(30) << __func__ << ": checking up on osd " << i << dendl;
    if (!osdmap.exists(i)) {
      last_osd_report.erase(i); // if any
      continue;
    }
    if (!osdmap.is_up(i))
      continue;
    const std::map<int, std::pair<utime_t, int>>::const_iterator t = last_osd_report.find(i);
    if (t == last_osd_report.end()) {
      // it wasn't in the map; start the timer.
      last_osd_report[i].first = now;
      last_osd_report[i].second = 0;
    } else if (can_mark_down(i)) {
      utime_t diff = now - t->second.first;
      // we use the max(mon_osd_report_timeout, 2*osd_beacon_report_interval) as timeout
      // to allow for the osd to miss a beacon.
      int mon_osd_report_timeout = g_conf()->mon_osd_report_timeout;
      utime_t max_timeout(std::max(mon_osd_report_timeout, 2 * t->second.second), 0);
      if (diff > max_timeout) {
	mon.clog->info() << "osd." << i << " marked down after no beacon for "
			 << diff << " seconds";
	derr << "no beacon from osd." << i << " since " << t->second.first
	     << ", " << diff << " seconds ago. marking down" << dendl;
	// XOR the UP bit in the pending incremental -> osd goes down.
	pending_inc.new_state[i] = CEPH_OSD_UP;
	new_down = true;
      }
    }
  }
  return new_down;
}
5358
11fdf7f2
TL
// Parse a cpu-set list string (e.g. "0-3,8") and dump the individual cpu ids
// as a formatter array named ``name``.  Silently dumps nothing on parse
// failure.
static void dump_cpu_list(Formatter *f, const char *name,
			  const string& strlist)
{
  cpu_set_t cpu_set;
  size_t cpu_set_size;
  if (parse_cpu_set_list(strlist.c_str(), &cpu_set_size, &cpu_set) < 0) {
    return;
  }
  set<int> cpus = cpu_set_to_set(cpu_set_size, &cpu_set);
  f->open_array_section(name);
  for (auto cpu : cpus) {
    f->dump_int("cpu", cpu);
  }
  f->close_section();
}
5374
// Dump monitor-side OSD state for debugging/inspection: the osdmap, per-osd
// metadata, clean-epoch bookkeeping, committed-version range, the crush map,
// and (if present) the osdmap manifest.
void OSDMonitor::dump_info(Formatter *f)
{
  f->open_object_section("osdmap");
  osdmap.dump(f, cct);
  f->close_section();

  f->open_array_section("osd_metadata");
  for (int i=0; i<osdmap.get_max_osd(); ++i) {
    if (osdmap.exists(i)) {
      f->open_object_section("osd");
      f->dump_unsigned("id", i);
      dump_osd_metadata(i, f, NULL);
      f->close_section();
    }
  }
  f->close_section();

  f->open_object_section("osdmap_clean_epochs");
  f->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());

  f->open_object_section("last_epoch_clean");
  last_epoch_clean.dump(f);
  f->close_section();

  f->open_array_section("osd_epochs");
  for (auto& osd_epoch : osd_epochs) {
    f->open_object_section("osd");
    f->dump_unsigned("id", osd_epoch.first);
    f->dump_unsigned("epoch", osd_epoch.second);
    f->close_section();
  }
  f->close_section(); // osd_epochs

  f->close_section(); // osd_clean_epochs

  f->dump_unsigned("osdmap_first_committed", get_first_committed());
  f->dump_unsigned("osdmap_last_committed", get_last_committed());

  f->open_object_section("crushmap");
  osdmap.crush->dump(f);
  f->close_section();

  if (has_osdmap_manifest) {
    f->open_object_section("osdmap_manifest");
    osdmap_manifest.dump(f);
    f->close_section();
  }
}
5423
namespace {
  // Keys accepted by "osd pool get"; order mirrors the option groups in the
  // command table.
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
    PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
    PG_AUTOSCALE_BIAS, DEDUP_TIER, DEDUP_CHUNK_ALGORITHM,
    DEDUP_CDC_CHUNK_SIZE, POOL_EIO, BULK, PG_NUM_MAX };

  // Return the set difference first \ second, i.e. the choices present in
  // ``first`` that do not appear in ``second``.
  std::set<osd_pool_get_choices>
  subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			     const std::set<osd_pool_get_choices>& second)
  {
    std::set<osd_pool_get_choices> result;
    for (const auto choice : first) {
      if (second.count(choice) == 0) {
	result.insert(choice);
      }
    }
    return result;
  }
}
5458
5459
5460bool OSDMonitor::preprocess_command(MonOpRequestRef op)
5461{
5462 op->mark_osdmon_event(__func__);
9f95a23c 5463 auto m = op->get_req<MMonCommand>();
7c673cae
FG
5464 int r = 0;
5465 bufferlist rdata;
5466 stringstream ss, ds;
5467
11fdf7f2 5468 cmdmap_t cmdmap;
7c673cae
FG
5469 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
5470 string rs = ss.str();
f67539c2 5471 mon.reply_command(op, -EINVAL, rs, get_last_committed());
7c673cae
FG
5472 return true;
5473 }
5474
11fdf7f2 5475 MonSession *session = op->get_session();
7c673cae 5476 if (!session) {
11fdf7f2 5477 derr << __func__ << " no session" << dendl;
f67539c2 5478 mon.reply_command(op, -EACCES, "access denied", get_last_committed());
7c673cae
FG
5479 return true;
5480 }
5481
5482 string prefix;
9f95a23c 5483 cmd_getval(cmdmap, "prefix", prefix);
7c673cae 5484
20effc67 5485 string format = cmd_getval_or<string>(cmdmap, "format", "plain");
7c673cae
FG
5486 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5487
5488 if (prefix == "osd stat") {
92f5a8d4
TL
5489 if (f) {
5490 f->open_object_section("osdmap");
5491 osdmap.print_summary(f.get(), ds, "", true);
5492 f->close_section();
7c673cae 5493 f->flush(rdata);
92f5a8d4
TL
5494 } else {
5495 osdmap.print_summary(nullptr, ds, "", true);
7c673cae 5496 rdata.append(ds);
92f5a8d4 5497 }
7c673cae 5498 }
7c673cae
FG
5499 else if (prefix == "osd dump" ||
5500 prefix == "osd tree" ||
11fdf7f2 5501 prefix == "osd tree-from" ||
7c673cae
FG
5502 prefix == "osd ls" ||
5503 prefix == "osd getmap" ||
31f18b77 5504 prefix == "osd getcrushmap" ||
9f95a23c
TL
5505 prefix == "osd ls-tree" ||
5506 prefix == "osd info") {
7c673cae 5507
20effc67 5508 epoch_t epoch = cmd_getval_or<int64_t>(cmdmap, "epoch", osdmap.get_epoch());
7c673cae
FG
5509 bufferlist osdmap_bl;
5510 int err = get_version_full(epoch, osdmap_bl);
5511 if (err == -ENOENT) {
5512 r = -ENOENT;
5513 ss << "there is no map for epoch " << epoch;
5514 goto reply;
5515 }
11fdf7f2
TL
5516 ceph_assert(err == 0);
5517 ceph_assert(osdmap_bl.length());
7c673cae
FG
5518
5519 OSDMap *p;
5520 if (epoch == osdmap.get_epoch()) {
5521 p = &osdmap;
5522 } else {
5523 p = new OSDMap;
5524 p->decode(osdmap_bl);
5525 }
5526
224ce89b
WB
5527 auto sg = make_scope_guard([&] {
5528 if (p != &osdmap) {
5529 delete p;
5530 }
5531 });
5532
7c673cae
FG
5533 if (prefix == "osd dump") {
5534 stringstream ds;
5535 if (f) {
5536 f->open_object_section("osdmap");
1e59de90 5537 p->dump(f.get(), cct);
7c673cae
FG
5538 f->close_section();
5539 f->flush(ds);
5540 } else {
1e59de90 5541 p->print(cct, ds);
7c673cae
FG
5542 }
5543 rdata.append(ds);
5544 if (!f)
5545 ds << " ";
5546 } else if (prefix == "osd ls") {
5547 if (f) {
5548 f->open_array_section("osds");
5549 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5550 if (osdmap.exists(i)) {
5551 f->dump_int("osd", i);
5552 }
5553 }
5554 f->close_section();
5555 f->flush(ds);
5556 } else {
5557 bool first = true;
5558 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5559 if (osdmap.exists(i)) {
5560 if (!first)
5561 ds << "\n";
5562 first = false;
5563 ds << i;
5564 }
5565 }
5566 }
5567 rdata.append(ds);
9f95a23c
TL
5568 } else if (prefix == "osd info") {
5569 int64_t osd_id;
5570 bool do_single_osd = true;
5571 if (!cmd_getval(cmdmap, "id", osd_id)) {
5572 do_single_osd = false;
5573 }
5574
5575 if (do_single_osd && !osdmap.exists(osd_id)) {
5576 ss << "osd." << osd_id << " does not exist";
5577 r = -EINVAL;
5578 goto reply;
5579 }
5580
5581 if (f) {
5582 if (do_single_osd) {
5583 osdmap.dump_osd(osd_id, f.get());
5584 } else {
5585 osdmap.dump_osds(f.get());
5586 }
5587 f->flush(ds);
5588 } else {
5589 if (do_single_osd) {
5590 osdmap.print_osd(osd_id, ds);
5591 } else {
5592 osdmap.print_osds(ds);
5593 }
5594 }
5595 rdata.append(ds);
11fdf7f2
TL
5596 } else if (prefix == "osd tree" || prefix == "osd tree-from") {
5597 string bucket;
5598 if (prefix == "osd tree-from") {
9f95a23c 5599 cmd_getval(cmdmap, "bucket", bucket);
11fdf7f2
TL
5600 if (!osdmap.crush->name_exists(bucket)) {
5601 ss << "bucket '" << bucket << "' does not exist";
5602 r = -ENOENT;
5603 goto reply;
5604 }
5605 int id = osdmap.crush->get_item_id(bucket);
5606 if (id >= 0) {
5607 ss << "\"" << bucket << "\" is not a bucket";
5608 r = -EINVAL;
5609 goto reply;
5610 }
5611 }
5612
31f18b77 5613 vector<string> states;
9f95a23c 5614 cmd_getval(cmdmap, "states", states);
31f18b77
FG
5615 unsigned filter = 0;
5616 for (auto& s : states) {
5617 if (s == "up") {
5618 filter |= OSDMap::DUMP_UP;
5619 } else if (s == "down") {
5620 filter |= OSDMap::DUMP_DOWN;
5621 } else if (s == "in") {
5622 filter |= OSDMap::DUMP_IN;
5623 } else if (s == "out") {
5624 filter |= OSDMap::DUMP_OUT;
c07f9fc5
FG
5625 } else if (s == "destroyed") {
5626 filter |= OSDMap::DUMP_DESTROYED;
31f18b77
FG
5627 } else {
5628 ss << "unrecognized state '" << s << "'";
5629 r = -EINVAL;
5630 goto reply;
5631 }
5632 }
5633 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
c07f9fc5
FG
5634 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
5635 ss << "cannot specify both 'in' and 'out'";
5636 r = -EINVAL;
5637 goto reply;
5638 }
5639 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
5640 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
5641 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
5642 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
5643 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
5644 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
5645 ss << "can specify only one of 'up', 'down' and 'destroyed'";
31f18b77
FG
5646 r = -EINVAL;
5647 goto reply;
5648 }
7c673cae
FG
5649 if (f) {
5650 f->open_object_section("tree");
11fdf7f2 5651 p->print_tree(f.get(), NULL, filter, bucket);
7c673cae
FG
5652 f->close_section();
5653 f->flush(ds);
5654 } else {
11fdf7f2 5655 p->print_tree(NULL, &ds, filter, bucket);
7c673cae
FG
5656 }
5657 rdata.append(ds);
5658 } else if (prefix == "osd getmap") {
5659 rdata.append(osdmap_bl);
5660 ss << "got osdmap epoch " << p->get_epoch();
5661 } else if (prefix == "osd getcrushmap") {
f67539c2 5662 p->crush->encode(rdata, mon.get_quorum_con_features());
31f18b77
FG
5663 ss << p->get_crush_version();
5664 } else if (prefix == "osd ls-tree") {
5665 string bucket_name;
9f95a23c 5666 cmd_getval(cmdmap, "name", bucket_name);
31f18b77
FG
5667 set<int> osds;
5668 r = p->get_osds_by_bucket_name(bucket_name, &osds);
5669 if (r == -ENOENT) {
5670 ss << "\"" << bucket_name << "\" does not exist";
5671 goto reply;
5672 } else if (r < 0) {
5673 ss << "can not parse bucket name:\"" << bucket_name << "\"";
5674 goto reply;
5675 }
5676
5677 if (f) {
5678 f->open_array_section("osds");
5679 for (auto &i : osds) {
5680 if (osdmap.exists(i)) {
5681 f->dump_int("osd", i);
5682 }
5683 }
5684 f->close_section();
5685 f->flush(ds);
5686 } else {
5687 bool first = true;
5688 for (auto &i : osds) {
5689 if (osdmap.exists(i)) {
5690 if (!first)
5691 ds << "\n";
5692 first = false;
5693 ds << i;
5694 }
5695 }
5696 }
5697
5698 rdata.append(ds);
7c673cae 5699 }
7c673cae
FG
5700 } else if (prefix == "osd getmaxosd") {
5701 if (f) {
5702 f->open_object_section("getmaxosd");
5703 f->dump_unsigned("epoch", osdmap.get_epoch());
5704 f->dump_int("max_osd", osdmap.get_max_osd());
5705 f->close_section();
5706 f->flush(rdata);
5707 } else {
5708 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
5709 rdata.append(ds);
5710 }
5711 } else if (prefix == "osd utilization") {
5712 string out;
5713 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
5714 if (f)
5715 f->flush(rdata);
5716 else
5717 rdata.append(out);
5718 r = 0;
5719 goto reply;
5720 } else if (prefix == "osd find") {
5721 int64_t osd;
9f95a23c 5722 if (!cmd_getval(cmdmap, "id", osd)) {
7c673cae
FG
5723 ss << "unable to parse osd id value '"
5724 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5725 r = -EINVAL;
5726 goto reply;
5727 }
5728 if (!osdmap.exists(osd)) {
5729 ss << "osd." << osd << " does not exist";
5730 r = -ENOENT;
5731 goto reply;
5732 }
5733 string format;
9f95a23c 5734 cmd_getval(cmdmap, "format", format);
7c673cae
FG
5735 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5736 f->open_object_section("osd_location");
5737 f->dump_int("osd", osd);
11fdf7f2 5738 f->dump_object("addrs", osdmap.get_addrs(osd));
f64942e4 5739 f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
11fdf7f2
TL
5740
5741 // try to identify host, pod/container name, etc.
5742 map<string,string> m;
5743 load_metadata(osd, m, nullptr);
5744 if (auto p = m.find("hostname"); p != m.end()) {
5745 f->dump_string("host", p->second);
5746 }
5747 for (auto& k : {
5748 "pod_name", "pod_namespace", // set by rook
9f95a23c 5749 "container_name" // set by cephadm, ceph-ansible
11fdf7f2
TL
5750 }) {
5751 if (auto p = m.find(k); p != m.end()) {
5752 f->dump_string(k, p->second);
5753 }
5754 }
5755
5756 // crush is helpful too
7c673cae
FG
5757 f->open_object_section("crush_location");
5758 map<string,string> loc = osdmap.crush->get_full_location(osd);
5759 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
5760 f->dump_string(p->first.c_str(), p->second);
5761 f->close_section();
5762 f->close_section();
5763 f->flush(rdata);
5764 } else if (prefix == "osd metadata") {
5765 int64_t osd = -1;
5766 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
9f95a23c 5767 !cmd_getval(cmdmap, "id", osd)) {
7c673cae
FG
5768 ss << "unable to parse osd id value '"
5769 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5770 r = -EINVAL;
5771 goto reply;
5772 }
5773 if (osd >= 0 && !osdmap.exists(osd)) {
5774 ss << "osd." << osd << " does not exist";
5775 r = -ENOENT;
5776 goto reply;
5777 }
5778 string format;
9f95a23c 5779 cmd_getval(cmdmap, "format", format);
7c673cae
FG
5780 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5781 if (osd >= 0) {
5782 f->open_object_section("osd_metadata");
5783 f->dump_unsigned("id", osd);
5784 r = dump_osd_metadata(osd, f.get(), &ss);
5785 if (r < 0)
5786 goto reply;
5787 f->close_section();
5788 } else {
5789 r = 0;
5790 f->open_array_section("osd_metadata");
5791 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5792 if (osdmap.exists(i)) {
5793 f->open_object_section("osd");
5794 f->dump_unsigned("id", i);
5795 r = dump_osd_metadata(i, f.get(), NULL);
5796 if (r == -EINVAL || r == -ENOENT) {
5797 // Drop error, continue to get other daemons' metadata
5798 dout(4) << "No metadata for osd." << i << dendl;
5799 r = 0;
5800 } else if (r < 0) {
5801 // Unexpected error
5802 goto reply;
5803 }
5804 f->close_section();
5805 }
5806 }
5807 f->close_section();
5808 }
5809 f->flush(rdata);
31f18b77
FG
5810 } else if (prefix == "osd versions") {
5811 if (!f)
5812 f.reset(Formatter::create("json-pretty"));
5813 count_metadata("ceph_version", f.get());
5814 f->flush(rdata);
5815 r = 0;
5816 } else if (prefix == "osd count-metadata") {
5817 if (!f)
5818 f.reset(Formatter::create("json-pretty"));
5819 string field;
9f95a23c 5820 cmd_getval(cmdmap, "property", field);
31f18b77
FG
5821 count_metadata(field, f.get());
5822 f->flush(rdata);
5823 r = 0;
11fdf7f2
TL
5824 } else if (prefix == "osd numa-status") {
5825 TextTable tbl;
5826 if (f) {
5827 f->open_array_section("osds");
5828 } else {
5829 tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT);
5830 tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT);
5831 tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT);
5832 tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT);
5833 tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT);
5834 tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT);
5835 }
5836 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5837 if (osdmap.exists(i)) {
5838 map<string,string> m;
5839 ostringstream err;
5840 if (load_metadata(i, m, &err) < 0) {
5841 continue;
5842 }
5843 string host;
5844 auto p = m.find("hostname");
5845 if (p != m.end()) {
5846 host = p->second;
5847 }
5848 if (f) {
5849 f->open_object_section("osd");
5850 f->dump_int("osd", i);
5851 f->dump_string("host", host);
5852 for (auto n : { "network_numa_node", "objectstore_numa_node",
5853 "numa_node" }) {
5854 p = m.find(n);
5855 if (p != m.end()) {
5856 f->dump_int(n, atoi(p->second.c_str()));
5857 }
5858 }
5859 for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) {
5860 p = m.find(n);
5861 if (p != m.end()) {
5862 list<string> ls = get_str_list(p->second, ",");
5863 f->open_array_section(n);
5864 for (auto node : ls) {
5865 f->dump_int("node", atoi(node.c_str()));
5866 }
5867 f->close_section();
5868 }
5869 }
5870 for (auto n : { "numa_node_cpus" }) {
5871 p = m.find(n);
5872 if (p != m.end()) {
5873 dump_cpu_list(f.get(), n, p->second);
5874 }
5875 }
5876 f->close_section();
5877 } else {
5878 tbl << i;
5879 tbl << host;
5880 p = m.find("network_numa_nodes");
5881 if (p != m.end()) {
5882 tbl << p->second;
5883 } else {
5884 tbl << "-";
5885 }
5886 p = m.find("objectstore_numa_nodes");
5887 if (p != m.end()) {
5888 tbl << p->second;
5889 } else {
5890 tbl << "-";
5891 }
5892 p = m.find("numa_node");
5893 auto q = m.find("numa_node_cpus");
5894 if (p != m.end() && q != m.end()) {
5895 tbl << p->second;
5896 tbl << q->second;
5897 } else {
5898 tbl << "-";
5899 tbl << "-";
5900 }
5901 tbl << TextTable::endrow;
5902 }
5903 }
5904 }
5905 if (f) {
5906 f->close_section();
5907 f->flush(rdata);
5908 } else {
5909 rdata.append(stringify(tbl));
5910 }
7c673cae
FG
5911 } else if (prefix == "osd map") {
5912 string poolstr, objstr, namespacestr;
9f95a23c
TL
5913 cmd_getval(cmdmap, "pool", poolstr);
5914 cmd_getval(cmdmap, "object", objstr);
5915 cmd_getval(cmdmap, "nspace", namespacestr);
7c673cae
FG
5916
5917 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5918 if (pool < 0) {
5919 ss << "pool " << poolstr << " does not exist";
5920 r = -ENOENT;
5921 goto reply;
5922 }
5923 object_locator_t oloc(pool, namespacestr);
5924 object_t oid(objstr);
5925 pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
5926 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5927 vector<int> up, acting;
5928 int up_p, acting_p;
5929 osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
5930
5931 string fullobjname;
5932 if (!namespacestr.empty())
5933 fullobjname = namespacestr + string("/") + oid.name;
5934 else
5935 fullobjname = oid.name;
5936 if (f) {
5937 f->open_object_section("osd_map");
5938 f->dump_unsigned("epoch", osdmap.get_epoch());
5939 f->dump_string("pool", poolstr);
5940 f->dump_int("pool_id", pool);
5941 f->dump_stream("objname") << fullobjname;
5942 f->dump_stream("raw_pgid") << pgid;
5943 f->dump_stream("pgid") << mpgid;
5944 f->open_array_section("up");
5945 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
5946 f->dump_int("osd", *p);
5947 f->close_section();
5948 f->dump_int("up_primary", up_p);
5949 f->open_array_section("acting");
5950 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
5951 f->dump_int("osd", *p);
5952 f->close_section();
5953 f->dump_int("acting_primary", acting_p);
5954 f->close_section(); // osd_map
5955 f->flush(rdata);
5956 } else {
5957 ds << "osdmap e" << osdmap.get_epoch()
5958 << " pool '" << poolstr << "' (" << pool << ")"
5959 << " object '" << fullobjname << "' ->"
5960 << " pg " << pgid << " (" << mpgid << ")"
5961 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
5962 << pg_vector_string(acting) << ", p" << acting_p << ")";
5963 rdata.append(ds);
5964 }
5965
5966 } else if (prefix == "pg map") {
5967 pg_t pgid;
7c673cae 5968 vector<int> up, acting;
1e59de90
TL
5969 r = parse_pgid(cmdmap, ss, pgid);
5970 if (r < 0)
7c673cae 5971 goto reply;
7c673cae
FG
5972 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5973 osdmap.pg_to_up_acting_osds(pgid, up, acting);
5974 if (f) {
5975 f->open_object_section("pg_map");
5976 f->dump_unsigned("epoch", osdmap.get_epoch());
5977 f->dump_stream("raw_pgid") << pgid;
5978 f->dump_stream("pgid") << mpgid;
5979 f->open_array_section("up");
5980 for (auto osd : up) {
5981 f->dump_int("up_osd", osd);
5982 }
5983 f->close_section();
5984 f->open_array_section("acting");
5985 for (auto osd : acting) {
5986 f->dump_int("acting_osd", osd);
5987 }
5988 f->close_section();
5989 f->close_section();
5990 f->flush(rdata);
5991 } else {
5992 ds << "osdmap e" << osdmap.get_epoch()
5993 << " pg " << pgid << " (" << mpgid << ")"
5994 << " -> up " << up << " acting " << acting;
5995 rdata.append(ds);
5996 }
5997 goto reply;
5998
7c673cae 5999 } else if (prefix == "osd lspools") {
7c673cae
FG
6000 if (f)
6001 f->open_array_section("pools");
6002 for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
6003 p != osdmap.pools.end();
6004 ++p) {
11fdf7f2
TL
6005 if (f) {
6006 f->open_object_section("pool");
6007 f->dump_int("poolnum", p->first);
6008 f->dump_string("poolname", osdmap.pool_name[p->first]);
6009 f->close_section();
6010 } else {
6011 ds << p->first << ' ' << osdmap.pool_name[p->first];
6012 if (next(p) != osdmap.pools.end()) {
6013 ds << '\n';
7c673cae
FG
6014 }
6015 }
6016 }
6017 if (f) {
6018 f->close_section();
6019 f->flush(ds);
6020 }
6021 rdata.append(ds);
f67539c2
TL
6022 } else if (prefix == "osd blocklist ls" ||
6023 prefix == "osd blacklist ls") {
7c673cae 6024 if (f)
f67539c2 6025 f->open_array_section("blocklist");
7c673cae 6026
f67539c2
TL
6027 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin();
6028 p != osdmap.blocklist.end();
7c673cae
FG
6029 ++p) {
6030 if (f) {
6031 f->open_object_section("entry");
11fdf7f2 6032 f->dump_string("addr", p->first.get_legacy_str());
7c673cae
FG
6033 f->dump_stream("until") << p->second;
6034 f->close_section();
6035 } else {
6036 stringstream ss;
6037 string s;
6038 ss << p->first << " " << p->second;
6039 getline(ss, s);
6040 s += "\n";
6041 rdata.append(s);
6042 }
6043 }
6044 if (f) {
6045 f->close_section();
6046 f->flush(rdata);
6047 }
33c7a0ef
TL
6048 if (f)
6049 f->open_array_section("range_blocklist");
6050
6051 for (auto p = osdmap.range_blocklist.begin();
6052 p != osdmap.range_blocklist.end();
6053 ++p) {
6054 if (f) {
6055 f->open_object_section("entry");
6056 f->dump_string("range", p->first.get_legacy_str());
6057 f->dump_stream("until") << p->second;
6058 f->close_section();
6059 } else {
6060 stringstream ss;
6061 string s;
6062 ss << p->first << " " << p->second;
6063 getline(ss, s);
6064 s += "\n";
6065 rdata.append(s);
6066 }
6067 }
6068 if (f) {
6069 f->close_section();
6070 f->flush(rdata);
6071 }
6072 ss << "listed " << osdmap.blocklist.size() + osdmap.range_blocklist.size() << " entries";
7c673cae
FG
6073
6074 } else if (prefix == "osd pool ls") {
6075 string detail;
9f95a23c 6076 cmd_getval(cmdmap, "detail", detail);
7c673cae
FG
6077 if (!f && detail == "detail") {
6078 ostringstream ss;
1e59de90 6079 osdmap.print_pools(cct, ss);
7c673cae
FG
6080 rdata.append(ss.str());
6081 } else {
6082 if (f)
6083 f->open_array_section("pools");
1e59de90 6084 for (auto &[pid, pdata] : osdmap.get_pools()) {
7c673cae
FG
6085 if (f) {
6086 if (detail == "detail") {
6087 f->open_object_section("pool");
1e59de90
TL
6088 f->dump_int("pool_id", pid);
6089 f->dump_string("pool_name", osdmap.get_pool_name(pid));
6090 pdata.dump(f.get());
6091 osdmap.dump_read_balance_score(cct, pid, pdata, f.get());
7c673cae
FG
6092 f->close_section();
6093 } else {
1e59de90 6094 f->dump_string("pool_name", osdmap.get_pool_name(pid));
7c673cae
FG
6095 }
6096 } else {
1e59de90 6097 rdata.append(osdmap.get_pool_name(pid) + "\n");
7c673cae
FG
6098 }
6099 }
6100 if (f) {
6101 f->close_section();
6102 f->flush(rdata);
6103 }
6104 }
6105
6106 } else if (prefix == "osd crush get-tunable") {
6107 string tunable;
9f95a23c 6108 cmd_getval(cmdmap, "tunable", tunable);
7c673cae
FG
6109 ostringstream rss;
6110 if (f)
6111 f->open_object_section("tunable");
6112 if (tunable == "straw_calc_version") {
6113 if (f)
6114 f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
6115 else
6116 rss << osdmap.crush->get_straw_calc_version() << "\n";
6117 } else {
6118 r = -EINVAL;
6119 goto reply;
6120 }
6121 if (f) {
6122 f->close_section();
6123 f->flush(rdata);
6124 } else {
6125 rdata.append(rss.str());
6126 }
6127 r = 0;
6128
6129 } else if (prefix == "osd pool get") {
6130 string poolstr;
9f95a23c 6131 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
6132 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
6133 if (pool < 0) {
6134 ss << "unrecognized pool '" << poolstr << "'";
6135 r = -ENOENT;
6136 goto reply;
6137 }
6138
6139 const pg_pool_t *p = osdmap.get_pg_pool(pool);
6140 string var;
9f95a23c 6141 cmd_getval(cmdmap, "var", var);
7c673cae
FG
6142
6143 typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
6144 const choices_map_t ALL_CHOICES = {
6145 {"size", SIZE},
6146 {"min_size", MIN_SIZE},
7c673cae 6147 {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
20effc67
TL
6148 {"crush_rule", CRUSH_RULE},
6149 {"hashpspool", HASHPSPOOL},
6150 {"eio", POOL_EIO},
28e407b8 6151 {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
7c673cae
FG
6152 {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
6153 {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
6154 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
6155 {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
6156 {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
6157 {"use_gmt_hitset", USE_GMT_HITSET},
11fdf7f2 6158 {"target_max_objects", TARGET_MAX_OBJECTS},
7c673cae
FG
6159 {"target_max_bytes", TARGET_MAX_BYTES},
6160 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
6161 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
6162 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
6163 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
6164 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
6165 {"erasure_code_profile", ERASURE_CODE_PROFILE},
6166 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
6167 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
6168 {"fast_read", FAST_READ},
6169 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
6170 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
6171 {"scrub_min_interval", SCRUB_MIN_INTERVAL},
6172 {"scrub_max_interval", SCRUB_MAX_INTERVAL},
6173 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
6174 {"recovery_priority", RECOVERY_PRIORITY},
6175 {"recovery_op_priority", RECOVERY_OP_PRIORITY},
6176 {"scrub_priority", SCRUB_PRIORITY},
6177 {"compression_mode", COMPRESSION_MODE},
6178 {"compression_algorithm", COMPRESSION_ALGORITHM},
6179 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
6180 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
6181 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
6182 {"csum_type", CSUM_TYPE},
6183 {"csum_max_block", CSUM_MAX_BLOCK},
6184 {"csum_min_block", CSUM_MIN_BLOCK},
11fdf7f2
TL
6185 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM},
6186 {"pg_autoscale_mode", PG_AUTOSCALE_MODE},
6187 {"pg_num_min", PG_NUM_MIN},
20effc67 6188 {"pg_num_max", PG_NUM_MAX},
11fdf7f2
TL
6189 {"target_size_bytes", TARGET_SIZE_BYTES},
6190 {"target_size_ratio", TARGET_SIZE_RATIO},
6191 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS},
f67539c2
TL
6192 {"dedup_tier", DEDUP_TIER},
6193 {"dedup_chunk_algorithm", DEDUP_CHUNK_ALGORITHM},
6194 {"dedup_cdc_chunk_size", DEDUP_CDC_CHUNK_SIZE},
20effc67 6195 {"bulk", BULK}
7c673cae
FG
6196 };
6197
6198 typedef std::set<osd_pool_get_choices> choices_set_t;
6199
6200 const choices_set_t ONLY_TIER_CHOICES = {
6201 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
6202 TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
6203 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
6204 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
6205 MIN_READ_RECENCY_FOR_PROMOTE,
c07f9fc5 6206 MIN_WRITE_RECENCY_FOR_PROMOTE,
7c673cae
FG
6207 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
6208 };
6209 const choices_set_t ONLY_ERASURE_CHOICES = {
28e407b8 6210 EC_OVERWRITES, ERASURE_CODE_PROFILE
7c673cae
FG
6211 };
6212
6213 choices_set_t selected_choices;
6214 if (var == "all") {
6215 for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
6216 it != ALL_CHOICES.end(); ++it) {
6217 selected_choices.insert(it->second);
6218 }
6219
6220 if(!p->is_tier()) {
6221 selected_choices = subtract_second_from_first(selected_choices,
6222 ONLY_TIER_CHOICES);
6223 }
6224
6225 if(!p->is_erasure()) {
6226 selected_choices = subtract_second_from_first(selected_choices,
6227 ONLY_ERASURE_CHOICES);
6228 }
6229 } else /* var != "all" */ {
6230 choices_map_t::const_iterator found = ALL_CHOICES.find(var);
522d829b
TL
6231 if (found == ALL_CHOICES.end()) {
6232 ss << "pool '" << poolstr
6233 << "': invalid variable: '" << var << "'";
6234 r = -EINVAL;
6235 goto reply;
6236 }
6237
7c673cae
FG
6238 osd_pool_get_choices selected = found->second;
6239
6240 if (!p->is_tier() &&
6241 ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
6242 ss << "pool '" << poolstr
6243 << "' is not a tier pool: variable not applicable";
6244 r = -EACCES;
6245 goto reply;
6246 }
6247
6248 if (!p->is_erasure() &&
6249 ONLY_ERASURE_CHOICES.find(selected)
6250 != ONLY_ERASURE_CHOICES.end()) {
6251 ss << "pool '" << poolstr
6252 << "' is not a erasure pool: variable not applicable";
6253 r = -EACCES;
6254 goto reply;
6255 }
6256
94b18763
FG
6257 if (pool_opts_t::is_opt_name(var) &&
6258 !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
6259 ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
6260 r = -ENOENT;
6261 goto reply;
6262 }
6263
7c673cae
FG
6264 selected_choices.insert(selected);
6265 }
6266
6267 if (f) {
94b18763
FG
6268 f->open_object_section("pool");
6269 f->dump_string("pool", poolstr);
6270 f->dump_int("pool_id", pool);
7c673cae
FG
6271 for(choices_set_t::const_iterator it = selected_choices.begin();
6272 it != selected_choices.end(); ++it) {
6273 choices_map_t::const_iterator i;
c07f9fc5
FG
6274 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6275 if (i->second == *it) {
6276 break;
6277 }
6278 }
11fdf7f2 6279 ceph_assert(i != ALL_CHOICES.end());
7c673cae
FG
6280 switch(*it) {
6281 case PG_NUM:
6282 f->dump_int("pg_num", p->get_pg_num());
6283 break;
6284 case PGP_NUM:
6285 f->dump_int("pgp_num", p->get_pgp_num());
6286 break;
7c673cae
FG
6287 case SIZE:
6288 f->dump_int("size", p->get_size());
6289 break;
6290 case MIN_SIZE:
6291 f->dump_int("min_size", p->get_min_size());
6292 break;
7c673cae 6293 case CRUSH_RULE:
31f18b77 6294 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
7c673cae 6295 f->dump_string("crush_rule", osdmap.crush->get_rule_name(
31f18b77 6296 p->get_crush_rule()));
7c673cae 6297 } else {
31f18b77 6298 f->dump_string("crush_rule", stringify(p->get_crush_rule()));
7c673cae
FG
6299 }
6300 break;
28e407b8
AA
6301 case EC_OVERWRITES:
6302 f->dump_bool("allow_ec_overwrites",
6303 p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
6304 break;
11fdf7f2
TL
6305 case PG_AUTOSCALE_MODE:
6306 f->dump_string("pg_autoscale_mode",
6307 pg_pool_t::get_pg_autoscale_mode_name(
6308 p->pg_autoscale_mode));
6309 break;
7c673cae 6310 case HASHPSPOOL:
20effc67 6311 case POOL_EIO:
7c673cae 6312 case NODELETE:
20effc67 6313 case BULK:
7c673cae
FG
6314 case NOPGCHANGE:
6315 case NOSIZECHANGE:
6316 case WRITE_FADVISE_DONTNEED:
6317 case NOSCRUB:
6318 case NODEEP_SCRUB:
94b18763
FG
6319 f->dump_bool(i->first.c_str(),
6320 p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
7c673cae
FG
6321 break;
6322 case HIT_SET_PERIOD:
6323 f->dump_int("hit_set_period", p->hit_set_period);
6324 break;
6325 case HIT_SET_COUNT:
6326 f->dump_int("hit_set_count", p->hit_set_count);
6327 break;
6328 case HIT_SET_TYPE:
6329 f->dump_string("hit_set_type",
6330 HitSet::get_type_name(p->hit_set_params.get_type()));
6331 break;
6332 case HIT_SET_FPP:
6333 {
6334 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6335 BloomHitSet::Params *bloomp =
6336 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6337 f->dump_float("hit_set_fpp", bloomp->get_fpp());
6338 } else if(var != "all") {
6339 f->close_section();
6340 ss << "hit set is not of type Bloom; " <<
6341 "invalid to get a false positive rate!";
6342 r = -EINVAL;
6343 goto reply;
6344 }
6345 }
6346 break;
6347 case USE_GMT_HITSET:
6348 f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
6349 break;
6350 case TARGET_MAX_OBJECTS:
6351 f->dump_unsigned("target_max_objects", p->target_max_objects);
6352 break;
6353 case TARGET_MAX_BYTES:
6354 f->dump_unsigned("target_max_bytes", p->target_max_bytes);
6355 break;
6356 case CACHE_TARGET_DIRTY_RATIO:
6357 f->dump_unsigned("cache_target_dirty_ratio_micro",
6358 p->cache_target_dirty_ratio_micro);
6359 f->dump_float("cache_target_dirty_ratio",
6360 ((float)p->cache_target_dirty_ratio_micro/1000000));
6361 break;
6362 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6363 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
6364 p->cache_target_dirty_high_ratio_micro);
6365 f->dump_float("cache_target_dirty_high_ratio",
6366 ((float)p->cache_target_dirty_high_ratio_micro/1000000));
6367 break;
6368 case CACHE_TARGET_FULL_RATIO:
6369 f->dump_unsigned("cache_target_full_ratio_micro",
6370 p->cache_target_full_ratio_micro);
6371 f->dump_float("cache_target_full_ratio",
6372 ((float)p->cache_target_full_ratio_micro/1000000));
6373 break;
6374 case CACHE_MIN_FLUSH_AGE:
6375 f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
6376 break;
6377 case CACHE_MIN_EVICT_AGE:
6378 f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
6379 break;
6380 case ERASURE_CODE_PROFILE:
6381 f->dump_string("erasure_code_profile", p->erasure_code_profile);
6382 break;
6383 case MIN_READ_RECENCY_FOR_PROMOTE:
6384 f->dump_int("min_read_recency_for_promote",
6385 p->min_read_recency_for_promote);
6386 break;
6387 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6388 f->dump_int("min_write_recency_for_promote",
6389 p->min_write_recency_for_promote);
6390 break;
6391 case FAST_READ:
6392 f->dump_int("fast_read", p->fast_read);
6393 break;
6394 case HIT_SET_GRADE_DECAY_RATE:
6395 f->dump_int("hit_set_grade_decay_rate",
6396 p->hit_set_grade_decay_rate);
6397 break;
6398 case HIT_SET_SEARCH_LAST_N:
6399 f->dump_int("hit_set_search_last_n",
6400 p->hit_set_search_last_n);
6401 break;
6402 case SCRUB_MIN_INTERVAL:
6403 case SCRUB_MAX_INTERVAL:
6404 case DEEP_SCRUB_INTERVAL:
6405 case RECOVERY_PRIORITY:
6406 case RECOVERY_OP_PRIORITY:
6407 case SCRUB_PRIORITY:
6408 case COMPRESSION_MODE:
6409 case COMPRESSION_ALGORITHM:
6410 case COMPRESSION_REQUIRED_RATIO:
6411 case COMPRESSION_MAX_BLOB_SIZE:
6412 case COMPRESSION_MIN_BLOB_SIZE:
6413 case CSUM_TYPE:
6414 case CSUM_MAX_BLOCK:
6415 case CSUM_MIN_BLOCK:
11fdf7f2
TL
6416 case FINGERPRINT_ALGORITHM:
6417 case PG_NUM_MIN:
20effc67 6418 case PG_NUM_MAX:
11fdf7f2
TL
6419 case TARGET_SIZE_BYTES:
6420 case TARGET_SIZE_RATIO:
6421 case PG_AUTOSCALE_BIAS:
f67539c2
TL
6422 case DEDUP_TIER:
6423 case DEDUP_CHUNK_ALGORITHM:
6424 case DEDUP_CDC_CHUNK_SIZE:
c07f9fc5
FG
6425 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6426 if (p->opts.is_set(key)) {
c07f9fc5 6427 if(*it == CSUM_TYPE) {
11fdf7f2 6428 int64_t val;
c07f9fc5
FG
6429 p->opts.get(pool_opts_t::CSUM_TYPE, &val);
6430 f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
6431 } else {
6432 p->opts.dump(i->first, f.get());
6433 }
94b18763 6434 }
7c673cae
FG
6435 break;
6436 }
7c673cae 6437 }
94b18763
FG
6438 f->close_section();
6439 f->flush(rdata);
7c673cae
FG
6440 } else /* !f */ {
6441 for(choices_set_t::const_iterator it = selected_choices.begin();
6442 it != selected_choices.end(); ++it) {
6443 choices_map_t::const_iterator i;
6444 switch(*it) {
6445 case PG_NUM:
6446 ss << "pg_num: " << p->get_pg_num() << "\n";
6447 break;
6448 case PGP_NUM:
6449 ss << "pgp_num: " << p->get_pgp_num() << "\n";
6450 break;
7c673cae
FG
6451 case SIZE:
6452 ss << "size: " << p->get_size() << "\n";
6453 break;
6454 case MIN_SIZE:
6455 ss << "min_size: " << p->get_min_size() << "\n";
6456 break;
7c673cae 6457 case CRUSH_RULE:
31f18b77 6458 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
7c673cae 6459 ss << "crush_rule: " << osdmap.crush->get_rule_name(
31f18b77 6460 p->get_crush_rule()) << "\n";
7c673cae 6461 } else {
31f18b77 6462 ss << "crush_rule: " << p->get_crush_rule() << "\n";
7c673cae
FG
6463 }
6464 break;
11fdf7f2
TL
6465 case PG_AUTOSCALE_MODE:
6466 ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
6467 p->pg_autoscale_mode) <<"\n";
6468 break;
7c673cae
FG
6469 case HIT_SET_PERIOD:
6470 ss << "hit_set_period: " << p->hit_set_period << "\n";
6471 break;
6472 case HIT_SET_COUNT:
6473 ss << "hit_set_count: " << p->hit_set_count << "\n";
6474 break;
6475 case HIT_SET_TYPE:
6476 ss << "hit_set_type: " <<
6477 HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
6478 break;
6479 case HIT_SET_FPP:
6480 {
6481 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6482 BloomHitSet::Params *bloomp =
6483 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6484 ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
6485 } else if(var != "all") {
6486 ss << "hit set is not of type Bloom; " <<
6487 "invalid to get a false positive rate!";
6488 r = -EINVAL;
6489 goto reply;
6490 }
6491 }
6492 break;
6493 case USE_GMT_HITSET:
6494 ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
6495 break;
6496 case TARGET_MAX_OBJECTS:
6497 ss << "target_max_objects: " << p->target_max_objects << "\n";
6498 break;
6499 case TARGET_MAX_BYTES:
6500 ss << "target_max_bytes: " << p->target_max_bytes << "\n";
6501 break;
6502 case CACHE_TARGET_DIRTY_RATIO:
6503 ss << "cache_target_dirty_ratio: "
6504 << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
6505 break;
6506 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6507 ss << "cache_target_dirty_high_ratio: "
6508 << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
6509 break;
6510 case CACHE_TARGET_FULL_RATIO:
6511 ss << "cache_target_full_ratio: "
6512 << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
6513 break;
6514 case CACHE_MIN_FLUSH_AGE:
6515 ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
6516 break;
6517 case CACHE_MIN_EVICT_AGE:
6518 ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
6519 break;
6520 case ERASURE_CODE_PROFILE:
6521 ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
6522 break;
6523 case MIN_READ_RECENCY_FOR_PROMOTE:
6524 ss << "min_read_recency_for_promote: " <<
6525 p->min_read_recency_for_promote << "\n";
6526 break;
6527 case HIT_SET_GRADE_DECAY_RATE:
6528 ss << "hit_set_grade_decay_rate: " <<
6529 p->hit_set_grade_decay_rate << "\n";
6530 break;
6531 case HIT_SET_SEARCH_LAST_N:
6532 ss << "hit_set_search_last_n: " <<
6533 p->hit_set_search_last_n << "\n";
6534 break;
28e407b8
AA
6535 case EC_OVERWRITES:
6536 ss << "allow_ec_overwrites: " <<
6537 (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
6538 "\n";
6539 break;
7c673cae 6540 case HASHPSPOOL:
20effc67 6541 case POOL_EIO:
7c673cae 6542 case NODELETE:
20effc67 6543 case BULK:
7c673cae
FG
6544 case NOPGCHANGE:
6545 case NOSIZECHANGE:
6546 case WRITE_FADVISE_DONTNEED:
6547 case NOSCRUB:
6548 case NODEEP_SCRUB:
6549 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6550 if (i->second == *it)
6551 break;
6552 }
11fdf7f2 6553 ceph_assert(i != ALL_CHOICES.end());
7c673cae
FG
6554 ss << i->first << ": " <<
6555 (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
6556 "true" : "false") << "\n";
6557 break;
6558 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6559 ss << "min_write_recency_for_promote: " <<
6560 p->min_write_recency_for_promote << "\n";
6561 break;
6562 case FAST_READ:
6563 ss << "fast_read: " << p->fast_read << "\n";
6564 break;
6565 case SCRUB_MIN_INTERVAL:
6566 case SCRUB_MAX_INTERVAL:
6567 case DEEP_SCRUB_INTERVAL:
6568 case RECOVERY_PRIORITY:
6569 case RECOVERY_OP_PRIORITY:
6570 case SCRUB_PRIORITY:
6571 case COMPRESSION_MODE:
6572 case COMPRESSION_ALGORITHM:
6573 case COMPRESSION_REQUIRED_RATIO:
6574 case COMPRESSION_MAX_BLOB_SIZE:
6575 case COMPRESSION_MIN_BLOB_SIZE:
6576 case CSUM_TYPE:
6577 case CSUM_MAX_BLOCK:
6578 case CSUM_MIN_BLOCK:
11fdf7f2
TL
6579 case FINGERPRINT_ALGORITHM:
6580 case PG_NUM_MIN:
20effc67 6581 case PG_NUM_MAX:
11fdf7f2
TL
6582 case TARGET_SIZE_BYTES:
6583 case TARGET_SIZE_RATIO:
6584 case PG_AUTOSCALE_BIAS:
f67539c2
TL
6585 case DEDUP_TIER:
6586 case DEDUP_CHUNK_ALGORITHM:
6587 case DEDUP_CDC_CHUNK_SIZE:
7c673cae
FG
6588 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6589 if (i->second == *it)
6590 break;
6591 }
11fdf7f2 6592 ceph_assert(i != ALL_CHOICES.end());
7c673cae
FG
6593 {
6594 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6595 if (p->opts.is_set(key)) {
6596 if(key == pool_opts_t::CSUM_TYPE) {
11fdf7f2 6597 int64_t val;
7c673cae
FG
6598 p->opts.get(key, &val);
6599 ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
6600 } else {
6601 ss << i->first << ": " << p->opts.get(key) << "\n";
6602 }
6603 }
6604 }
6605 break;
6606 }
6607 rdata.append(ss.str());
6608 ss.str("");
6609 }
6610 }
6611 r = 0;
7c673cae
FG
6612 } else if (prefix == "osd pool get-quota") {
6613 string pool_name;
9f95a23c 6614 cmd_getval(cmdmap, "pool", pool_name);
7c673cae
FG
6615
6616 int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
6617 if (poolid < 0) {
11fdf7f2 6618 ceph_assert(poolid == -ENOENT);
7c673cae
FG
6619 ss << "unrecognized pool '" << pool_name << "'";
6620 r = -ENOENT;
6621 goto reply;
6622 }
6623 const pg_pool_t *p = osdmap.get_pg_pool(poolid);
f67539c2 6624 const pool_stat_t* pstat = mon.mgrstatmon()->get_pool_stat(poolid);
20effc67
TL
6625 if (!pstat) {
6626 ss << "no stats for pool '" << pool_name << "'";
6627 r = -ENOENT;
6628 goto reply;
6629 }
9f95a23c 6630 const object_stat_sum_t& sum = pstat->stats.sum;
7c673cae
FG
6631 if (f) {
6632 f->open_object_section("pool_quotas");
6633 f->dump_string("pool_name", pool_name);
6634 f->dump_unsigned("pool_id", poolid);
6635 f->dump_unsigned("quota_max_objects", p->quota_max_objects);
9f95a23c 6636 f->dump_int("current_num_objects", sum.num_objects);
7c673cae 6637 f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
9f95a23c 6638 f->dump_int("current_num_bytes", sum.num_bytes);
7c673cae
FG
6639 f->close_section();
6640 f->flush(rdata);
6641 } else {
6642 stringstream rs;
6643 rs << "quotas for pool '" << pool_name << "':\n"
6644 << " max objects: ";
6645 if (p->quota_max_objects == 0)
6646 rs << "N/A";
9f95a23c 6647 else {
1adf2230 6648 rs << si_u_t(p->quota_max_objects) << " objects";
9f95a23c
TL
6649 rs << " (current num objects: " << sum.num_objects << " objects)";
6650 }
7c673cae
FG
6651 rs << "\n"
6652 << " max bytes : ";
6653 if (p->quota_max_bytes == 0)
6654 rs << "N/A";
9f95a23c 6655 else {
1adf2230 6656 rs << byte_u_t(p->quota_max_bytes);
9f95a23c
TL
6657 rs << " (current num bytes: " << sum.num_bytes << " bytes)";
6658 }
7c673cae
FG
6659 rdata.append(rs.str());
6660 }
6661 rdata.append("\n");
6662 r = 0;
6663 } else if (prefix == "osd crush rule list" ||
6664 prefix == "osd crush rule ls") {
c07f9fc5
FG
6665 if (f) {
6666 f->open_array_section("rules");
6667 osdmap.crush->list_rules(f.get());
6668 f->close_section();
6669 f->flush(rdata);
6670 } else {
6671 ostringstream ss;
6672 osdmap.crush->list_rules(&ss);
6673 rdata.append(ss.str());
6674 }
b5b8bbf5
FG
6675 } else if (prefix == "osd crush rule ls-by-class") {
6676 string class_name;
9f95a23c 6677 cmd_getval(cmdmap, "class", class_name);
b5b8bbf5
FG
6678 if (class_name.empty()) {
6679 ss << "no class specified";
6680 r = -EINVAL;
6681 goto reply;
6682 }
6683 set<int> rules;
6684 r = osdmap.crush->get_rules_by_class(class_name, &rules);
6685 if (r < 0) {
6686 ss << "failed to get rules by class '" << class_name << "'";
6687 goto reply;
6688 }
6689 if (f) {
6690 f->open_array_section("rules");
6691 for (auto &rule: rules) {
6692 f->dump_string("name", osdmap.crush->get_rule_name(rule));
6693 }
6694 f->close_section();
6695 f->flush(rdata);
6696 } else {
6697 ostringstream rs;
6698 for (auto &rule: rules) {
6699 rs << osdmap.crush->get_rule_name(rule) << "\n";
6700 }
6701 rdata.append(rs.str());
6702 }
7c673cae
FG
6703 } else if (prefix == "osd crush rule dump") {
6704 string name;
9f95a23c 6705 cmd_getval(cmdmap, "name", name);
7c673cae 6706 string format;
9f95a23c 6707 cmd_getval(cmdmap, "format", format);
7c673cae
FG
6708 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6709 if (name == "") {
6710 f->open_array_section("rules");
6711 osdmap.crush->dump_rules(f.get());
6712 f->close_section();
6713 } else {
6714 int ruleno = osdmap.crush->get_rule_id(name);
6715 if (ruleno < 0) {
31f18b77 6716 ss << "unknown crush rule '" << name << "'";
7c673cae
FG
6717 r = ruleno;
6718 goto reply;
6719 }
6720 osdmap.crush->dump_rule(ruleno, f.get());
6721 }
6722 ostringstream rs;
6723 f->flush(rs);
6724 rs << "\n";
6725 rdata.append(rs.str());
6726 } else if (prefix == "osd crush dump") {
6727 string format;
9f95a23c 6728 cmd_getval(cmdmap, "format", format);
7c673cae
FG
6729 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6730 f->open_object_section("crush_map");
6731 osdmap.crush->dump(f.get());
6732 f->close_section();
6733 ostringstream rs;
6734 f->flush(rs);
6735 rs << "\n";
6736 rdata.append(rs.str());
6737 } else if (prefix == "osd crush show-tunables") {
6738 string format;
9f95a23c 6739 cmd_getval(cmdmap, "format", format);
7c673cae
FG
6740 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6741 f->open_object_section("crush_map_tunables");
6742 osdmap.crush->dump_tunables(f.get());
6743 f->close_section();
6744 ostringstream rs;
6745 f->flush(rs);
6746 rs << "\n";
6747 rdata.append(rs.str());
6748 } else if (prefix == "osd crush tree") {
20effc67
TL
6749 bool show_shadow = false;
6750 if (!cmd_getval_compat_cephbool(cmdmap, "show_shadow", show_shadow)) {
6751 std::string shadow;
6752 if (cmd_getval(cmdmap, "shadow", shadow) &&
6753 shadow == "--show-shadow") {
6754 show_shadow = true;
6755 }
6756 }
c07f9fc5
FG
6757 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6758 if (f) {
91327a77 6759 f->open_object_section("crush_tree");
c07f9fc5
FG
6760 osdmap.crush->dump_tree(nullptr,
6761 f.get(),
6762 osdmap.get_pool_names(),
6763 show_shadow);
91327a77 6764 f->close_section();
c07f9fc5
FG
6765 f->flush(rdata);
6766 } else {
6767 ostringstream ss;
6768 osdmap.crush->dump_tree(&ss,
6769 nullptr,
6770 osdmap.get_pool_names(),
6771 show_shadow);
6772 rdata.append(ss.str());
6773 }
d2e6a577
FG
6774 } else if (prefix == "osd crush ls") {
6775 string name;
9f95a23c 6776 if (!cmd_getval(cmdmap, "node", name)) {
d2e6a577
FG
6777 ss << "no node specified";
6778 r = -EINVAL;
6779 goto reply;
6780 }
6781 if (!osdmap.crush->name_exists(name)) {
6782 ss << "node '" << name << "' does not exist";
6783 r = -ENOENT;
6784 goto reply;
6785 }
6786 int id = osdmap.crush->get_item_id(name);
6787 list<int> result;
6788 if (id >= 0) {
6789 result.push_back(id);
6790 } else {
6791 int num = osdmap.crush->get_bucket_size(id);
6792 for (int i = 0; i < num; ++i) {
6793 result.push_back(osdmap.crush->get_bucket_item(id, i));
6794 }
6795 }
6796 if (f) {
6797 f->open_array_section("items");
6798 for (auto i : result) {
6799 f->dump_string("item", osdmap.crush->get_item_name(i));
6800 }
6801 f->close_section();
6802 f->flush(rdata);
6803 } else {
6804 ostringstream ss;
6805 for (auto i : result) {
6806 ss << osdmap.crush->get_item_name(i) << "\n";
6807 }
6808 rdata.append(ss.str());
6809 }
6810 r = 0;
7c673cae
FG
6811 } else if (prefix == "osd crush class ls") {
6812 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6813 f->open_array_section("crush_classes");
6814 for (auto i : osdmap.crush->class_name)
6815 f->dump_string("class", i.second);
6816 f->close_section();
6817 f->flush(rdata);
224ce89b
WB
6818 } else if (prefix == "osd crush class ls-osd") {
6819 string name;
9f95a23c 6820 cmd_getval(cmdmap, "class", name);
224ce89b
WB
6821 set<int> osds;
6822 osdmap.crush->get_devices_by_class(name, &osds);
b5b8bbf5
FG
6823 if (f) {
6824 f->open_array_section("osds");
6825 for (auto &osd: osds)
6826 f->dump_int("osd", osd);
6827 f->close_section();
6828 f->flush(rdata);
6829 } else {
6830 bool first = true;
6831 for (auto &osd : osds) {
6832 if (!first)
6833 ds << "\n";
6834 first = false;
6835 ds << osd;
6836 }
6837 rdata.append(ds);
6838 }
11fdf7f2
TL
6839 } else if (prefix == "osd crush get-device-class") {
6840 vector<string> idvec;
9f95a23c 6841 cmd_getval(cmdmap, "ids", idvec);
11fdf7f2
TL
6842 map<int, string> class_by_osd;
6843 for (auto& id : idvec) {
6844 ostringstream ts;
6845 long osd = parse_osd_id(id.c_str(), &ts);
6846 if (osd < 0) {
6847 ss << "unable to parse osd id:'" << id << "'";
6848 r = -EINVAL;
6849 goto reply;
6850 }
6851 auto device_class = osdmap.crush->get_item_class(osd);
6852 if (device_class)
6853 class_by_osd[osd] = device_class;
6854 else
6855 class_by_osd[osd] = ""; // no class
6856 }
6857 if (f) {
6858 f->open_array_section("osd_device_classes");
6859 for (auto& i : class_by_osd) {
6860 f->open_object_section("osd_device_class");
6861 f->dump_int("osd", i.first);
6862 f->dump_string("device_class", i.second);
6863 f->close_section();
6864 }
6865 f->close_section();
6866 f->flush(rdata);
6867 } else {
6868 if (class_by_osd.size() == 1) {
6869 // for single input, make a clean output
6870 ds << class_by_osd.begin()->second;
6871 } else {
6872 // note that we do not group osds by class here
6873 for (auto it = class_by_osd.begin();
6874 it != class_by_osd.end();
6875 it++) {
6876 ds << "osd." << it->first << ' ' << it->second;
6877 if (next(it) != class_by_osd.end())
6878 ds << '\n';
6879 }
6880 }
6881 rdata.append(ds);
6882 }
7c673cae
FG
6883 } else if (prefix == "osd erasure-code-profile ls") {
6884 const auto &profiles = osdmap.get_erasure_code_profiles();
6885 if (f)
6886 f->open_array_section("erasure-code-profiles");
6887 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
6888 if (f)
6889 f->dump_string("profile", i->first.c_str());
6890 else
6891 rdata.append(i->first + "\n");
6892 }
6893 if (f) {
6894 f->close_section();
6895 ostringstream rs;
6896 f->flush(rs);
6897 rs << "\n";
6898 rdata.append(rs.str());
6899 }
c07f9fc5
FG
6900 } else if (prefix == "osd crush weight-set ls") {
6901 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6902 if (f) {
6903 f->open_array_section("weight_sets");
6904 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6905 f->dump_string("pool", "(compat)");
6906 }
6907 for (auto& i : osdmap.crush->choose_args) {
6908 if (i.first >= 0) {
6909 f->dump_string("pool", osdmap.get_pool_name(i.first));
6910 }
6911 }
6912 f->close_section();
6913 f->flush(rdata);
6914 } else {
6915 ostringstream rs;
6916 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6917 rs << "(compat)\n";
6918 }
6919 for (auto& i : osdmap.crush->choose_args) {
6920 if (i.first >= 0) {
6921 rs << osdmap.get_pool_name(i.first) << "\n";
6922 }
6923 }
6924 rdata.append(rs.str());
6925 }
6926 } else if (prefix == "osd crush weight-set dump") {
6927 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6928 "json-pretty"));
6929 osdmap.crush->dump_choose_args(f.get());
6930 f->flush(rdata);
7c673cae
FG
6931 } else if (prefix == "osd erasure-code-profile get") {
6932 string name;
9f95a23c 6933 cmd_getval(cmdmap, "name", name);
7c673cae
FG
6934 if (!osdmap.has_erasure_code_profile(name)) {
6935 ss << "unknown erasure code profile '" << name << "'";
6936 r = -ENOENT;
6937 goto reply;
6938 }
6939 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
6940 if (f)
6941 f->open_object_section("profile");
6942 for (map<string,string>::const_iterator i = profile.begin();
6943 i != profile.end();
6944 ++i) {
6945 if (f)
6946 f->dump_string(i->first.c_str(), i->second.c_str());
6947 else
6948 rdata.append(i->first + "=" + i->second + "\n");
6949 }
6950 if (f) {
6951 f->close_section();
6952 ostringstream rs;
6953 f->flush(rs);
6954 rs << "\n";
6955 rdata.append(rs.str());
6956 }
181888fb
FG
6957 } else if (prefix == "osd pool application get") {
6958 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6959 "json-pretty"));
6960 string pool_name;
9f95a23c 6961 cmd_getval(cmdmap, "pool", pool_name);
181888fb 6962 string app;
9f95a23c 6963 cmd_getval(cmdmap, "app", app);
181888fb 6964 string key;
9f95a23c 6965 cmd_getval(cmdmap, "key", key);
181888fb
FG
6966
6967 if (pool_name.empty()) {
6968 // all
6969 f->open_object_section("pools");
6970 for (const auto &pool : osdmap.pools) {
6971 std::string name("<unknown>");
6972 const auto &pni = osdmap.pool_name.find(pool.first);
6973 if (pni != osdmap.pool_name.end())
6974 name = pni->second;
6975 f->open_object_section(name.c_str());
6976 for (auto &app_pair : pool.second.application_metadata) {
6977 f->open_object_section(app_pair.first.c_str());
6978 for (auto &kv_pair : app_pair.second) {
6979 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6980 }
6981 f->close_section();
6982 }
6983 f->close_section(); // name
6984 }
6985 f->close_section(); // pools
6986 f->flush(rdata);
6987 } else {
6988 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6989 if (pool < 0) {
6990 ss << "unrecognized pool '" << pool_name << "'";
6991 r = -ENOENT;
6992 goto reply;
6993 }
6994 auto p = osdmap.get_pg_pool(pool);
6995 // filter by pool
6996 if (app.empty()) {
6997 f->open_object_section(pool_name.c_str());
6998 for (auto &app_pair : p->application_metadata) {
6999 f->open_object_section(app_pair.first.c_str());
7000 for (auto &kv_pair : app_pair.second) {
7001 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
7002 }
7003 f->close_section(); // application
7004 }
7005 f->close_section(); // pool_name
7006 f->flush(rdata);
7007 goto reply;
7008 }
7009
7010 auto app_it = p->application_metadata.find(app);
7011 if (app_it == p->application_metadata.end()) {
7012 ss << "pool '" << pool_name << "' has no application '" << app << "'";
7013 r = -ENOENT;
7014 goto reply;
7015 }
7016 // filter by pool + app
7017 if (key.empty()) {
7018 f->open_object_section(app_it->first.c_str());
7019 for (auto &kv_pair : app_it->second) {
7020 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
7021 }
7022 f->close_section(); // application
7023 f->flush(rdata);
7024 goto reply;
7025 }
7026 // filter by pool + app + key
7027 auto key_it = app_it->second.find(key);
7028 if (key_it == app_it->second.end()) {
7029 ss << "application '" << app << "' on pool '" << pool_name
7030 << "' does not have key '" << key << "'";
7031 r = -ENOENT;
7032 goto reply;
7033 }
7034 ss << key_it->second << "\n";
7035 rdata.append(ss.str());
7036 ss.str("");
7037 }
11fdf7f2 7038 } else if (prefix == "osd get-require-min-compat-client") {
9f95a23c 7039 ss << osdmap.require_min_compat_client << std::endl;
11fdf7f2
TL
7040 rdata.append(ss.str());
7041 ss.str("");
7042 goto reply;
7043 } else if (prefix == "osd pool application enable" ||
7044 prefix == "osd pool application disable" ||
7045 prefix == "osd pool application set" ||
7046 prefix == "osd pool application rm") {
7047 bool changed = false;
7048 r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed);
7049 if (r != 0) {
7050 // Error, reply.
7051 goto reply;
7052 } else if (changed) {
7053 // Valid mutation, proceed to prepare phase
7054 return false;
7055 } else {
7056 // Idempotent case, reply
7057 goto reply;
7058 }
7c673cae
FG
7059 } else {
7060 // try prepare update
7061 return false;
7062 }
7063
7064 reply:
7065 string rs;
7066 getline(ss, rs);
f67539c2 7067 mon.reply_command(op, r, rs, rdata, get_last_committed());
7c673cae
FG
7068 return true;
7069}
7070
3efd9988
FG
7071void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
7072{
7073 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
7074 osdmap.get_pg_pool(pool_id));
11fdf7f2 7075 ceph_assert(pool);
3efd9988
FG
7076 pool->set_flag(flags);
7077}
7078
7079void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
7c673cae 7080{
3efd9988
FG
7081 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
7082 osdmap.get_pg_pool(pool_id));
11fdf7f2 7083 ceph_assert(pool);
3efd9988 7084 pool->unset_flag(flags);
7c673cae
FG
7085}
7086
9f95a23c 7087string OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch)
11fdf7f2
TL
7088{
7089 char k[80];
9f95a23c 7090 snprintf(k, sizeof(k), "purged_epoch_%08lx", (unsigned long)epoch);
11fdf7f2
TL
7091 return k;
7092}
7093
9f95a23c 7094string OSDMonitor::make_purged_snap_key(int64_t pool, snapid_t snap)
11fdf7f2
TL
7095{
7096 char k[80];
9f95a23c 7097 snprintf(k, sizeof(k), "purged_snap_%llu_%016llx",
11fdf7f2
TL
7098 (unsigned long long)pool, (unsigned long long)snap);
7099 return k;
7100}
7101
9f95a23c 7102string OSDMonitor::make_purged_snap_key_value(
11fdf7f2
TL
7103 int64_t pool, snapid_t snap, snapid_t num,
7104 epoch_t epoch, bufferlist *v)
7105{
7106 // encode the *last* epoch in the key so that we can use forward
7107 // iteration only to search for an epoch in an interval.
7108 encode(snap, *v);
7109 encode(snap + num, *v);
7110 encode(epoch, *v);
9f95a23c 7111 return make_purged_snap_key(pool, snap + num - 1);
11fdf7f2
TL
7112}
7113
11fdf7f2 7114
9f95a23c
TL
// Find the on-disk purged-snap interval containing (pool, snap).
//
// Keys in the OSD_SNAP_PREFIX keyspace look like
// "purged_snap_<pool>_<last snap of interval>" and each value encodes the
// half-open interval [begin, end) plus an epoch (see
// make_purged_snap_key_value), so lower_bound on the key for 'snap' lands
// on the first candidate interval whose last snap is >= snap.
//
// Returns 0 and fills *begin/*end with the recorded interval on success;
// returns -ENOENT when no recorded interval covers the requested snap.
int OSDMonitor::lookup_purged_snap(
  int64_t pool, snapid_t snap,
  snapid_t *begin, snapid_t *end)
{
  string k = make_purged_snap_key(pool, snap);
  auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
  it->lower_bound(k);
  if (!it->valid()) {
    // ran off the end of the keyspace: nothing at or after this snap
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' not found" << dendl;
    return -ENOENT;
  }
  if (it->key().find("purged_snap_") != 0) {
    // landed on a record of some other type within the same prefix
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << it->key()
	     << "', wrong prefix" << dendl;
    return -ENOENT;
  }
  // parse the pool id back out of the key we actually landed on
  string gotk = it->key();
  const char *format = "purged_snap_%llu_";
  long long int keypool;
  int n = sscanf(gotk.c_str(), format, &keypool);
  if (n != 1) {
    derr << __func__ << " invalid k '" << gotk << "'" << dendl;
    return -ENOENT;
  }
  if (pool != keypool) {
    // the next purged_snap_* record belongs to a different pool
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << gotk
	     << "', wrong pool " << keypool
	     << dendl;
    return -ENOENT;
  }
  // decode [begin, end) from the value and verify it actually covers snap
  bufferlist v = it->value();
  auto p = v.cbegin();
  decode(*begin, p);
  decode(*end, p);
  if (snap < *begin || snap >= *end) {
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - found [" << *begin << "," << *end << "), no overlap"
	     << dendl;
    return -ENOENT;
  }
  return 0;
}
7164
9f95a23c
TL
// Record that snaps [start, end) of 'pool' have been purged, coalescing the
// new interval with any adjacent intervals already present in the store.
// Neighbors are probed with lookup_purged_snap() at start-1 and at end:
//   - both exist:        bridge into one [before_begin, after_end) record
//   - only earlier one:  extend it forward to 'end'
//   - only later one:    extend it backward to 'start'
//   - neither:           insert a fresh [start, end) record
// All writes go into transaction 't'.  Note that the new records are
// stamped with pending_inc.epoch; the 'epoch' parameter is not referenced
// in this body.
void OSDMonitor::insert_purged_snap_update(
  int64_t pool,
  snapid_t start, snapid_t end,
  epoch_t epoch,
  MonitorDBStore::TransactionRef t)
{
  snapid_t before_begin, before_end;
  snapid_t after_begin, after_end;
  // b == 0 when an interval containing start-1 exists (adjacent on the left);
  // a == 0 when an interval containing end exists (adjacent on the right).
  int b = lookup_purged_snap(pool, start - 1,
			     &before_begin, &before_end);
  int a = lookup_purged_snap(pool, end,
			     &after_begin, &after_end);
  if (!b && !a) {
    dout(10) << __func__
	     << " [" << start << "," << end << ") - joins ["
	     << before_begin << "," << before_end << ") and ["
	     << after_begin << "," << after_end << ")" << dendl;
    // erase only the begin record; we'll overwrite the end one.
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, after_end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!b) {
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with earlier ["
	     << before_begin << "," << before_end << ")" << dendl;
    // the earlier record is keyed by its old last snap; replace it with one
    // keyed by the new last snap (end - 1)
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!a) {
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with later ["
	     << after_begin << "," << after_end << ")" << dendl;
    // overwrite after record
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, after_end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else {
    dout(10) << __func__
	     << " [" << start << "," << end << ") - new"
	     << dendl;
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  }
}
7220
11fdf7f2
TL
// Move snap intervals that the OSDs (via the mgr stat digest) report as
// purged into pending_inc.new_purged_snaps, bounded per epoch by
// mon_max_snap_prune_per_epoch.  Returns true iff anything was actually
// queued for pruning this round.
bool OSDMonitor::try_prune_purged_snaps()
{
  // the purged-snaps digest comes from the mgr stat monitor
  if (!mon.mgrstatmon()->is_readable()) {
    return false;
  }
  if (!pending_inc.new_purged_snaps.empty()) {
    return false; // we already pruned for this epoch
  }

  unsigned max_prune = cct->_conf.get_val<uint64_t>(
    "mon_max_snap_prune_per_epoch");
  if (!max_prune) {
    max_prune = 100000;  // 0 means "unlimited"; use a large finite budget
  }
  dout(10) << __func__ << " max_prune " << max_prune << dendl;

  unsigned actually_pruned = 0;
  auto& purged_snaps = mon.mgrstatmon()->get_digest().purged_snaps;
  for (auto& p : osdmap.get_pools()) {
    auto q = purged_snaps.find(p.first);
    if (q == purged_snaps.end()) {
      continue;
    }
    auto& purged = q->second;
    if (purged.empty()) {
      dout(20) << __func__ << " " << p.first << " nothing purged" << dendl;
      continue;
    }
    dout(20) << __func__ << " pool " << p.first << " purged " << purged << dendl;
    snap_interval_set_t to_prune;
    // maybe_pruned tracks the budget optimistically, before intersecting
    // with removed_snaps_queue below
    unsigned maybe_pruned = actually_pruned;
    for (auto i = purged.begin(); i != purged.end(); ++i) {
      snapid_t begin = i.get_start();
      auto end = i.get_start() + i.get_len();
      snapid_t pbegin = 0, pend = 0;
      int r = lookup_purged_snap(p.first, begin, &pbegin, &pend);
      if (r == 0) {
	// already purged.
	// be a bit aggressive about backing off here, because the mon may
	// do a lot of work going through this set, and if we know the
	// purged set from the OSDs is at least *partly* stale we may as
	// well wait for it to be fresh.
	dout(20) << __func__ << " we've already purged " << pbegin
		 << "~" << (pend - pbegin) << dendl;
	break; // next pool
      }
      if (pbegin && pbegin > begin && pbegin < end) {
	// the tail of [begin,end) is purged; shorten the range
	end = pbegin;
      }
      to_prune.insert(begin, end - begin);
      maybe_pruned += end - begin;
      if (maybe_pruned >= max_prune) {
	break;
      }
    }
    if (!to_prune.empty()) {
      // PGs may still be reporting things as purged that we have already
      // pruned from removed_snaps_queue.
      snap_interval_set_t actual;
      auto r = osdmap.removed_snaps_queue.find(p.first);
      if (r != osdmap.removed_snaps_queue.end()) {
	actual.intersection_of(to_prune, r->second);
      }
      actually_pruned += actual.size();
      dout(10) << __func__ << " pool " << p.first << " reports pruned " << to_prune
	       << ", actual pruned " << actual << dendl;
      if (!actual.empty()) {
	pending_inc.new_purged_snaps[p.first].swap(actual);
      }
    }
    if (actually_pruned >= max_prune) {
      break;  // per-epoch budget exhausted
    }
  }
  dout(10) << __func__ << " actually pruned " << actually_pruned << dendl;
  return !!actually_pruned;
}
7299
7c673cae
FG
7300bool OSDMonitor::update_pools_status()
7301{
f67539c2 7302 if (!mon.mgrstatmon()->is_readable())
7c673cae
FG
7303 return false;
7304
7305 bool ret = false;
7306
7307 auto& pools = osdmap.get_pools();
7308 for (auto it = pools.begin(); it != pools.end(); ++it) {
f67539c2 7309 const pool_stat_t *pstat = mon.mgrstatmon()->get_pool_stat(it->first);
31f18b77 7310 if (!pstat)
7c673cae 7311 continue;
31f18b77 7312 const object_stat_sum_t& sum = pstat->stats.sum;
7c673cae
FG
7313 const pg_pool_t &pool = it->second;
7314 const string& pool_name = osdmap.get_pool_name(it->first);
7315
7316 bool pool_is_full =
7317 (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
7318 (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);
7319
11fdf7f2 7320 if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
7c673cae
FG
7321 if (pool_is_full)
7322 continue;
7323
f67539c2 7324 mon.clog->info() << "pool '" << pool_name
3efd9988
FG
7325 << "' no longer out of quota; removing NO_QUOTA flag";
7326 // below we cancel FLAG_FULL too, we'll set it again in
7327 // OSDMonitor::encode_pending if it still fails the osd-full checking.
7328 clear_pool_flags(it->first,
11fdf7f2 7329 pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
7c673cae
FG
7330 ret = true;
7331 } else {
7332 if (!pool_is_full)
7333 continue;
7334
7335 if (pool.quota_max_bytes > 0 &&
7336 (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
f67539c2 7337 mon.clog->warn() << "pool '" << pool_name << "' is full"
7c673cae 7338 << " (reached quota's max_bytes: "
1adf2230 7339 << byte_u_t(pool.quota_max_bytes) << ")";
7c673cae
FG
7340 }
7341 if (pool.quota_max_objects > 0 &&
7342 (uint64_t)sum.num_objects >= pool.quota_max_objects) {
f67539c2 7343 mon.clog->warn() << "pool '" << pool_name << "' is full"
7c673cae
FG
7344 << " (reached quota's max_objects: "
7345 << pool.quota_max_objects << ")";
7346 }
11fdf7f2 7347 // set both FLAG_FULL_QUOTA and FLAG_FULL
3efd9988
FG
7348 // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
7349 // since FLAG_FULL should always take precedence
7350 set_pool_flags(it->first,
11fdf7f2 7351 pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
3efd9988
FG
7352 clear_pool_flags(it->first,
7353 pg_pool_t::FLAG_NEARFULL |
7354 pg_pool_t::FLAG_BACKFILLFULL);
7c673cae
FG
7355 ret = true;
7356 }
7357 }
7358 return ret;
7359}
7360
7c673cae
FG
7361int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
7362{
7363 op->mark_osdmon_event(__func__);
9f95a23c 7364 auto m = op->get_req<MPoolOp>();
7c673cae 7365 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
11fdf7f2 7366 MonSession *session = op->get_session();
7c673cae
FG
7367 if (!session)
7368 return -EPERM;
7369 string erasure_code_profile;
7370 stringstream ss;
31f18b77 7371 string rule_name;
20effc67 7372 bool bulk = false;
94b18763 7373 int ret = 0;
11fdf7f2 7374 ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
20effc67 7375 0, 0, 0, 0, 0, 0, 0.0,
11fdf7f2 7376 erasure_code_profile,
20effc67 7377 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, {}, bulk,
1e59de90 7378 cct->_conf.get_val<bool>("osd_pool_default_crimson"),
9f95a23c 7379 &ss);
94b18763
FG
7380
7381 if (ret < 0) {
7382 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
7383 }
7384 return ret;
7c673cae
FG
7385}
7386
7387int OSDMonitor::crush_rename_bucket(const string& srcname,
7388 const string& dstname,
7389 ostream *ss)
7390{
7391 int ret;
7392 //
7393 // Avoid creating a pending crush if it does not already exists and
7394 // the rename would fail.
7395 //
7396 if (!_have_pending_crush()) {
7397 ret = _get_stable_crush().can_rename_bucket(srcname,
7398 dstname,
7399 ss);
7400 if (ret)
7401 return ret;
7402 }
7403
20effc67 7404 CrushWrapper newcrush = _get_pending_crush();
7c673cae
FG
7405
7406 ret = newcrush.rename_bucket(srcname,
7407 dstname,
7408 ss);
7409 if (ret)
7410 return ret;
7411
7412 pending_inc.crush.clear();
f67539c2 7413 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7c673cae
FG
7414 *ss << "renamed bucket " << srcname << " into " << dstname;
7415 return 0;
7416}
7417
7418void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
7419{
7420 string replacement = "";
7421
7422 if (plugin == "jerasure_generic" ||
7423 plugin == "jerasure_sse3" ||
7424 plugin == "jerasure_sse4" ||
7425 plugin == "jerasure_neon") {
7426 replacement = "jerasure";
7427 } else if (plugin == "shec_generic" ||
7428 plugin == "shec_sse3" ||
7429 plugin == "shec_sse4" ||
7430 plugin == "shec_neon") {
7431 replacement = "shec";
7432 }
7433
7434 if (replacement != "") {
7435 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
7436 << plugin << " that has been deprecated. Please use "
7437 << replacement << " instead." << dendl;
7438 }
7439}
7440
7441int OSDMonitor::normalize_profile(const string& profilename,
7442 ErasureCodeProfile &profile,
7443 bool force,
7444 ostream *ss)
7445{
7446 ErasureCodeInterfaceRef erasure_code;
7447 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
7448 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
7449 check_legacy_ec_plugin(plugin->second, profilename);
7450 int err = instance.factory(plugin->second,
11fdf7f2 7451 g_conf().get_val<std::string>("erasure_code_dir"),
7c673cae
FG
7452 profile, &erasure_code, ss);
7453 if (err) {
7454 return err;
7455 }
7456
7457 err = erasure_code->init(profile, ss);
7458 if (err) {
7459 return err;
7460 }
7461
7462 auto it = profile.find("stripe_unit");
7463 if (it != profile.end()) {
7464 string err_str;
20effc67 7465 uint32_t stripe_unit = strict_iecstrtoll(it->second, &err_str);
7c673cae
FG
7466 if (!err_str.empty()) {
7467 *ss << "could not parse stripe_unit '" << it->second
7468 << "': " << err_str << std::endl;
7469 return -EINVAL;
7470 }
7471 uint32_t data_chunks = erasure_code->get_data_chunk_count();
7472 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
7473 if (chunk_size != stripe_unit) {
7474 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
7475 << "alignment. Would be padded to " << chunk_size
7476 << std::endl;
7477 return -EINVAL;
7478 }
7479 if ((stripe_unit % 4096) != 0 && !force) {
7480 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
7481 << "use --force to override this check" << std::endl;
7482 return -EINVAL;
7483 }
7484 }
7485 return 0;
7486}
7487
// Create (or look up) the crush rule for an erasure coded pool built
// from the given EC profile.  On return *rule holds the rule id.
// Return codes follow a retry protocol used by prepare_pool_crush_rule:
//   -EEXIST  — rule already in the committed map (usable now)
//   -EALREADY — rule already in the pending map (wait for proposal)
//   0        — rule newly staged in pending_inc (wait for proposal)
int OSDMonitor::crush_rule_create_erasure(const string &name,
					  const string &profile,
					  int *rule,
					  ostream *ss)
{
  // Check the committed crush map first.
  int ruleid = osdmap.crush->get_rule_id(name);
  if (ruleid != -ENOENT) {
    *rule = ruleid;
    return -EEXIST;
  }

  CrushWrapper newcrush = _get_pending_crush();

  // Then the pending (not yet proposed) crush map.
  ruleid = newcrush.get_rule_id(name);
  if (ruleid != -ENOENT) {
    *rule = ruleid;
    return -EALREADY;
  } else {
    ErasureCodeInterfaceRef erasure_code;
    int err = get_erasure_code(profile, &erasure_code, ss);
    if (err) {
      *ss << "failed to load plugin using profile " << profile << std::endl;
      return err;
    }

    // Let the plugin build a rule matching its failure-domain needs;
    // create_rule() returns the new rule id on success.
    err = erasure_code->create_rule(name, newcrush, ss);
    erasure_code.reset();
    if (err < 0)
      return err;
    *rule = err;
    // Stage the updated crush map for the next proposal.
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
    return 0;
  }
}
7523
7524int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
7525 ErasureCodeInterfaceRef *erasure_code,
7526 ostream *ss) const
7527{
7528 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
7529 return -EAGAIN;
7530 ErasureCodeProfile profile =
7531 osdmap.get_erasure_code_profile(erasure_code_profile);
7532 ErasureCodeProfile::const_iterator plugin =
7533 profile.find("plugin");
7534 if (plugin == profile.end()) {
7535 *ss << "cannot determine the erasure code plugin"
7536 << " because there is no 'plugin' entry in the erasure_code_profile "
7537 << profile << std::endl;
7538 return -EINVAL;
7539 }
7540 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
f67539c2 7541 auto& instance = ErasureCodePluginRegistry::instance();
7c673cae 7542 return instance.factory(plugin->second,
11fdf7f2 7543 g_conf().get_val<std::string>("erasure_code_dir"),
7c673cae
FG
7544 profile, erasure_code, ss);
7545}
7546
7547int OSDMonitor::check_cluster_features(uint64_t features,
7548 stringstream &ss)
7549{
7550 stringstream unsupported_ss;
7551 int unsupported_count = 0;
f67539c2 7552 if ((mon.get_quorum_con_features() & features) != features) {
7c673cae
FG
7553 unsupported_ss << "the monitor cluster";
7554 ++unsupported_count;
7555 }
7556
7557 set<int32_t> up_osds;
7558 osdmap.get_up_osds(up_osds);
7559 for (set<int32_t>::iterator it = up_osds.begin();
7560 it != up_osds.end(); ++it) {
7561 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
7562 if ((xi.features & features) != features) {
7563 if (unsupported_count > 0)
7564 unsupported_ss << ", ";
7565 unsupported_ss << "osd." << *it;
7566 unsupported_count ++;
7567 }
7568 }
7569
7570 if (unsupported_count > 0) {
7571 ss << "features " << features << " unsupported by: "
7572 << unsupported_ss.str();
7573 return -ENOTSUP;
7574 }
7575
7576 // check pending osd state, too!
7577 for (map<int32_t,osd_xinfo_t>::const_iterator p =
7578 pending_inc.new_xinfo.begin();
7579 p != pending_inc.new_xinfo.end(); ++p) {
7580 const osd_xinfo_t &xi = p->second;
7581 if ((xi.features & features) != features) {
7582 dout(10) << __func__ << " pending osd." << p->first
7583 << " features are insufficient; retry" << dendl;
7584 return -EAGAIN;
7585 }
7586 }
7587
7588 return 0;
7589}
7590
7591bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
7592 stringstream& ss)
7593{
7594 OSDMap::Incremental new_pending = pending_inc;
f67539c2 7595 encode(*newcrush, new_pending.crush, mon.get_quorum_con_features());
7c673cae
FG
7596 OSDMap newmap;
7597 newmap.deepish_copy_from(osdmap);
7598 newmap.apply_incremental(new_pending);
7599
7600 // client compat
9f95a23c 7601 if (newmap.require_min_compat_client != ceph_release_t::unknown) {
7c673cae 7602 auto mv = newmap.get_min_compat_client();
31f18b77 7603 if (mv > newmap.require_min_compat_client) {
9f95a23c 7604 ss << "new crush map requires client version " << mv
7c673cae 7605 << " but require_min_compat_client is "
9f95a23c 7606 << newmap.require_min_compat_client;
7c673cae
FG
7607 return false;
7608 }
7609 }
7610
7611 // osd compat
7612 uint64_t features =
7613 newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
7614 newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
7615 stringstream features_ss;
7616 int r = check_cluster_features(features, features_ss);
7617 if (r) {
7618 ss << "Could not change CRUSH: " << features_ss.str();
7619 return false;
7620 }
7621
7622 return true;
7623}
7624
7625bool OSDMonitor::erasure_code_profile_in_use(
7626 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
7627 const string &profile,
7628 ostream *ss)
7629{
7630 bool found = false;
7631 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
7632 p != pools.end();
7633 ++p) {
11fdf7f2 7634 if (p->second.erasure_code_profile == profile && p->second.is_erasure()) {
7c673cae
FG
7635 *ss << osdmap.pool_name[p->first] << " ";
7636 found = true;
7637 }
7638 }
7639 if (found) {
7640 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
7641 }
7642 return found;
7643}
7644
7645int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
7646 map<string,string> *erasure_code_profile_map,
7647 ostream *ss)
7648{
11fdf7f2
TL
7649 int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile",
7650 get_json_str_map,
7651 *ss,
7652 erasure_code_profile_map,
7653 true);
7c673cae
FG
7654 if (r)
7655 return r;
11fdf7f2 7656 ceph_assert((*erasure_code_profile_map).count("plugin"));
7c673cae
FG
7657 string default_plugin = (*erasure_code_profile_map)["plugin"];
7658 map<string,string> user_map;
7659 for (vector<string>::const_iterator i = erasure_code_profile.begin();
7660 i != erasure_code_profile.end();
7661 ++i) {
7662 size_t equal = i->find('=');
7663 if (equal == string::npos) {
7664 user_map[*i] = string();
7665 (*erasure_code_profile_map)[*i] = string();
7666 } else {
11fdf7f2 7667 const string key = i->substr(0, equal);
7c673cae
FG
7668 equal++;
7669 const string value = i->substr(equal);
11fdf7f2
TL
7670 if (key.find("ruleset-") == 0) {
7671 *ss << "property '" << key << "' is no longer supported; try "
7672 << "'crush-" << key.substr(8) << "' instead";
7673 return -EINVAL;
3efd9988 7674 }
7c673cae
FG
7675 user_map[key] = value;
7676 (*erasure_code_profile_map)[key] = value;
7677 }
7678 }
7679
7680 if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
7681 (*erasure_code_profile_map) = user_map;
7682
7683 return 0;
7684}
7685
7686int OSDMonitor::prepare_pool_size(const unsigned pool_type,
7687 const string &erasure_code_profile,
11fdf7f2 7688 uint8_t repl_size,
7c673cae
FG
7689 unsigned *size, unsigned *min_size,
7690 ostream *ss)
7691{
7692 int err = 0;
f67539c2 7693 bool set_min_size = false;
7c673cae
FG
7694 switch (pool_type) {
7695 case pg_pool_t::TYPE_REPLICATED:
f67539c2
TL
7696 if (osdmap.stretch_mode_enabled) {
7697 if (repl_size == 0)
7698 repl_size = g_conf().get_val<uint64_t>("mon_stretch_pool_size");
7699 if (repl_size != g_conf().get_val<uint64_t>("mon_stretch_pool_size")) {
7700 *ss << "prepare_pool_size: we are in stretch mode but size "
7701 << repl_size << " does not match!";
7702 return -EINVAL;
7703 }
7704 *min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
7705 set_min_size = true;
7706 }
11fdf7f2
TL
7707 if (repl_size == 0) {
7708 repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
7709 }
7710 *size = repl_size;
f67539c2
TL
7711 if (!set_min_size)
7712 *min_size = g_conf().get_osd_pool_default_min_size(repl_size);
7c673cae
FG
7713 break;
7714 case pg_pool_t::TYPE_ERASURE:
7715 {
f67539c2
TL
7716 if (osdmap.stretch_mode_enabled) {
7717 *ss << "prepare_pool_size: we are in stretch mode; cannot create EC pools!";
7718 return -EINVAL;
7719 }
7c673cae
FG
7720 ErasureCodeInterfaceRef erasure_code;
7721 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
7722 if (err == 0) {
7723 *size = erasure_code->get_chunk_count();
11fdf7f2
TL
7724 *min_size =
7725 erasure_code->get_data_chunk_count() +
7726 std::min<int>(1, erasure_code->get_coding_chunk_count() - 1);
7727 assert(*min_size <= *size);
7728 assert(*min_size >= erasure_code->get_data_chunk_count());
7c673cae
FG
7729 }
7730 }
7731 break;
7732 default:
7733 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
7734 err = -EINVAL;
7735 break;
7736 }
7737 return err;
7738}
7739
7740int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
7741 const string &erasure_code_profile,
7742 uint32_t *stripe_width,
7743 ostream *ss)
7744{
7745 int err = 0;
7746 switch (pool_type) {
7747 case pg_pool_t::TYPE_REPLICATED:
7748 // ignored
7749 break;
7750 case pg_pool_t::TYPE_ERASURE:
7751 {
7752 ErasureCodeProfile profile =
7753 osdmap.get_erasure_code_profile(erasure_code_profile);
7754 ErasureCodeInterfaceRef erasure_code;
7755 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
7756 if (err)
7757 break;
7758 uint32_t data_chunks = erasure_code->get_data_chunk_count();
11fdf7f2 7759 uint32_t stripe_unit = g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit");
7c673cae
FG
7760 auto it = profile.find("stripe_unit");
7761 if (it != profile.end()) {
7762 string err_str;
20effc67 7763 stripe_unit = strict_iecstrtoll(it->second, &err_str);
11fdf7f2 7764 ceph_assert(err_str.empty());
7c673cae
FG
7765 }
7766 *stripe_width = data_chunks *
7767 erasure_code->get_chunk_size(stripe_unit * data_chunks);
7768 }
7769 break;
7770 default:
7771 *ss << "prepare_pool_stripe_width: "
7772 << pool_type << " is not a known pool type";
7773 err = -EINVAL;
7774 break;
7775 }
7776 return err;
7777}
7778
522d829b
TL
7779int OSDMonitor::get_replicated_stretch_crush_rule()
7780{
7781 /* we don't write down the stretch rule anywhere, so
7782 * we have to guess it. How? Look at all the pools
7783 * and count up how many times a given rule is used
7784 * on stretch pools and then return the one with
7785 * the most users!
7786 */
7787 map<int,int> rule_counts;
7788 for (const auto& pooli : osdmap.pools) {
7789 const pg_pool_t& p = pooli.second;
7790 if (p.is_replicated() && p.is_stretch_pool()) {
7791 if (!rule_counts.count(p.crush_rule)) {
7792 rule_counts[p.crush_rule] = 1;
7793 } else {
7794 ++rule_counts[p.crush_rule];
7795 }
7796 }
7797 }
7798
7799 if (rule_counts.empty()) {
7800 return -ENOENT;
7801 }
7802
7803 int most_used_count = 0;
7804 int most_used_rule = -1;
7805 for (auto i : rule_counts) {
7806 if (i.second > most_used_count) {
7807 most_used_rule = i.first;
7808 most_used_count = i.second;
7809 }
7810 }
7811 ceph_assert(most_used_count > 0);
7812 ceph_assert(most_used_rule >= 0);
7813 return most_used_rule;
7814}
7815
// Resolve the crush rule for a new pool.  On entry *crush_rule may be a
// caller-supplied rule id (>= 0, just validated for existence) or < 0,
// in which case the rule is looked up by rule_name or a default is
// chosen.  Returns 0 with *crush_rule set, -EAGAIN if the rule must
// first be proposed/committed, -ENOENT / -EINVAL on error.
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  if (osdmap.stretch_mode_enabled) {
	    // stretch mode: infer the rule from existing stretch pools
	    *crush_rule = get_replicated_stretch_crush_rule();
	  } else {
	    // Use default rule
	    *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_rule(cct);
	  }
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
		<< "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	// crush_rule_create_erasure's return codes drive a retry
	// protocol: 0/-EALREADY mean the rule is only pending, so the
	// caller must wait (-EAGAIN); -EEXIST means it is committed
	// and usable now (mapped to success).
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	switch (err) {
	case -EALREADY:
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
    }
  } else {
    // caller supplied an explicit rule id; just verify it exists
    if (!osdmap.crush->rule_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
7880
31f18b77 7881int OSDMonitor::get_crush_rule(const string &rule_name,
224ce89b
WB
7882 int *crush_rule,
7883 ostream *ss)
7c673cae
FG
7884{
7885 int ret;
31f18b77 7886 ret = osdmap.crush->get_rule_id(rule_name);
7c673cae
FG
7887 if (ret != -ENOENT) {
7888 // found it, use it
31f18b77 7889 *crush_rule = ret;
7c673cae 7890 } else {
20effc67 7891 CrushWrapper newcrush = _get_pending_crush();
7c673cae 7892
31f18b77 7893 ret = newcrush.get_rule_id(rule_name);
7c673cae
FG
7894 if (ret != -ENOENT) {
7895 // found it, wait for it to be proposed
31f18b77 7896 dout(20) << __func__ << ": rule " << rule_name
7c673cae
FG
7897 << " try again" << dendl;
7898 return -EAGAIN;
7899 } else {
224ce89b 7900 // Cannot find it , return error
31f18b77 7901 *ss << "specified rule " << rule_name << " doesn't exist";
7c673cae
FG
7902 return ret;
7903 }
7904 }
7905 return 0;
7906}
7907
39ae355f
TL
/*
* Get the number of 'in' osds according to the crush_rule,
*/
uint32_t OSDMonitor::get_osd_num_by_crush(int crush_rule)
{
  set<int> out_osds;
  set<int> crush_in_osds;
  set<int> roots;
  CrushWrapper newcrush = _get_pending_crush();
  // Collect every 'take' root the rule references, then union the OSD
  // leaves reachable from those roots.
  newcrush.find_takes_by_rule(crush_rule, &roots);
  for (auto root : roots) {
    const char *rootname = newcrush.get_item_name(root);
    set<int> crush_all_osds;
    newcrush.get_leaves(rootname, &crush_all_osds);
    // NOTE(review): out_osds is never populated in this function, so
    // this set_difference currently copies crush_all_osds unchanged —
    // presumably 'out' OSDs were meant to be subtracted; confirm intent.
    std::set_difference(crush_all_osds.begin(), crush_all_osds.end(),
			out_osds.begin(), out_osds.end(),
			std::inserter(crush_in_osds, crush_in_osds.end()));
  }
  return crush_in_osds.size();
}
7928
// Check whether creating/resizing a pool would push the cluster over
// mon_max_pg_per_osd.  'pool' is the pool id being changed, or < 0 for
// a brand-new pool; pg_num/size are the values being requested.  The
// projection only counts pools mapped by the same crush rule, divided
// by the number of 'in' OSDs under that rule.  Returns 0 or -ERANGE
// with an explanation in *ss.
int OSDMonitor::check_pg_num(int64_t pool,
			     int pg_num,
			     int size,
			     int crush_rule,
			     ostream *ss)
{
  auto max_pgs_per_osd = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
  uint64_t projected = 0;
  uint32_t osd_num_by_crush = 0;
  set<int64_t> crush_pool_ids;
  if (pool < 0) {
    // a new pool: it is not in osdmap.get_pools() yet, so account for
    // its PG replicas up front
    projected += pg_num * size;
  }

  osd_num_by_crush = get_osd_num_by_crush(crush_rule);
  osdmap.get_pool_ids_by_rule(crush_rule, &crush_pool_ids);

  for (const auto& [pool_id, pool_info] : osdmap.get_pools()) {
    // Check only for pools affected by crush rule
    if (crush_pool_ids.contains(pool_id)) {
      if (pool_id == pool) {
	// Specified pool, use given pg_num and size values.
	projected += pg_num * size;
      } else {
	// Use pg_num_target for evaluating the projected pg num
	projected += pool_info.get_pg_num_target() * pool_info.get_size();
      }
    }
  }
  // assume min cluster size 3
  osd_num_by_crush = std::max(osd_num_by_crush, 3u);
  auto projected_pgs_per_osd = projected / osd_num_by_crush;

  if (projected_pgs_per_osd > max_pgs_per_osd) {
    if (pool >= 0) {
      *ss << "pool id " << pool;
    }
    *ss << " pg_num " << pg_num
	<< " size " << size
	<< " for this pool would result in "
	<< projected_pgs_per_osd
	<< " cumulative PGs per OSD (" << projected
	<< " total PG replicas on " << osd_num_by_crush
	<< " 'in' root OSDs by crush rule) "
	<< "which exceeds the mon_max_pg_per_osd "
	<< "value of " << max_pgs_per_osd;
    return -ERANGE;
  }
  return 0;
}
7980
7c673cae
FG
/**
 * @param name The name of the new pool
 * @param crush_rule The crush rule to use. If <0, will use the system default
 * @param crush_rule_name The crush rule to use, if crush_rule <0
 * @param pg_num The pg_num to use. If set to 0, will use the system default
 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
 * @param pg_num_min min pg_num
 * @param pg_num_max max pg_num
 * @param repl_size Replication factor, or 0 for default
 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
 * @param pool_type TYPE_ERASURE, or TYPE_REP
 * @param expected_num_objects expected number of objects on the pool
 * @param fast_read fast read type.
 * @param pg_autoscale_mode autoscale mode, one of on, off, warn
 * @param bulk indicates whether pool should be a bulk pool
 * @param crimson indicates whether pool is a crimson pool
 * @param ss human readable error message, if any.
 *
 * @return 0 on success, negative errno on failure.
 */
int OSDMonitor::prepare_new_pool(string& name,
				 int crush_rule,
				 const string &crush_rule_name,
				 unsigned pg_num, unsigned pgp_num,
				 unsigned pg_num_min,
				 unsigned pg_num_max,
				 const uint64_t repl_size,
				 const uint64_t target_size_bytes,
				 const float target_size_ratio,
				 const string &erasure_code_profile,
				 const unsigned pool_type,
				 const uint64_t expected_num_objects,
				 FastReadType fast_read,
				 string pg_autoscale_mode,
				 bool bulk,
				 bool crimson,
				 ostream *ss)
{
  if (crimson && pg_autoscale_mode.empty()) {
    // default pg_autoscale_mode to off for crimson, we'll error out below if
    // the user tried to actually set pg_autoscale_mode to something other than
    // "off"
    pg_autoscale_mode = "off";
  }

  if (name.length() == 0)
    return -EINVAL;

  // --- resolve pg_num / pgp_num defaults and sanity-check ranges ---
  if (pg_num == 0) {
    // with the autoscaler "on", start tiny (1 PG) and let it grow;
    // otherwise fall back to the configured default pg_num
    auto pg_num_from_mode =
      [pg_num=g_conf().get_val<uint64_t>("osd_pool_default_pg_num")]
      (const string& mode) {
      return mode == "on" ? 1 : pg_num;
    };
    pg_num = pg_num_from_mode(
      pg_autoscale_mode.empty() ?
      g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode") :
      pg_autoscale_mode);
  }
  if (pgp_num == 0)
    pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num");
  if (!pgp_num)
    pgp_num = pg_num;
  if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
    *ss << "'pg_num' must be greater than 0 and less than or equal to "
	<< g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
	<< " (you may adjust 'mon max pool pg num' for higher values)";
    return -ERANGE;
  }
  if (pgp_num > pg_num) {
    *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
	<< ", which in this case is " << pg_num;
    return -ERANGE;
  }

  if (crimson) {
    /* crimson-osd requires that the pool be replicated and that pg_num/pgp_num
     * be static. User must also have specified set-allow-crimson */
    const auto *suffix = " (--crimson specified or osd_pool_default_crimson set)";
    if (pool_type != pg_pool_t::TYPE_REPLICATED) {
      *ss << "crimson-osd only supports replicated pools" << suffix;
      return -EINVAL;
    } else if (pg_autoscale_mode != "off") {
      *ss << "crimson-osd does not support changing pg_num or pgp_num, "
	  << "pg_autoscale_mode must be set to 'off'" << suffix;
      return -EINVAL;
    } else if (!osdmap.get_allow_crimson()) {
      *ss << "set-allow-crimson must be set to create a pool with the "
	  << "crimson flag" << suffix;
      return -EINVAL;
    }
  }

  if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
    *ss << "'fast_read' can only apply to erasure coding pool";
    return -EINVAL;
  }

  // --- resolve crush rule and pool sizing ---
  int r;
  r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
			      crush_rule_name, &crush_rule, ss);
  if (r) {
    dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
    return r;
  }
  unsigned size, min_size;
  r = prepare_pool_size(pool_type, erasure_code_profile, repl_size,
			&size, &min_size, ss);
  if (r) {
    dout(10) << "prepare_pool_size returns " << r << dendl;
    return r;
  }
  // optionally smoke-test the crush rule by running a small mapping
  // simulation in a forked child before committing to it
  if (g_conf()->mon_osd_crush_smoke_test) {
    CrushWrapper newcrush = _get_pending_crush();
    ostringstream err;
    CrushTester tester(newcrush, err);
    tester.set_min_x(0);
    tester.set_max_x(50);
    tester.set_rule(crush_rule);
    tester.set_num_rep(size);
    auto start = ceph::coarse_mono_clock::now();
    r = tester.test_with_fork(cct, g_conf()->mon_lease);
    dout(10) << __func__ << " crush test_with_fork tester created " << dendl;
    auto duration = ceph::coarse_mono_clock::now() - start;
    if (r < 0) {
      dout(10) << "tester.test_with_fork returns " << r
	       << ": " << err.str() << dendl;
      *ss << "crush test failed with " << r << ": " << err.str();
      return r;
    }
    dout(10) << __func__ << " crush smoke test duration: "
	     << duration << dendl;
  }
  // -1: this is a new pool not yet in the map
  r = check_pg_num(-1, pg_num, size, crush_rule, ss);
  if (r) {
    dout(10) << "check_pg_num returns " << r << dendl;
    return r;
  }

  if (osdmap.crush->get_rule_type(crush_rule) != (int)pool_type) {
    *ss << "crush rule " << crush_rule << " type does not match pool";
    return -EINVAL;
  }

  uint32_t stripe_width = 0;
  r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
  if (r) {
    dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
    return r;
  }

  // resolve fast_read (EC pools only; replicated + ON was rejected above)
  bool fread = false;
  if (pool_type == pg_pool_t::TYPE_ERASURE) {
    switch (fast_read) {
      case FAST_READ_OFF:
	fread = false;
	break;
      case FAST_READ_ON:
	fread = true;
	break;
      case FAST_READ_DEFAULT:
	fread = g_conf()->osd_pool_default_ec_fast_read;
	break;
      default:
	*ss << "invalid fast_read setting: " << fast_read;
	return -EINVAL;
    }
  }

  // idempotence: a pending pool with this name already exists
  for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
       p != pending_inc.new_pool_names.end();
       ++p) {
    if (p->second == name)
      return 0;
  }

  // --- allocate the pool id and fill in the new pg_pool_t ---
  if (-1 == pending_inc.new_pool_max)
    pending_inc.new_pool_max = osdmap.pool_max;
  int64_t pool = ++pending_inc.new_pool_max;
  pg_pool_t empty;
  pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
  pi->create_time = ceph_clock_now();
  pi->type = pool_type;
  pi->fast_read = fread;
  pi->flags = g_conf()->osd_pool_default_flags;
  if (bulk) {
    pi->set_flag(pg_pool_t::FLAG_BULK);
  } else if (g_conf()->osd_pool_default_flag_bulk) {
    pi->set_flag(pg_pool_t::FLAG_BULK);
  }
  if (g_conf()->osd_pool_default_flag_hashpspool)
    pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
  if (g_conf()->osd_pool_default_flag_nodelete)
    pi->set_flag(pg_pool_t::FLAG_NODELETE);
  if (g_conf()->osd_pool_default_flag_nopgchange)
    pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
  if (g_conf()->osd_pool_default_flag_nosizechange)
    pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
  pi->set_flag(pg_pool_t::FLAG_CREATING);
  if (g_conf()->osd_pool_use_gmt_hitset)
    pi->use_gmt_hitset = true;
  else
    pi->use_gmt_hitset = false;
  if (crimson) {
    pi->set_flag(pg_pool_t::FLAG_CRIMSON);
    pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
  }

  pi->size = size;
  pi->min_size = min_size;
  pi->crush_rule = crush_rule;
  pi->expected_num_objects = expected_num_objects;
  pi->object_hash = CEPH_STR_HASH_RJENKINS;
  if (osdmap.stretch_mode_enabled) {
    pi->peering_crush_bucket_count = osdmap.stretch_bucket_count;
    pi->peering_crush_bucket_target = osdmap.stretch_bucket_count;
    pi->peering_crush_bucket_barrier = osdmap.stretch_mode_bucket;
    pi->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
    if (osdmap.degraded_stretch_mode) {
      pi->peering_crush_bucket_count = osdmap.degraded_stretch_mode;
      pi->peering_crush_bucket_target = osdmap.degraded_stretch_mode;
      // pi->peering_crush_bucket_mandatory_member = CRUSH_ITEM_NONE;
      // TODO: drat, we don't record this ^ anywhere, though given that it
      // necessarily won't exist elsewhere it likely doesn't matter
      pi->min_size = pi->min_size / 2;
      pi->size = pi->size / 2; // only support 2 zones now
    }
  }

  // autoscale mode: start from the configured default...
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
	g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode"));
      m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  } else {
    pi->pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
  }
  // cap the initial pg_num; pg_num_target keeps the requested value and
  // the autoscaler/merge machinery converges toward it
  auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs");
  pi->set_pg_num(
    max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max))
    : pg_num);
  pi->set_pg_num_pending(pi->get_pg_num());
  pi->set_pg_num_target(pg_num);
  pi->set_pgp_num(pi->get_pg_num());
  pi->set_pgp_num_target(pgp_num);
  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      pg_num_min) {
    pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min));
  }
  if (osdmap.require_osd_release >= ceph_release_t::quincy &&
      pg_num_max) {
    pi->opts.set(pool_opts_t::PG_NUM_MAX, static_cast<int64_t>(pg_num_max));
  }
  // ...then let an explicit per-pool autoscale mode override the default
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
	pg_autoscale_mode); m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  }

  pi->last_change = pending_inc.epoch;
  pi->auid = 0;

  if (pool_type == pg_pool_t::TYPE_ERASURE) {
      pi->erasure_code_profile = erasure_code_profile;
  } else {
      pi->erasure_code_profile = "";
  }
  pi->stripe_width = stripe_width;

  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      target_size_bytes) {
    // only store for nautilus+ because TARGET_SIZE_BYTES may be
    // larger than int32_t max.
    pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes));
  }
  if (target_size_ratio > 0.0 &&
      osdmap.require_osd_release >= ceph_release_t::nautilus) {
    // only store for nautilus+, just to be consistent and tidy.
    pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio);
  }

  // cache-tier defaults (ratios stored in micro units)
  pi->cache_target_dirty_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000;
  pi->cache_target_dirty_high_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
  pi->cache_target_full_ratio_micro =
    g_conf()->osd_pool_default_cache_target_full_ratio * 1000000;
  pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
  pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;

  pending_inc.new_pool_names[pool] = name;
  return 0;
}
8271
8272bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
8273{
8274 op->mark_osdmon_event(__func__);
8275 ostringstream ss;
8276 if (pending_inc.new_flags < 0)
8277 pending_inc.new_flags = osdmap.get_flags();
8278 pending_inc.new_flags |= flag;
8279 ss << OSDMap::get_flag_string(flag) << " is set";
8280 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
8281 get_last_committed() + 1));
8282 return true;
8283}
8284
8285bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
8286{
8287 op->mark_osdmon_event(__func__);
8288 ostringstream ss;
8289 if (pending_inc.new_flags < 0)
8290 pending_inc.new_flags = osdmap.get_flags();
8291 pending_inc.new_flags &= ~flag;
8292 ss << OSDMap::get_flag_string(flag) << " is unset";
8293 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
8294 get_last_committed() + 1));
8295 return true;
8296}
8297
11fdf7f2 8298int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
7c673cae
FG
8299 stringstream& ss)
8300{
8301 string poolstr;
9f95a23c 8302 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
8303 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
8304 if (pool < 0) {
8305 ss << "unrecognized pool '" << poolstr << "'";
8306 return -ENOENT;
8307 }
8308 string var;
9f95a23c 8309 cmd_getval(cmdmap, "var", var);
7c673cae
FG
8310
8311 pg_pool_t p = *osdmap.get_pg_pool(pool);
8312 if (pending_inc.new_pools.count(pool))
8313 p = pending_inc.new_pools[pool];
8314
8315 // accept val as a json string in the normal case (current
8316 // generation monitor). parse out int or float values from the
8317 // string as needed. however, if it is not a string, try to pull
8318 // out an int, in case an older monitor with an older json schema is
8319 // forwarding a request.
8320 string val;
8321 string interr, floaterr;
8322 int64_t n = 0;
8323 double f = 0;
8324 int64_t uf = 0; // micro-f
9f95a23c 8325 cmd_getval(cmdmap, "val", val);
f64942e4 8326
9f95a23c
TL
8327 auto si_options = {
8328 "target_max_objects"
8329 };
8330 auto iec_options = {
8331 "target_max_bytes",
8332 "target_size_bytes",
8333 "compression_max_blob_size",
8334 "compression_min_blob_size",
8335 "csum_max_block",
8336 "csum_min_block",
8337 };
8338 if (count(begin(si_options), end(si_options), var)) {
20effc67 8339 n = strict_si_cast<int64_t>(val, &interr);
9f95a23c 8340 } else if (count(begin(iec_options), end(iec_options), var)) {
20effc67 8341 n = strict_iec_cast<int64_t>(val, &interr);
92f5a8d4
TL
8342 } else {
8343 // parse string as both int and float; different fields use different types.
8344 n = strict_strtoll(val.c_str(), 10, &interr);
8345 f = strict_strtod(val.c_str(), &floaterr);
8346 uf = llrintl(f * (double)1000000.0);
8347 }
7c673cae
FG
8348
8349 if (!p.is_tier() &&
8350 (var == "hit_set_type" || var == "hit_set_period" ||
8351 var == "hit_set_count" || var == "hit_set_fpp" ||
8352 var == "target_max_objects" || var == "target_max_bytes" ||
8353 var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
8354 var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
8355 var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
8356 var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
8357 var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
8358 return -EACCES;
8359 }
8360
8361 if (var == "size") {
8362 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
8363 ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
8364 return -EPERM;
8365 }
8366 if (p.type == pg_pool_t::TYPE_ERASURE) {
8367 ss << "can not change the size of an erasure-coded pool";
8368 return -ENOTSUP;
8369 }
8370 if (interr.length()) {
8371 ss << "error parsing integer value '" << val << "': " << interr;
8372 return -EINVAL;
8373 }
8374 if (n <= 0 || n > 10) {
8375 ss << "pool size must be between 1 and 10";
8376 return -EINVAL;
8377 }
f67539c2
TL
8378 if (n == 1) {
8379 if (!g_conf().get_val<bool>("mon_allow_pool_size_one")) {
8380 ss << "configuring pool size as 1 is disabled by default.";
8381 return -EPERM;
8382 }
8383 bool sure = false;
8384 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
8385 if (!sure) { ss << "WARNING: setting pool size 1 could lead to data loss "
8386 "without recovery. If you are *ABSOLUTELY CERTAIN* that is what you want, "
8387 "pass the flag --yes-i-really-mean-it.";
8388 return -EPERM;
8389 }
8390 }
20effc67
TL
8391 if (osdmap.crush->get_rule_type(p.get_crush_rule()) != (int)p.type) {
8392 ss << "crush rule " << p.get_crush_rule() << " type does not match pool";
eafe8130
TL
8393 return -EINVAL;
8394 }
39ae355f
TL
8395 if (n > p.size) {
8396 // only when increasing pool size
8397 int r = check_pg_num(pool, p.get_pg_num(), n, p.get_crush_rule(), &ss);
8398 if (r < 0) {
8399 return r;
8400 }
3efd9988 8401 }
7c673cae 8402 p.size = n;
1911f103 8403 p.min_size = g_conf().get_osd_pool_default_min_size(p.size);
7c673cae
FG
8404 } else if (var == "min_size") {
8405 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
8406 ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
8407 return -EPERM;
8408 }
8409 if (interr.length()) {
8410 ss << "error parsing integer value '" << val << "': " << interr;
8411 return -EINVAL;
8412 }
8413
8414 if (p.type != pg_pool_t::TYPE_ERASURE) {
8415 if (n < 1 || n > p.size) {
494da23a 8416 ss << "pool min_size must be between 1 and size, which is set to " << (int)p.size;
7c673cae
FG
8417 return -EINVAL;
8418 }
8419 } else {
8420 ErasureCodeInterfaceRef erasure_code;
8421 int k;
8422 stringstream tmp;
8423 int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
8424 if (err == 0) {
8425 k = erasure_code->get_data_chunk_count();
8426 } else {
b32b8144 8427 ss << __func__ << " get_erasure_code failed: " << tmp.str();
7c673cae
FG
8428 return err;
8429 }
8430
8431 if (n < k || n > p.size) {
494da23a 8432 ss << "pool min_size must be between " << k << " and size, which is set to " << (int)p.size;
7c673cae
FG
8433 return -EINVAL;
8434 }
8435 }
8436 p.min_size = n;
11fdf7f2 8437 } else if (var == "pg_num_actual") {
1e59de90
TL
8438 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8439 ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8440 return -EPERM;
8441 }
7c673cae
FG
8442 if (interr.length()) {
8443 ss << "error parsing integer value '" << val << "': " << interr;
8444 return -EINVAL;
8445 }
11fdf7f2
TL
8446 if (n == (int)p.get_pg_num()) {
8447 return 0;
8448 }
8449 if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
8450 ss << "'pg_num' must be greater than 0 and less than or equal to "
8451 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
8452 << " (you may adjust 'mon max pool pg num' for higher values)";
8453 return -ERANGE;
8454 }
8455 if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
8456 ss << "cannot adjust pg_num while initial PGs are being created";
8457 return -EBUSY;
8458 }
8459 if (n > (int)p.get_pg_num()) {
8460 if (p.get_pg_num() != p.get_pg_num_pending()) {
8461 // force pre-nautilus clients to resend their ops, since they
8462 // don't understand pg_num_pending changes form a new interval
8463 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8464 }
8465 p.set_pg_num(n);
8466 } else {
9f95a23c 8467 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
11fdf7f2
TL
8468 ss << "nautilus OSDs are required to adjust pg_num_pending";
8469 return -EPERM;
8470 }
8471 if (n < (int)p.get_pgp_num()) {
8472 ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
8473 return -EINVAL;
8474 }
8475 if (n < (int)p.get_pg_num() - 1) {
8476 ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
8477 << ") - 1; only single pg decrease is currently supported";
8478 return -EINVAL;
8479 }
8480 p.set_pg_num_pending(n);
8481 // force pre-nautilus clients to resend their ops, since they
8482 // don't understand pg_num_pending changes form a new interval
8483 p.last_force_op_resend_prenautilus = pending_inc.epoch;
7c673cae 8484 }
11fdf7f2
TL
8485 // force pre-luminous clients to resend their ops, since they
8486 // don't understand that split PGs now form a new interval.
8487 p.last_force_op_resend_preluminous = pending_inc.epoch;
7c673cae
FG
8488 } else if (var == "pg_num") {
8489 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8490 ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8491 return -EPERM;
8492 }
8493 if (interr.length()) {
8494 ss << "error parsing integer value '" << val << "': " << interr;
8495 return -EINVAL;
8496 }
11fdf7f2 8497 if (n == (int)p.get_pg_num_target()) {
7c673cae
FG
8498 return 0;
8499 }
11fdf7f2
TL
8500 if (n <= 0 || static_cast<uint64_t>(n) >
8501 g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
c07f9fc5 8502 ss << "'pg_num' must be greater than 0 and less than or equal to "
11fdf7f2 8503 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
c07f9fc5
FG
8504 << " (you may adjust 'mon max pool pg num' for higher values)";
8505 return -ERANGE;
8506 }
11fdf7f2 8507 if (n > (int)p.get_pg_num_target()) {
20effc67 8508 int r = check_pg_num(pool, n, p.get_size(), p.get_crush_rule(), &ss);
11fdf7f2
TL
8509 if (r) {
8510 return r;
8511 }
8512 bool force = false;
9f95a23c 8513 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
11fdf7f2
TL
8514 if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
8515 ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
8516 return -EPERM;
8517 }
8518 } else {
9f95a23c 8519 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
11fdf7f2
TL
8520 ss << "nautilus OSDs are required to decrease pg_num";
8521 return -EPERM;
8522 }
7c673cae 8523 }
20effc67
TL
8524 int64_t pg_min = 0, pg_max = 0;
8525 p.opts.get(pool_opts_t::PG_NUM_MIN, &pg_min);
8526 p.opts.get(pool_opts_t::PG_NUM_MAX, &pg_max);
8527 if (pg_min && n < pg_min) {
8528 ss << "specified pg_num " << n
8529 << " < pg_num_min " << pg_min;
8530 return -EINVAL;
8531 }
8532 if (pg_max && n > pg_max) {
8533 ss << "specified pg_num " << n
8534 << " < pg_num_max " << pg_max;
8535 return -EINVAL;
8536 }
9f95a23c 8537 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
494da23a
TL
8538 // pre-nautilus osdmap format; increase pg_num directly
8539 assert(n > (int)p.get_pg_num());
8540 // force pre-nautilus clients to resend their ops, since they
8541 // don't understand pg_num_target changes form a new interval
8542 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8543 // force pre-luminous clients to resend their ops, since they
8544 // don't understand that split PGs now form a new interval.
8545 p.last_force_op_resend_preluminous = pending_inc.epoch;
8546 p.set_pg_num(n);
8547 } else {
8548 // set targets; mgr will adjust pg_num_actual and pgp_num later.
8549 // make pgp_num track pg_num if it already matches. if it is set
8550 // differently, leave it different and let the user control it
8551 // manually.
8552 if (p.get_pg_num_target() == p.get_pgp_num_target()) {
8553 p.set_pgp_num_target(n);
8554 }
8555 p.set_pg_num_target(n);
7c673cae 8556 }
11fdf7f2 8557 } else if (var == "pgp_num_actual") {
7c673cae
FG
8558 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8559 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8560 return -EPERM;
8561 }
8562 if (interr.length()) {
8563 ss << "error parsing integer value '" << val << "': " << interr;
8564 return -EINVAL;
8565 }
8566 if (n <= 0) {
8567 ss << "specified pgp_num must > 0, but you set to " << n;
8568 return -EINVAL;
8569 }
8570 if (n > (int)p.get_pg_num()) {
8571 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
8572 return -EINVAL;
8573 }
11fdf7f2
TL
8574 if (n > (int)p.get_pg_num_pending()) {
8575 ss << "specified pgp_num " << n
8576 << " > pg_num_pending " << p.get_pg_num_pending();
8577 return -EINVAL;
8578 }
7c673cae 8579 p.set_pgp_num(n);
11fdf7f2
TL
8580 } else if (var == "pgp_num") {
8581 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8582 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8583 return -EPERM;
8584 }
8585 if (interr.length()) {
8586 ss << "error parsing integer value '" << val << "': " << interr;
8587 return -EINVAL;
8588 }
8589 if (n <= 0) {
8590 ss << "specified pgp_num must > 0, but you set to " << n;
8591 return -EINVAL;
8592 }
8593 if (n > (int)p.get_pg_num_target()) {
8594 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target();
8595 return -EINVAL;
8596 }
9f95a23c 8597 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
494da23a
TL
8598 // pre-nautilus osdmap format; increase pgp_num directly
8599 p.set_pgp_num(n);
8600 } else {
8601 p.set_pgp_num_target(n);
8602 }
11fdf7f2 8603 } else if (var == "pg_autoscale_mode") {
9f95a23c
TL
8604 auto m = pg_pool_t::get_pg_autoscale_mode_by_name(val);
8605 if (m == pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
11fdf7f2
TL
8606 ss << "specified invalid mode " << val;
8607 return -EINVAL;
8608 }
9f95a23c 8609 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
494da23a
TL
8610 ss << "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
8611 return -EINVAL;
8612 }
9f95a23c 8613 p.pg_autoscale_mode = m;
7c673cae
FG
8614 } else if (var == "crush_rule") {
8615 int id = osdmap.crush->get_rule_id(val);
8616 if (id == -ENOENT) {
8617 ss << "crush rule " << val << " does not exist";
8618 return -ENOENT;
8619 }
8620 if (id < 0) {
8621 ss << cpp_strerror(id);
8622 return -ENOENT;
8623 }
20effc67
TL
8624 if (osdmap.crush->get_rule_type(id) != (int)p.get_type()) {
8625 ss << "crush rule " << id << " type does not match pool";
7c673cae
FG
8626 return -EINVAL;
8627 }
31f18b77 8628 p.crush_rule = id;
7c673cae
FG
8629 } else if (var == "nodelete" || var == "nopgchange" ||
8630 var == "nosizechange" || var == "write_fadvise_dontneed" ||
20effc67 8631 var == "noscrub" || var == "nodeep-scrub" || var == "bulk") {
7c673cae 8632 uint64_t flag = pg_pool_t::get_flag_by_name(var);
20effc67
TL
8633 // make sure we only compare against 'n' if we didn't receive a string
8634 if (val == "true" || (interr.empty() && n == 1)) {
8635 p.set_flag(flag);
8636 } else if (val == "false" || (interr.empty() && n == 0)) {
1e59de90
TL
8637 if (flag == pg_pool_t::FLAG_NOPGCHANGE && p.is_crimson()) {
8638 ss << "cannot clear FLAG_NOPGCHANGE on a crimson pool";
8639 return -EINVAL;
8640 }
20effc67
TL
8641 p.unset_flag(flag);
8642 } else {
8643 ss << "expecting value 'true', 'false', '0', or '1'";
8644 return -EINVAL;
8645 }
8646 } else if (var == "eio") {
8647 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8648
7c673cae
FG
8649 // make sure we only compare against 'n' if we didn't receive a string
8650 if (val == "true" || (interr.empty() && n == 1)) {
8651 p.set_flag(flag);
8652 } else if (val == "false" || (interr.empty() && n == 0)) {
8653 p.unset_flag(flag);
8654 } else {
8655 ss << "expecting value 'true', 'false', '0', or '1'";
8656 return -EINVAL;
8657 }
8658 } else if (var == "hashpspool") {
8659 uint64_t flag = pg_pool_t::get_flag_by_name(var);
11fdf7f2 8660 bool force = false;
9f95a23c 8661 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
11fdf7f2
TL
8662
8663 if (!force) {
7c673cae
FG
8664 ss << "are you SURE? this will remap all placement groups in this pool,"
8665 " this triggers large data movement,"
8666 " pass --yes-i-really-mean-it if you really do.";
8667 return -EPERM;
8668 }
8669 // make sure we only compare against 'n' if we didn't receive a string
8670 if (val == "true" || (interr.empty() && n == 1)) {
8671 p.set_flag(flag);
8672 } else if (val == "false" || (interr.empty() && n == 0)) {
8673 p.unset_flag(flag);
8674 } else {
8675 ss << "expecting value 'true', 'false', '0', or '1'";
8676 return -EINVAL;
8677 }
8678 } else if (var == "hit_set_type") {
8679 if (val == "none")
8680 p.hit_set_params = HitSet::Params();
8681 else {
8682 int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
8683 if (err)
8684 return err;
8685 if (val == "bloom") {
8686 BloomHitSet::Params *bsp = new BloomHitSet::Params;
11fdf7f2 8687 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
7c673cae
FG
8688 p.hit_set_params = HitSet::Params(bsp);
8689 } else if (val == "explicit_hash")
8690 p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
8691 else if (val == "explicit_object")
8692 p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
8693 else {
8694 ss << "unrecognized hit_set type '" << val << "'";
8695 return -EINVAL;
8696 }
8697 }
8698 } else if (var == "hit_set_period") {
8699 if (interr.length()) {
8700 ss << "error parsing integer value '" << val << "': " << interr;
8701 return -EINVAL;
11fdf7f2
TL
8702 } else if (n < 0) {
8703 ss << "hit_set_period should be non-negative";
8704 return -EINVAL;
7c673cae
FG
8705 }
8706 p.hit_set_period = n;
8707 } else if (var == "hit_set_count") {
8708 if (interr.length()) {
8709 ss << "error parsing integer value '" << val << "': " << interr;
8710 return -EINVAL;
11fdf7f2
TL
8711 } else if (n < 0) {
8712 ss << "hit_set_count should be non-negative";
8713 return -EINVAL;
7c673cae
FG
8714 }
8715 p.hit_set_count = n;
8716 } else if (var == "hit_set_fpp") {
8717 if (floaterr.length()) {
8718 ss << "error parsing floating point value '" << val << "': " << floaterr;
8719 return -EINVAL;
11fdf7f2
TL
8720 } else if (f < 0 || f > 1.0) {
8721 ss << "hit_set_fpp should be in the range 0..1";
8722 return -EINVAL;
7c673cae
FG
8723 }
8724 if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
8725 ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
8726 return -EINVAL;
8727 }
8728 BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
8729 bloomp->set_fpp(f);
8730 } else if (var == "use_gmt_hitset") {
8731 if (val == "true" || (interr.empty() && n == 1)) {
7c673cae
FG
8732 p.use_gmt_hitset = true;
8733 } else {
8734 ss << "expecting value 'true' or '1'";
8735 return -EINVAL;
8736 }
8737 } else if (var == "allow_ec_overwrites") {
8738 if (!p.is_erasure()) {
8739 ss << "ec overwrites can only be enabled for an erasure coded pool";
8740 return -EINVAL;
8741 }
224ce89b 8742 stringstream err;
11fdf7f2 8743 if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites &&
224ce89b
WB
8744 !is_pool_currently_all_bluestore(pool, p, &err)) {
8745 ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
8746 return -EINVAL;
8747 }
7c673cae
FG
8748 if (val == "true" || (interr.empty() && n == 1)) {
8749 p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
8750 } else if (val == "false" || (interr.empty() && n == 0)) {
8751 ss << "ec overwrites cannot be disabled once enabled";
8752 return -EINVAL;
8753 } else {
8754 ss << "expecting value 'true', 'false', '0', or '1'";
8755 return -EINVAL;
8756 }
7c673cae
FG
8757 } else if (var == "target_max_objects") {
8758 if (interr.length()) {
8759 ss << "error parsing int '" << val << "': " << interr;
8760 return -EINVAL;
8761 }
8762 p.target_max_objects = n;
8763 } else if (var == "target_max_bytes") {
8764 if (interr.length()) {
8765 ss << "error parsing int '" << val << "': " << interr;
8766 return -EINVAL;
8767 }
8768 p.target_max_bytes = n;
8769 } else if (var == "cache_target_dirty_ratio") {
8770 if (floaterr.length()) {
8771 ss << "error parsing float '" << val << "': " << floaterr;
8772 return -EINVAL;
8773 }
8774 if (f < 0 || f > 1.0) {
8775 ss << "value must be in the range 0..1";
8776 return -ERANGE;
8777 }
8778 p.cache_target_dirty_ratio_micro = uf;
8779 } else if (var == "cache_target_dirty_high_ratio") {
8780 if (floaterr.length()) {
8781 ss << "error parsing float '" << val << "': " << floaterr;
8782 return -EINVAL;
8783 }
8784 if (f < 0 || f > 1.0) {
8785 ss << "value must be in the range 0..1";
8786 return -ERANGE;
8787 }
8788 p.cache_target_dirty_high_ratio_micro = uf;
8789 } else if (var == "cache_target_full_ratio") {
8790 if (floaterr.length()) {
8791 ss << "error parsing float '" << val << "': " << floaterr;
8792 return -EINVAL;
8793 }
8794 if (f < 0 || f > 1.0) {
8795 ss << "value must be in the range 0..1";
8796 return -ERANGE;
8797 }
8798 p.cache_target_full_ratio_micro = uf;
8799 } else if (var == "cache_min_flush_age") {
8800 if (interr.length()) {
8801 ss << "error parsing int '" << val << "': " << interr;
8802 return -EINVAL;
8803 }
8804 p.cache_min_flush_age = n;
8805 } else if (var == "cache_min_evict_age") {
8806 if (interr.length()) {
8807 ss << "error parsing int '" << val << "': " << interr;
8808 return -EINVAL;
8809 }
8810 p.cache_min_evict_age = n;
8811 } else if (var == "min_read_recency_for_promote") {
8812 if (interr.length()) {
8813 ss << "error parsing integer value '" << val << "': " << interr;
8814 return -EINVAL;
8815 }
8816 p.min_read_recency_for_promote = n;
8817 } else if (var == "hit_set_grade_decay_rate") {
8818 if (interr.length()) {
8819 ss << "error parsing integer value '" << val << "': " << interr;
8820 return -EINVAL;
8821 }
8822 if (n > 100 || n < 0) {
8823 ss << "value out of range,valid range is 0 - 100";
8824 return -EINVAL;
8825 }
8826 p.hit_set_grade_decay_rate = n;
8827 } else if (var == "hit_set_search_last_n") {
8828 if (interr.length()) {
8829 ss << "error parsing integer value '" << val << "': " << interr;
8830 return -EINVAL;
8831 }
8832 if (n > p.hit_set_count || n < 0) {
8833 ss << "value out of range,valid range is 0 - hit_set_count";
8834 return -EINVAL;
8835 }
8836 p.hit_set_search_last_n = n;
8837 } else if (var == "min_write_recency_for_promote") {
8838 if (interr.length()) {
8839 ss << "error parsing integer value '" << val << "': " << interr;
8840 return -EINVAL;
8841 }
8842 p.min_write_recency_for_promote = n;
8843 } else if (var == "fast_read") {
8844 if (p.is_replicated()) {
8845 ss << "fast read is not supported in replication pool";
8846 return -EINVAL;
8847 }
8848 if (val == "true" || (interr.empty() && n == 1)) {
8849 p.fast_read = true;
8850 } else if (val == "false" || (interr.empty() && n == 0)) {
8851 p.fast_read = false;
8852 } else {
8853 ss << "expecting value 'true', 'false', '0', or '1'";
8854 return -EINVAL;
8855 }
8856 } else if (pool_opts_t::is_opt_name(var)) {
224ce89b 8857 bool unset = val == "unset";
7c673cae 8858 if (var == "compression_mode") {
224ce89b
WB
8859 if (!unset) {
8860 auto cmode = Compressor::get_comp_mode_type(val);
8861 if (!cmode) {
8862 ss << "unrecognized compression mode '" << val << "'";
8863 return -EINVAL;
8864 }
7c673cae
FG
8865 }
8866 } else if (var == "compression_algorithm") {
224ce89b
WB
8867 if (!unset) {
8868 auto alg = Compressor::get_comp_alg_type(val);
8869 if (!alg) {
8870 ss << "unrecognized compression_algorithm '" << val << "'";
8871 return -EINVAL;
8872 }
7c673cae
FG
8873 }
8874 } else if (var == "compression_required_ratio") {
8875 if (floaterr.length()) {
8876 ss << "error parsing float value '" << val << "': " << floaterr;
8877 return -EINVAL;
8878 }
224ce89b 8879 if (f < 0 || f > 1) {
7c673cae 8880 ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
224ce89b 8881 return -EINVAL;
7c673cae
FG
8882 }
8883 } else if (var == "csum_type") {
224ce89b 8884 auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
7c673cae
FG
8885 if (t < 0 ) {
8886 ss << "unrecognized csum_type '" << val << "'";
224ce89b 8887 return -EINVAL;
7c673cae
FG
8888 }
8889 //preserve csum_type numeric value
8890 n = t;
8891 interr.clear();
8892 } else if (var == "compression_max_blob_size" ||
8893 var == "compression_min_blob_size" ||
8894 var == "csum_max_block" ||
8895 var == "csum_min_block") {
8896 if (interr.length()) {
8897 ss << "error parsing int value '" << val << "': " << interr;
8898 return -EINVAL;
8899 }
11fdf7f2
TL
8900 } else if (var == "fingerprint_algorithm") {
8901 if (!unset) {
8902 auto alg = pg_pool_t::get_fingerprint_from_str(val);
8903 if (!alg) {
8904 ss << "unrecognized fingerprint_algorithm '" << val << "'";
8905 return -EINVAL;
8906 }
8907 }
92f5a8d4
TL
8908 } else if (var == "target_size_bytes") {
8909 if (interr.length()) {
8910 ss << "error parsing unit value '" << val << "': " << interr;
8911 return -EINVAL;
8912 }
9f95a23c 8913 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
92f5a8d4
TL
8914 ss << "must set require_osd_release to nautilus or "
8915 << "later before setting target_size_bytes";
8916 return -EINVAL;
8917 }
1d09f67e
TL
8918 } else if (var == "target_size_ratio") {
8919 if (f < 0.0) {
8920 ss << "target_size_ratio cannot be negative";
8921 return -EINVAL;
8922 }
11fdf7f2
TL
8923 } else if (var == "pg_num_min") {
8924 if (interr.length()) {
8925 ss << "error parsing int value '" << val << "': " << interr;
8926 return -EINVAL;
8927 }
8928 if (n > (int)p.get_pg_num_target()) {
8929 ss << "specified pg_num_min " << n
8930 << " > pg_num " << p.get_pg_num_target();
8931 return -EINVAL;
8932 }
20effc67
TL
8933 } else if (var == "pg_num_max") {
8934 if (interr.length()) {
8935 ss << "error parsing int value '" << val << "': " << interr;
8936 return -EINVAL;
8937 }
8938 if (n && n < (int)p.get_pg_num_target()) {
8939 ss << "specified pg_num_max " << n
8940 << " < pg_num " << p.get_pg_num_target();
8941 return -EINVAL;
8942 }
11fdf7f2
TL
8943 } else if (var == "recovery_priority") {
8944 if (interr.length()) {
8945 ss << "error parsing int value '" << val << "': " << interr;
8946 return -EINVAL;
8947 }
81eedcae
TL
8948 if (!g_conf()->debug_allow_any_pool_priority) {
8949 if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
8950 ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
8951 << " and " << OSD_POOL_PRIORITY_MAX;
8952 return -EINVAL;
8953 }
11fdf7f2
TL
8954 }
8955 } else if (var == "pg_autoscale_bias") {
8956 if (f < 0.0 || f > 1000.0) {
8957 ss << "pg_autoscale_bias must be between 0 and 1000";
8958 return -EINVAL;
8959 }
f67539c2
TL
8960 } else if (var == "dedup_tier") {
8961 if (interr.empty()) {
8962 ss << "expecting value 'pool name'";
8963 return -EINVAL;
8964 }
8965 // Current base tier in dedup does not support ec pool
8966 if (p.is_erasure()) {
8967 ss << "pool '" << poolstr
8968 << "' is an ec pool, which cannot be a base tier";
8969 return -ENOTSUP;
8970 }
8971 int64_t lowtierpool_id = osdmap.lookup_pg_pool_name(val);
8972 if (lowtierpool_id < 0) {
8973 ss << "unrecognized pool '" << val << "'";
8974 return -ENOENT;
8975 }
8976 const pg_pool_t *tp = osdmap.get_pg_pool(lowtierpool_id);
8977 ceph_assert(tp);
8978 n = lowtierpool_id;
8979 // The original input is string (pool name), but we convert it to int64_t.
8980 // So, clear interr
8981 interr.clear();
8982 } else if (var == "dedup_chunk_algorithm") {
8983 if (!unset) {
8984 auto alg = pg_pool_t::get_dedup_chunk_algorithm_from_str(val);
8985 if (!alg) {
8986 ss << "unrecognized fingerprint_algorithm '" << val << "'";
8987 return -EINVAL;
8988 }
8989 }
8990 } else if (var == "dedup_cdc_chunk_size") {
8991 if (interr.length()) {
8992 ss << "error parsing int value '" << val << "': " << interr;
8993 return -EINVAL;
8994 }
7c673cae
FG
8995 }
8996
8997 pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
8998 switch (desc.type) {
8999 case pool_opts_t::STR:
224ce89b 9000 if (unset) {
7c673cae
FG
9001 p.opts.unset(desc.key);
9002 } else {
9003 p.opts.set(desc.key, static_cast<std::string>(val));
9004 }
9005 break;
9006 case pool_opts_t::INT:
9007 if (interr.length()) {
9008 ss << "error parsing integer value '" << val << "': " << interr;
9009 return -EINVAL;
9010 }
9011 if (n == 0) {
9012 p.opts.unset(desc.key);
9013 } else {
11fdf7f2 9014 p.opts.set(desc.key, static_cast<int64_t>(n));
7c673cae
FG
9015 }
9016 break;
9017 case pool_opts_t::DOUBLE:
9018 if (floaterr.length()) {
9019 ss << "error parsing floating point value '" << val << "': " << floaterr;
9020 return -EINVAL;
9021 }
9022 if (f == 0) {
9023 p.opts.unset(desc.key);
9024 } else {
9025 p.opts.set(desc.key, static_cast<double>(f));
9026 }
9027 break;
9028 default:
11fdf7f2 9029 ceph_assert(!"unknown type");
7c673cae
FG
9030 }
9031 } else {
9032 ss << "unrecognized variable '" << var << "'";
9033 return -EINVAL;
9034 }
224ce89b
WB
9035 if (val != "unset") {
9036 ss << "set pool " << pool << " " << var << " to " << val;
9037 } else {
9038 ss << "unset pool " << pool << " " << var;
9039 }
7c673cae
FG
9040 p.last_change = pending_inc.epoch;
9041 pending_inc.new_pools[pool] = p;
9042 return 0;
9043}
9044
// Prepare-phase entry point for "osd pool application ..." commands.
// Forwards to _command_pool_application with preparing=true so the
// change is actually staged in the pending incremental map; the
// "modified" out-param is not needed in this phase (nullptr).
int OSDMonitor::prepare_command_pool_application(const string &prefix,
						 const cmdmap_t& cmdmap,
						 stringstream& ss)
{
  return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
}
9051
// Preprocess-phase entry point for "osd pool application ..." commands.
// Forwards to _command_pool_application with preparing=false: only
// validates the command and reports via *modified whether it would
// change anything; no pending state is touched.
int OSDMonitor::preprocess_command_pool_application(const string &prefix,
						    const cmdmap_t& cmdmap,
						    stringstream& ss,
						    bool *modified)
{
  return _command_pool_application(prefix, cmdmap, ss, modified, false);
}
9059
9060
9061/**
9062 * Common logic for preprocess and prepare phases of pool application
9063 * tag commands. In preprocess mode we're only detecting invalid
9064 * commands, and determining whether it was a modification or a no-op.
9065 * In prepare mode we're actually updating the pending state.
9066 */
9067int OSDMonitor::_command_pool_application(const string &prefix,
9068 const cmdmap_t& cmdmap,
9069 stringstream& ss,
9070 bool *modified,
9071 bool preparing)
c07f9fc5
FG
9072{
9073 string pool_name;
9f95a23c 9074 cmd_getval(cmdmap, "pool", pool_name);
c07f9fc5
FG
9075 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
9076 if (pool < 0) {
9077 ss << "unrecognized pool '" << pool_name << "'";
9078 return -ENOENT;
9079 }
9080
9081 pg_pool_t p = *osdmap.get_pg_pool(pool);
11fdf7f2
TL
9082 if (preparing) {
9083 if (pending_inc.new_pools.count(pool)) {
9084 p = pending_inc.new_pools[pool];
9085 }
c07f9fc5
FG
9086 }
9087
9088 string app;
9f95a23c 9089 cmd_getval(cmdmap, "app", app);
c07f9fc5
FG
9090 bool app_exists = (p.application_metadata.count(app) > 0);
9091
11fdf7f2 9092 string key;
9f95a23c 9093 cmd_getval(cmdmap, "key", key);
11fdf7f2
TL
9094 if (key == "all") {
9095 ss << "key cannot be 'all'";
9096 return -EINVAL;
9097 }
9098
9099 string value;
9f95a23c 9100 cmd_getval(cmdmap, "value", value);
11fdf7f2
TL
9101 if (value == "all") {
9102 ss << "value cannot be 'all'";
9103 return -EINVAL;
9104 }
9105
c07f9fc5
FG
9106 if (boost::algorithm::ends_with(prefix, "enable")) {
9107 if (app.empty()) {
9108 ss << "application name must be provided";
9109 return -EINVAL;
9110 }
9111
9112 if (p.is_tier()) {
9113 ss << "application must be enabled on base tier";
9114 return -EINVAL;
9115 }
9116
11fdf7f2 9117 bool force = false;
9f95a23c 9118 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
c07f9fc5 9119
11fdf7f2 9120 if (!app_exists && !p.application_metadata.empty() && !force) {
c07f9fc5
FG
9121 ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
9122 << "application; pass --yes-i-really-mean-it to proceed anyway";
9123 return -EPERM;
9124 }
9125
9126 if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
9127 ss << "too many enabled applications on pool '" << pool_name << "'; "
9128 << "max " << MAX_POOL_APPLICATIONS;
9129 return -EINVAL;
9130 }
9131
9132 if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
9133 ss << "application name '" << app << "' too long; max length "
9134 << MAX_POOL_APPLICATION_LENGTH;
9135 return -EINVAL;
9136 }
9137
9138 if (!app_exists) {
9139 p.application_metadata[app] = {};
9140 }
9141 ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
9142
9143 } else if (boost::algorithm::ends_with(prefix, "disable")) {
11fdf7f2 9144 bool force = false;
9f95a23c 9145 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
c07f9fc5 9146
11fdf7f2 9147 if (!force) {
c07f9fc5
FG
9148 ss << "Are you SURE? Disabling an application within a pool might result "
9149 << "in loss of application functionality; pass "
9150 << "--yes-i-really-mean-it to proceed anyway";
9151 return -EPERM;
9152 }
9153
9154 if (!app_exists) {
9155 ss << "application '" << app << "' is not enabled on pool '" << pool_name
9156 << "'";
9157 return 0; // idempotent
9158 }
9159
9160 p.application_metadata.erase(app);
9161 ss << "disable application '" << app << "' on pool '" << pool_name << "'";
9162
9163 } else if (boost::algorithm::ends_with(prefix, "set")) {
9164 if (p.is_tier()) {
9165 ss << "application metadata must be set on base tier";
9166 return -EINVAL;
9167 }
9168
9169 if (!app_exists) {
9170 ss << "application '" << app << "' is not enabled on pool '" << pool_name
9171 << "'";
9172 return -ENOENT;
9173 }
9174
9175 string key;
9f95a23c 9176 cmd_getval(cmdmap, "key", key);
c07f9fc5
FG
9177
9178 if (key.empty()) {
9179 ss << "key must be provided";
9180 return -EINVAL;
9181 }
9182
9183 auto &app_keys = p.application_metadata[app];
9184 if (app_keys.count(key) == 0 &&
9185 app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
9186 ss << "too many keys set for application '" << app << "' on pool '"
9187 << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
9188 return -EINVAL;
9189 }
9190
9191 if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
9192 ss << "key '" << app << "' too long; max length "
9193 << MAX_POOL_APPLICATION_LENGTH;
9194 return -EINVAL;
9195 }
9196
9197 string value;
9f95a23c 9198 cmd_getval(cmdmap, "value", value);
c07f9fc5
FG
9199 if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
9200 ss << "value '" << value << "' too long; max length "
9201 << MAX_POOL_APPLICATION_LENGTH;
9202 return -EINVAL;
9203 }
9204
9205 p.application_metadata[app][key] = value;
9206 ss << "set application '" << app << "' key '" << key << "' to '"
9207 << value << "' on pool '" << pool_name << "'";
9208 } else if (boost::algorithm::ends_with(prefix, "rm")) {
9209 if (!app_exists) {
9210 ss << "application '" << app << "' is not enabled on pool '" << pool_name
9211 << "'";
9212 return -ENOENT;
9213 }
9214
9215 string key;
9f95a23c 9216 cmd_getval(cmdmap, "key", key);
c07f9fc5
FG
9217 auto it = p.application_metadata[app].find(key);
9218 if (it == p.application_metadata[app].end()) {
9219 ss << "application '" << app << "' on pool '" << pool_name
9220 << "' does not have key '" << key << "'";
9221 return 0; // idempotent
9222 }
9223
9224 p.application_metadata[app].erase(it);
9225 ss << "removed application '" << app << "' key '" << key << "' on pool '"
9226 << pool_name << "'";
9227 } else {
11fdf7f2
TL
9228 ceph_abort();
9229 }
9230
9231 if (preparing) {
9232 p.last_change = pending_inc.epoch;
9233 pending_inc.new_pools[pool] = p;
9234 }
9235
9236 // Because we fell through this far, we didn't hit no-op cases,
9237 // so pool was definitely modified
9238 if (modified != nullptr) {
9239 *modified = true;
c07f9fc5
FG
9240 }
9241
c07f9fc5
FG
9242 return 0;
9243}
9244
31f18b77
FG
9245int OSDMonitor::_prepare_command_osd_crush_remove(
9246 CrushWrapper &newcrush,
9247 int32_t id,
9248 int32_t ancestor,
9249 bool has_ancestor,
9250 bool unlink_only)
9251{
9252 int err = 0;
9253
9254 if (has_ancestor) {
11fdf7f2 9255 err = newcrush.remove_item_under(cct, id, ancestor,
31f18b77
FG
9256 unlink_only);
9257 } else {
11fdf7f2 9258 err = newcrush.remove_item(cct, id, unlink_only);
31f18b77
FG
9259 }
9260 return err;
9261}
9262
9263void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
9264{
9265 pending_inc.crush.clear();
f67539c2 9266 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
31f18b77
FG
9267}
9268
9269int OSDMonitor::prepare_command_osd_crush_remove(
9270 CrushWrapper &newcrush,
9271 int32_t id,
9272 int32_t ancestor,
9273 bool has_ancestor,
9274 bool unlink_only)
9275{
9276 int err = _prepare_command_osd_crush_remove(
9277 newcrush, id, ancestor,
9278 has_ancestor, unlink_only);
9279
9280 if (err < 0)
9281 return err;
9282
11fdf7f2 9283 ceph_assert(err == 0);
31f18b77
FG
9284 do_osd_crush_remove(newcrush);
9285
9286 return 0;
9287}
9288
9289int OSDMonitor::prepare_command_osd_remove(int32_t id)
9290{
9291 if (osdmap.is_up(id)) {
9292 return -EBUSY;
9293 }
9294
9295 pending_inc.new_state[id] = osdmap.get_state(id);
9296 pending_inc.new_uuid[id] = uuid_d();
9297 pending_metadata_rm.insert(id);
9298 pending_metadata.erase(id);
9299
9300 return 0;
9301}
9302
9303int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
9304{
11fdf7f2 9305 ceph_assert(existing_id);
31f18b77
FG
9306 *existing_id = -1;
9307
9308 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
9309 if (!osdmap.exists(i) &&
9310 pending_inc.new_up_client.count(i) == 0 &&
9311 (pending_inc.new_state.count(i) == 0 ||
9312 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
9313 *existing_id = i;
9314 return -1;
9315 }
9316 }
9317
9318 if (pending_inc.new_max_osd < 0) {
9319 return osdmap.get_max_osd();
9320 }
9321 return pending_inc.new_max_osd;
9322}
9323
/**
 * Stage the creation of an osd in the pending incremental.
 *
 * Chooses the definitive id for the osd -- an existing id found via `uuid`,
 * the caller-supplied `id`, or a freshly allocated slot -- and reports it
 * through *new_id.  Optionally records `device_class` for the osd in the
 * pending crush map, then marks the osd IN and NEW (and stores its uuid,
 * if one was given).
 *
 * Validation is presumed to have happened beforehand (see
 * validate_osd_create()); inconsistencies abort via ceph_assert.
 */
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const string& device_class,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  ceph_assert(new_id);

  // We presume validation has been performed prior to calling this
  // function. We assert with prejudice.

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    // uuid given: reuse the id of an osd that already carries it, if any
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      // caller's id (if any) must agree with the uuid's owner
      ceph_assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
           << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    // _allocate_osd_id found a reusable hole below max_osd
    ceph_assert(existing_id < osdmap.get_max_osd());
    ceph_assert(allocated_id < 0);
    *new_id = existing_id;
  } else if (allocated_id >= 0) {
    ceph_assert(existing_id < 0);
    // raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    ceph_assert(*new_id == allocated_id);
  } else {
    ceph_abort_msg("unexpected condition");
  }

out:
  if (device_class.size()) {
    // stage the device class for the new osd in the pending crush map
    CrushWrapper newcrush = _get_pending_crush();
    if (newcrush.get_max_devices() < *new_id + 1) {
      newcrush.set_max_devices(*new_id + 1);
    }
    string name = string("osd.") + stringify(*new_id);
    if (!newcrush.item_exists(*new_id)) {
      newcrush.set_item_name(*new_id, name);
    }
    ostringstream ss;
    int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
    if (r < 0) {
      derr << __func__ << " failed to set " << name << " device_class "
           << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
           << dendl;
      // non-fatal... this might be a replay and we want to be idempotent.
    } else {
      dout(20) << __func__ << " set " << name << " device_class " << device_class
               << dendl;
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
    }
  } else {
    dout(20) << __func__ << " no device_class" << dendl;
  }

  dout(10) << __func__ << " using id " << *new_id << dendl;
  // make sure max_osd covers the chosen id
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  pending_inc.new_weight[*new_id] = CEPH_OSD_IN;
  // do not set EXISTS; OSDMap::set_weight, called by apply_incremental, will
  // set it for us. (ugh.)
  pending_inc.new_state[*new_id] |= CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
9413
/**
 * Validate an osd create/new request against the committed and pending maps.
 *
 * Return values:
 *   0        - creation may proceed (nothing conflicting found);
 *   EEXIST   - (positive!) an osd with this uuid already exists and matches
 *              `id`; *existing_id is set, operation would be idempotent;
 *   -EEXIST  - uuid is already bound to a *different* id;
 *   -EAGAIN  - an osd with this uuid/id is already staged in pending_inc;
 *   -EINVAL  - `id` is taken and does not match `uuid` (only when
 *              check_osd_exists is set).
 */
int OSDMonitor::validate_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const bool check_osd_exists,
    int32_t* existing_id,
    stringstream& ss)
{

  dout(10) << __func__ << " id " << id << " uuid " << uuid
           << " check_osd_exists " << check_osd_exists << dendl;

  ceph_assert(existing_id);

  if (id < 0 && uuid.is_zero()) {
    // we have nothing to validate
    *existing_id = -1;
    return 0;
  } else if (uuid.is_zero()) {
    // we have an id but we will ignore it - because that's what
    // `osd create` does.
    return 0;
  }

  /*
   * This function will be used to validate whether we are able to
   * create a new osd when the `uuid` is specified.
   *
   * It will be used by both `osd create` and `osd new`, as the checks
   * are basically the same when it pertains to osd id and uuid validation.
   * However, `osd create` presumes an `uuid` is optional, for legacy
   * reasons, while `osd new` requires the `uuid` to be provided. This
   * means that `osd create` will not be idempotent if an `uuid` is not
   * provided, but we will always guarantee the idempotency of `osd new`.
   */

  ceph_assert(!uuid.is_zero());
  if (pending_inc.identify_osd(uuid) >= 0) {
    // osd is about to exist
    return -EAGAIN;
  }

  int32_t i = osdmap.identify_osd(uuid);
  if (i >= 0) {
    // osd already exists
    if (id >= 0 && i != id) {
      ss << "uuid " << uuid << " already in use for different id " << i;
      return -EEXIST;
    }
    // return a positive errno to distinguish between a blocking error
    // and an error we consider to not be a problem (i.e., this would be
    // an idempotent operation).
    *existing_id = i;
    return EEXIST;
  }
  // i < 0
  if (id >= 0) {
    if (pending_inc.new_state.count(id)) {
      // osd is about to exist
      return -EAGAIN;
    }
    // we may not care if an osd exists if we are recreating a previously
    // destroyed osd.
    if (check_osd_exists && osdmap.exists(id)) {
      ss << "id " << id << " already in use and does not match uuid "
         << uuid;
      return -EINVAL;
    }
  }
  return 0;
}
9484
9485int OSDMonitor::prepare_command_osd_create(
9486 const int32_t id,
9487 const uuid_d& uuid,
9488 int32_t* existing_id,
9489 stringstream& ss)
9490{
9491 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
11fdf7f2 9492 ceph_assert(existing_id);
b5b8bbf5
FG
9493 if (osdmap.is_destroyed(id)) {
9494 ss << "ceph osd create has been deprecated. Please use ceph osd new "
9495 "instead.";
9496 return -EINVAL;
9497 }
31f18b77
FG
9498
9499 if (uuid.is_zero()) {
9500 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
9501 }
9502
9503 return validate_osd_create(id, uuid, true, existing_id, ss);
9504}
9505
/**
 * Handle `osd new`: create a brand new osd, or recreate a previously
 * destroyed one, optionally registering cephx/lockbox secrets and a
 * dm-crypt key with the auth and kv monitors.
 *
 * Returns 0 when an update was staged in pending_inc, the *positive*
 * value EEXIST when the request turned out to be an idempotent replay
 * (the osd id is still emitted to @f / @ss), or a negative errno on
 * validation failure.
 */
int OSDMonitor::prepare_command_osd_new(
    MonOpRequestRef op,
    const cmdmap_t& cmdmap,
    const map<string,string>& params,
    stringstream &ss,
    Formatter *f)
{
  uuid_d uuid;
  string uuidstr;
  int64_t id = -1;

  ceph_assert(paxos.is_plugged());

  dout(10) << __func__ << " " << op << dendl;

  /* validate command. abort now if something's wrong. */

  /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
   *
   * If `id` is not specified, we will identify any existing osd based
   * on `uuid`. Operation will be idempotent iff secrets match.
   *
   * If `id` is specified, we will identify any existing osd based on
   * `uuid` and match against `id`. If they match, operation will be
   * idempotent iff secrets match.
   *
   * `-i secrets.json` will be optional. If supplied, will be used
   * to check for idempotency when `id` and `uuid` match.
   *
   * If `id` is not specified, and `uuid` does not exist, an id will
   * be found or allocated for the osd.
   *
   * If `id` is specified, and the osd has been previously marked
   * as destroyed, then the `id` will be reused.
   */
  if (!cmd_getval(cmdmap, "uuid", uuidstr)) {
    ss << "requires the OSD's UUID to be specified.";
    return -EINVAL;
  } else if (!uuid.parse(uuidstr.c_str())) {
    ss << "invalid UUID value '" << uuidstr << "'.";
    return -EINVAL;
  }

  if (cmd_getval(cmdmap, "id", id) &&
      (id < 0)) {
    ss << "invalid OSD id; must be greater or equal than zero.";
    return -EINVAL;
  }

  // are we running an `osd create`-like command, or recreating
  // a previously destroyed osd?

  bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));

  // we will care about `id` to assess whether osd is `destroyed`, or
  // to create a new osd.
  // we will need an `id` by the time we reach auth.

  int32_t existing_id = -1;
  int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
                                &existing_id, ss);

  bool may_be_idempotent = false;
  if (err == EEXIST) {
    // this is idempotent from the osdmon's point-of-view
    may_be_idempotent = true;
    ceph_assert(existing_id >= 0);
    id = existing_id;
  } else if (err < 0) {
    return err;
  }

  if (!may_be_idempotent) {
    // idempotency is out of the window. We are either creating a new
    // osd or recreating a destroyed osd.
    //
    // We now need to figure out if we have an `id` (and if it's valid),
    // of find an `id` if we don't have one.

    // NOTE: we need to consider the case where the `id` is specified for
    // `osd create`, and we must honor it. So this means checking if
    // the `id` is destroyed, and if so assume the destroy; otherwise,
    // check if it `exists` - in which case we complain about not being
    // `destroyed`. In the end, if nothing fails, we must allow the
    // creation, so that we are compatible with `create`.
    if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
      ss << "OSD " << id << " has not yet been destroyed";
      return -EINVAL;
    } else if (id < 0) {
      // find an `id`
      id = _allocate_osd_id(&existing_id);
      if (id < 0) {
        ceph_assert(existing_id >= 0);
        id = existing_id;
      }
      dout(10) << __func__ << " found id " << id << " to use" << dendl;
    } else if (id >= 0 && osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " recreating osd." << id << dendl;
    } else {
      dout(10) << __func__ << " creating new osd." << id << dendl;
    }
  } else {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.exists(id));
  }

  // we are now able to either create a brand new osd or reuse an existing
  // osd that has been previously destroyed.

  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;

  if (may_be_idempotent && params.empty()) {
    // nothing to do, really.
    dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }

  string device_class;
  auto p = params.find("crush_device_class");
  if (p != params.end()) {
    device_class = p->second;
    dout(20) << __func__ << " device_class will be " << device_class << dendl;
  }
  string cephx_secret, lockbox_secret, dmcrypt_key;
  bool has_lockbox = false;
  bool has_secrets = params.count("cephx_secret")
    || params.count("cephx_lockbox_secret")
    || params.count("dmcrypt_key");

  KVMonitor *svc = nullptr;
  AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;

  if (has_secrets) {
    // a cephx secret is mandatory once any secret is supplied; lockbox
    // secret and dm-crypt key must be supplied together or not at all.
    if (params.count("cephx_secret") == 0) {
      ss << "requires a cephx secret.";
      return -EINVAL;
    }
    cephx_secret = params.at("cephx_secret");

    bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
    bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);

    dout(10) << __func__ << " has lockbox " << has_lockbox_secret
             << " dmcrypt " << has_dmcrypt_key << dendl;

    if (has_lockbox_secret && has_dmcrypt_key) {
      has_lockbox = true;
      lockbox_secret = params.at("cephx_lockbox_secret");
      dmcrypt_key = params.at("dmcrypt_key");
    } else if (!has_lockbox_secret != !has_dmcrypt_key) {
      ss << "requires both a cephx lockbox secret and a dm-crypt key.";
      return -EINVAL;
    }

    dout(10) << __func__ << " validate secrets using osd id " << id << dendl;

    err = mon.authmon()->validate_osd_new(id, uuid,
                                          cephx_secret,
                                          lockbox_secret,
                                          cephx_entity,
                                          lockbox_entity,
                                          ss);
    if (err < 0) {
      return err;
    } else if (may_be_idempotent && err != EEXIST) {
      // for this to be idempotent, `id` should already be >= 0; no need
      // to use validate_id.
      ceph_assert(id >= 0);
      ss << "osd." << id << " exists but secrets do not match";
      return -EEXIST;
    }

    if (has_lockbox) {
      svc = mon.kvmon();
      err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
      if (err < 0) {
        return err;
      } else if (may_be_idempotent && err != EEXIST) {
        ceph_assert(id >= 0);
        ss << "osd." << id << " exists but dm-crypt key does not match.";
        return -EEXIST;
      }
    }
  }
  ceph_assert(!has_secrets || !cephx_secret.empty());
  ceph_assert(!has_lockbox || !lockbox_secret.empty());

  if (may_be_idempotent) {
    // we have nothing to do for either the osdmon or the authmon,
    // and we have no lockbox - so the config key service will not be
    // touched. This is therefore an idempotent operation, and we can
    // just return right away.
    dout(10) << __func__ << " idempotent -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }
  ceph_assert(!may_be_idempotent);

  // perform updates.
  if (has_secrets) {
    ceph_assert(!cephx_secret.empty());
    ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
                (!lockbox_secret.empty() && !dmcrypt_key.empty()));

    err = mon.authmon()->do_osd_new(cephx_entity,
                                    lockbox_entity,
                                    has_lockbox);
    ceph_assert(0 == err);

    if (has_lockbox) {
      ceph_assert(nullptr != svc);
      svc->do_osd_new(uuid, dmcrypt_key);
    }
  }

  if (is_recreate_destroyed) {
    // reuse the destroyed osd's id: stage the state-bit updates directly
    ceph_assert(id >= 0);
    ceph_assert(osdmap.is_destroyed(id));
    pending_inc.new_state[id] |= CEPH_OSD_DESTROYED;
    if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
      pending_inc.new_state[id] |= CEPH_OSD_NEW;
    }
    if (osdmap.get_state(id) & CEPH_OSD_UP) {
      // due to http://tracker.ceph.com/issues/20751 some clusters may
      // have UP set for non-existent OSDs; make sure it is cleared
      // for a newly created osd.
      pending_inc.new_state[id] |= CEPH_OSD_UP;
    }
    pending_inc.new_uuid[id] = uuid;
  } else {
    // brand new osd: delegate the pending-map bookkeeping
    ceph_assert(id >= 0);
    int32_t new_id = -1;
    do_osd_create(id, uuid, device_class, &new_id);
    ceph_assert(new_id >= 0);
    ceph_assert(id == new_id);
  }

  if (f) {
    f->open_object_section("created_osd");
    f->dump_int("osdid", id);
    f->close_section();
  } else {
    ss << id;
  }

  return 0;
}
9769
7c673cae
FG
9770bool OSDMonitor::prepare_command(MonOpRequestRef op)
9771{
9772 op->mark_osdmon_event(__func__);
9f95a23c 9773 auto m = op->get_req<MMonCommand>();
7c673cae 9774 stringstream ss;
11fdf7f2 9775 cmdmap_t cmdmap;
7c673cae
FG
9776 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
9777 string rs = ss.str();
f67539c2 9778 mon.reply_command(op, -EINVAL, rs, get_last_committed());
7c673cae
FG
9779 return true;
9780 }
9781
11fdf7f2 9782 MonSession *session = op->get_session();
7c673cae 9783 if (!session) {
11fdf7f2 9784 derr << __func__ << " no session" << dendl;
f67539c2 9785 mon.reply_command(op, -EACCES, "access denied", get_last_committed());
7c673cae
FG
9786 return true;
9787 }
9788
9789 return prepare_command_impl(op, cmdmap);
9790}
9791
9792static int parse_reweights(CephContext *cct,
11fdf7f2 9793 const cmdmap_t& cmdmap,
7c673cae
FG
9794 const OSDMap& osdmap,
9795 map<int32_t, uint32_t>* weights)
9796{
9797 string weights_str;
9f95a23c 9798 if (!cmd_getval(cmdmap, "weights", weights_str)) {
7c673cae
FG
9799 return -EINVAL;
9800 }
9801 std::replace(begin(weights_str), end(weights_str), '\'', '"');
9802 json_spirit::mValue json_value;
9803 if (!json_spirit::read(weights_str, json_value)) {
9804 return -EINVAL;
9805 }
9806 if (json_value.type() != json_spirit::obj_type) {
9807 return -EINVAL;
9808 }
9809 const auto obj = json_value.get_obj();
9810 try {
9811 for (auto& osd_weight : obj) {
9812 auto osd_id = std::stoi(osd_weight.first);
9813 if (!osdmap.exists(osd_id)) {
9814 return -ENOENT;
9815 }
9816 if (osd_weight.second.type() != json_spirit::str_type) {
9817 return -EINVAL;
9818 }
9819 auto weight = std::stoul(osd_weight.second.get_str());
9820 weights->insert({osd_id, weight});
9821 }
9822 } catch (const std::logic_error& e) {
9823 return -EINVAL;
9824 }
9825 return 0;
9826}
9827
31f18b77
FG
/**
 * Stage the destruction of osd.`id`: revoke its auth entities and config
 * keys (unless already gone), then mark it DESTROYED with a zeroed uuid in
 * the pending incremental.
 *
 * Returns 0 on success (or if the osd is already destroyed), -ENOENT if
 * the osd does not exist, or a negative errno from auth validation.
 * NOTE: this function has side effects on the auth/kv monitors even though
 * the caller is responsible for proposing (see comment at the bottom).
 */
int OSDMonitor::prepare_command_osd_destroy(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos.is_plugged());

  // we check if the osd exists for the benefit of `osd purge`, which may
  // have previously removed the osd. If the osd does not exist, return
  // -ENOENT to convey this, and let the caller deal with it.
  //
  // we presume that all auth secrets and config keys were removed prior
  // to this command being called. if they exist by now, we also assume
  // they must have been created by some other command and do not pertain
  // to this non-existent osd.
  if (!osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
    return -ENOENT;
  }

  uuid_d uuid = osdmap.get_uuid(id);
  dout(10) << __func__ << " destroying osd." << id
           << " uuid " << uuid << dendl;

  // if it has been destroyed, we assume our work here is done.
  if (osdmap.is_destroyed(id)) {
    ss << "destroyed osd." << id;
    return 0;
  }

  EntityName cephx_entity, lockbox_entity;
  bool idempotent_auth = false, idempotent_cks = false;

  // -ENOENT from either service means that part was already cleaned up;
  // treat it as idempotent rather than an error.
  int err = mon.authmon()->validate_osd_destroy(id, uuid,
                                                cephx_entity,
                                                lockbox_entity,
                                                ss);
  if (err < 0) {
    if (err == -ENOENT) {
      idempotent_auth = true;
    } else {
      return err;
    }
  }

  auto svc = mon.kvmon();
  err = svc->validate_osd_destroy(id, uuid);
  if (err < 0) {
    ceph_assert(err == -ENOENT);
    err = 0;
    idempotent_cks = true;
  }

  if (!idempotent_auth) {
    err = mon.authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
    ceph_assert(0 == err);
  }

  if (!idempotent_cks) {
    svc->do_osd_destroy(id, uuid);
  }

  // stage the osdmap change: DESTROYED state, uuid wiped
  pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
  pending_inc.new_uuid[id] = uuid_d();

  // we can only propose_pending() once per service, otherwise we'll be
  // defying PaxosService and all laws of nature. Therefore, as we may
  // be used during 'osd purge', let's keep the caller responsible for
  // proposing.
  ceph_assert(err == 0);
  return 0;
}
9899
/**
 * Purge osd.`id` entirely: remove it from crush, destroy it (auth/kv
 * cleanup + DESTROYED state), and remove it from the osdmap -- all staged
 * as one pending update.  The osd must not be up (asserted).
 *
 * Returns 0 on success, -ENOENT if the osd was already fully purged, or a
 * negative errno if the crush removal or destroy step fails before any
 * update is staged.
 */
int OSDMonitor::prepare_command_osd_purge(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos.is_plugged());
  dout(10) << __func__ << " purging osd." << id << dendl;

  ceph_assert(!osdmap.is_up(id));

  /*
   * This may look a bit weird, but this is what's going to happen:
   *
   *  1. we make sure that removing from crush works
   *  2. we call `prepare_command_osd_destroy()`. If it returns an
   *     error, then we abort the whole operation, as no updates
   *     have been made. However, this function will have
   *     side-effects, thus we need to make sure that all operations
   *     performed henceforth will *always* succeed.
   *  3. we call `prepare_command_osd_remove()`. Although this
   *     function can return an error, it currently only checks if the
   *     osd is up - and we have made sure that it is not so, so there
   *     is no conflict, and it is effectively an update.
   *  4. finally, we call `do_osd_crush_remove()`, which will perform
   *     the crush update we delayed from before.
   */

  CrushWrapper newcrush = _get_pending_crush();

  bool may_be_idempotent = false;

  // step 1: try the crush removal on a scratch copy (not yet staged)
  int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
  if (err == -ENOENT) {
    err = 0;
    may_be_idempotent = true;
  } else if (err < 0) {
    ss << "error removing osd." << id << " from crush";
    return err;
  }

  // no point destroying the osd again if it has already been marked destroyed
  if (!osdmap.is_destroyed(id)) {
    // step 2: destroy (auth/kv side effects happen here)
    err = prepare_command_osd_destroy(id, ss);
    if (err < 0) {
      if (err == -ENOENT) {
        err = 0;
      } else {
        return err;
      }
    } else {
      may_be_idempotent = false;
    }
  }
  ceph_assert(0 == err);

  if (may_be_idempotent && !osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist and "
             << "we are idempotent." << dendl;
    return -ENOENT;
  }

  // step 3: stage removal from the osdmap
  err = prepare_command_osd_remove(id);
  // we should not be busy, as we should have made sure this id is not up.
  ceph_assert(0 == err);

  // step 4: stage the crush update we prepared in step 1
  do_osd_crush_remove(newcrush);
  return 0;
}
9967
1e59de90
TL
/**
 * Fetch and validate the "pgid" argument from @cmdmap.
 *
 * On success @pgid holds the parsed pg and 0 is returned.  Returns -EINVAL
 * if the argument is missing or unparseable (explanation in @ss), and
 * -ENOENT if the pg does not exist in the current osdmap.
 *
 * NOTE(review): @pgids is taken *by value*, so the assignment at the bottom
 * only updates the local copy and is never observable by the caller.  If
 * this was meant to hand the raw pgid string back, the parameter should be
 * a pointer or reference -- confirm against call sites.
 */
int OSDMonitor::parse_pgid(const cmdmap_t& cmdmap, stringstream &ss,
	       /* out */ pg_t &pgid, std::optional<string> pgids) {
  string pgidstr;
  if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
    // NOTE(review): cmdmap.at() throws if "pgid" is absent entirely;
    // cmd_getval can fail for either a missing key or a wrong type --
    // presumably the command descriptor guarantees the key exists. Verify.
    ss << "unable to parse 'pgid' value '"
       << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
    return -EINVAL;
  }
  if (!pgid.parse(pgidstr.c_str())) {
    ss << "invalid pgid '" << pgidstr << "'";
    return -EINVAL;
  }
  // pg must exist in the committed map
  if (!osdmap.pg_exists(pgid)) {
    ss << "pgid '" << pgid << "' does not exist";
    return -ENOENT;
  }
  if (pgids.has_value())
    pgids.value() = pgidstr;
  return 0;
}
9988
7c673cae 9989bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
11fdf7f2 9990 const cmdmap_t& cmdmap)
7c673cae
FG
9991{
9992 op->mark_osdmon_event(__func__);
9f95a23c 9993 auto m = op->get_req<MMonCommand>();
7c673cae
FG
9994 bool ret = false;
9995 stringstream ss;
9996 string rs;
9997 bufferlist rdata;
9998 int err = 0;
9999
20effc67 10000 string format = cmd_getval_or<string>(cmdmap, "format", "plain");
7c673cae
FG
10001 boost::scoped_ptr<Formatter> f(Formatter::create(format));
10002
10003 string prefix;
9f95a23c 10004 cmd_getval(cmdmap, "prefix", prefix);
7c673cae
FG
10005
10006 int64_t osdid;
11fdf7f2 10007 string osd_name;
b32b8144
FG
10008 bool osdid_present = false;
10009 if (prefix != "osd pg-temp" &&
10010 prefix != "osd pg-upmap" &&
10011 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
9f95a23c 10012 osdid_present = cmd_getval(cmdmap, "id", osdid);
b32b8144 10013 }
7c673cae
FG
10014 if (osdid_present) {
10015 ostringstream oss;
10016 oss << "osd." << osdid;
11fdf7f2 10017 osd_name = oss.str();
7c673cae
FG
10018 }
10019
10020 // Even if there's a pending state with changes that could affect
10021 // a command, considering that said state isn't yet committed, we
10022 // just don't care about those changes if the command currently being
10023 // handled acts as a no-op against the current committed state.
10024 // In a nutshell, we assume this command happens *before*.
10025 //
10026 // Let me make this clearer:
10027 //
10028 // - If we have only one client, and that client issues some
10029 // operation that would conflict with this operation but is
10030 // still on the pending state, then we would be sure that said
10031 // operation wouldn't have returned yet, so the client wouldn't
10032 // issue this operation (unless the client didn't wait for the
10033 // operation to finish, and that would be the client's own fault).
10034 //
10035 // - If we have more than one client, each client will observe
10036 // whatever is the state at the moment of the commit. So, if we
10037 // have two clients, one issuing an unlink and another issuing a
10038 // link, and if the link happens while the unlink is still on the
10039 // pending state, from the link's point-of-view this is a no-op.
10040 // If different clients are issuing conflicting operations and
10041 // they care about that, then the clients should make sure they
10042 // enforce some kind of concurrency mechanism -- from our
10043 // perspective that's what Douglas Adams would call an SEP.
10044 //
10045 // This should be used as a general guideline for most commands handled
10046 // in this function. Adapt as you see fit, but please bear in mind that
10047 // this is the expected behavior.
10048
10049
10050 if (prefix == "osd setcrushmap" ||
10051 (prefix == "osd crush set" && !osdid_present)) {
31f18b77
FG
10052 if (pending_inc.crush.length()) {
10053 dout(10) << __func__ << " waiting for pending crush update " << dendl;
10054 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10055 return true;
10056 }
7c673cae
FG
10057 dout(10) << "prepare_command setting new crush map" << dendl;
10058 bufferlist data(m->get_data());
10059 CrushWrapper crush;
10060 try {
11fdf7f2 10061 auto bl = data.cbegin();
7c673cae
FG
10062 crush.decode(bl);
10063 }
10064 catch (const std::exception &e) {
10065 err = -EINVAL;
10066 ss << "Failed to parse crushmap: " << e.what();
10067 goto reply;
10068 }
31f18b77
FG
10069
10070 int64_t prior_version = 0;
9f95a23c 10071 if (cmd_getval(cmdmap, "prior_version", prior_version)) {
31f18b77
FG
10072 if (prior_version == osdmap.get_crush_version() - 1) {
10073 // see if we are a resend of the last update. this is imperfect
10074 // (multiple racing updaters may not both get reliable success)
10075 // but we expect crush updaters (via this interface) to be rare-ish.
10076 bufferlist current, proposed;
f67539c2
TL
10077 osdmap.crush->encode(current, mon.get_quorum_con_features());
10078 crush.encode(proposed, mon.get_quorum_con_features());
31f18b77
FG
10079 if (current.contents_equal(proposed)) {
10080 dout(10) << __func__
10081 << " proposed matches current and version equals previous"
10082 << dendl;
10083 err = 0;
10084 ss << osdmap.get_crush_version();
10085 goto reply;
10086 }
10087 }
10088 if (prior_version != osdmap.get_crush_version()) {
10089 err = -EPERM;
10090 ss << "prior_version " << prior_version << " != crush version "
10091 << osdmap.get_crush_version();
10092 goto reply;
10093 }
10094 }
7c673cae
FG
10095
10096 if (!validate_crush_against_features(&crush, ss)) {
10097 err = -EINVAL;
10098 goto reply;
10099 }
31f18b77 10100
3efd9988
FG
10101 err = osdmap.validate_crush_rules(&crush, &ss);
10102 if (err < 0) {
10103 goto reply;
7c673cae
FG
10104 }
10105
11fdf7f2 10106 if (g_conf()->mon_osd_crush_smoke_test) {
224ce89b
WB
10107 // sanity check: test some inputs to make sure this map isn't
10108 // totally broken
10109 dout(10) << " testing map" << dendl;
10110 stringstream ess;
10111 CrushTester tester(crush, ess);
b5b8bbf5 10112 tester.set_min_x(0);
224ce89b 10113 tester.set_max_x(50);
20effc67 10114 tester.set_num_rep(3); // arbitrary
b5b8bbf5 10115 auto start = ceph::coarse_mono_clock::now();
39ae355f 10116 int r = tester.test_with_fork(cct, g_conf()->mon_lease);
b5b8bbf5 10117 auto duration = ceph::coarse_mono_clock::now() - start;
224ce89b
WB
10118 if (r < 0) {
10119 dout(10) << " tester.test_with_fork returns " << r
10120 << ": " << ess.str() << dendl;
10121 ss << "crush smoke test failed with " << r << ": " << ess.str();
10122 err = r;
10123 goto reply;
10124 }
b5b8bbf5
FG
10125 dout(10) << __func__ << " crush somke test duration: "
10126 << duration << ", result: " << ess.str() << dendl;
7c673cae
FG
10127 }
10128
7c673cae 10129 pending_inc.crush = data;
31f18b77 10130 ss << osdmap.get_crush_version() + 1;
7c673cae
FG
10131 goto update;
10132
3efd9988 10133 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
20effc67 10134 CrushWrapper newcrush = _get_pending_crush();
3efd9988
FG
10135 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
10136 int bid = -1 - b;
10137 if (newcrush.bucket_exists(bid) &&
11fdf7f2 10138 newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
3efd9988
FG
10139 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
10140 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
10141 }
10142 }
10143 if (!validate_crush_against_features(&newcrush, ss)) {
10144 err = -EINVAL;
10145 goto reply;
10146 }
10147 pending_inc.crush.clear();
f67539c2 10148 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
3efd9988
FG
10149 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10150 get_last_committed() + 1));
10151 return true;
7c673cae 10152 } else if (prefix == "osd crush set-device-class") {
7c673cae 10153 string device_class;
9f95a23c 10154 if (!cmd_getval(cmdmap, "class", device_class)) {
7c673cae
FG
10155 err = -EINVAL; // no value!
10156 goto reply;
10157 }
10158
224ce89b
WB
10159 bool stop = false;
10160 vector<string> idvec;
9f95a23c 10161 cmd_getval(cmdmap, "ids", idvec);
20effc67 10162 CrushWrapper newcrush = _get_pending_crush();
224ce89b
WB
10163 set<int> updated;
10164 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
10165 set<int> osds;
10166 // wildcard?
10167 if (j == 0 &&
10168 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
10169 osdmap.get_all_osds(osds);
10170 stop = true;
10171 } else {
10172 // try traditional single osd way
10173 long osd = parse_osd_id(idvec[j].c_str(), &ss);
10174 if (osd < 0) {
10175 // ss has reason for failure
10176 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
10177 err = -EINVAL;
10178 continue;
10179 }
10180 osds.insert(osd);
10181 }
7c673cae 10182
224ce89b
WB
10183 for (auto &osd : osds) {
10184 if (!osdmap.exists(osd)) {
10185 ss << "osd." << osd << " does not exist. ";
10186 continue;
10187 }
7c673cae 10188
224ce89b
WB
10189 ostringstream oss;
10190 oss << "osd." << osd;
10191 string name = oss.str();
7c673cae 10192
3a9019d9
FG
10193 if (newcrush.get_max_devices() < osd + 1) {
10194 newcrush.set_max_devices(osd + 1);
10195 }
224ce89b
WB
10196 string action;
10197 if (newcrush.item_exists(osd)) {
10198 action = "updating";
10199 } else {
10200 action = "creating";
10201 newcrush.set_item_name(osd, name);
10202 }
7c673cae 10203
224ce89b
WB
10204 dout(5) << action << " crush item id " << osd << " name '" << name
10205 << "' device_class '" << device_class << "'"
10206 << dendl;
10207 err = newcrush.update_device_class(osd, device_class, name, &ss);
10208 if (err < 0) {
10209 goto reply;
10210 }
10211 if (err == 0 && !_have_pending_crush()) {
10212 if (!stop) {
10213 // for single osd only, wildcard makes too much noise
10214 ss << "set-device-class item id " << osd << " name '" << name
11fdf7f2 10215 << "' device_class '" << device_class << "': no change. ";
224ce89b
WB
10216 }
10217 } else {
10218 updated.insert(osd);
10219 }
10220 }
7c673cae
FG
10221 }
10222
f67539c2
TL
10223 pending_inc.crush.clear();
10224 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10225 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
10226 getline(ss, rs);
10227 wait_for_finished_proposal(
10228 op,
10229 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
10230 return true;
c07f9fc5
FG
10231 } else if (prefix == "osd crush rm-device-class") {
10232 bool stop = false;
10233 vector<string> idvec;
9f95a23c 10234 cmd_getval(cmdmap, "ids", idvec);
20effc67 10235 CrushWrapper newcrush = _get_pending_crush();
c07f9fc5
FG
10236 set<int> updated;
10237
10238 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
10239 set<int> osds;
10240
10241 // wildcard?
10242 if (j == 0 &&
10243 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
10244 osdmap.get_all_osds(osds);
10245 stop = true;
10246 } else {
10247 // try traditional single osd way
10248 long osd = parse_osd_id(idvec[j].c_str(), &ss);
10249 if (osd < 0) {
10250 // ss has reason for failure
10251 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
10252 err = -EINVAL;
10253 goto reply;
10254 }
10255 osds.insert(osd);
10256 }
10257
10258 for (auto &osd : osds) {
10259 if (!osdmap.exists(osd)) {
10260 ss << "osd." << osd << " does not exist. ";
10261 continue;
10262 }
10263
10264 auto class_name = newcrush.get_item_class(osd);
c07f9fc5
FG
10265 if (!class_name) {
10266 ss << "osd." << osd << " belongs to no class, ";
10267 continue;
10268 }
10269 // note that we do not verify if class_is_in_use here
10270 // in case the device is misclassified and user wants
10271 // to overridely reset...
10272
11fdf7f2 10273 err = newcrush.remove_device_class(cct, osd, &ss);
c07f9fc5
FG
10274 if (err < 0) {
10275 // ss has reason for failure
10276 goto reply;
10277 }
10278 updated.insert(osd);
10279 }
10280 }
10281
f67539c2
TL
10282 pending_inc.crush.clear();
10283 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10284 ss << "done removing class of osd(s): " << updated;
10285 getline(ss, rs);
10286 wait_for_finished_proposal(
10287 op,
10288 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
10289 return true;
11fdf7f2
TL
10290 } else if (prefix == "osd crush class create") {
10291 string device_class;
9f95a23c 10292 if (!cmd_getval(cmdmap, "class", device_class)) {
11fdf7f2
TL
10293 err = -EINVAL; // no value!
10294 goto reply;
10295 }
9f95a23c 10296 if (osdmap.require_osd_release < ceph_release_t::luminous) {
11fdf7f2
TL
10297 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
10298 << "luminous' before using crush device classes";
10299 err = -EPERM;
10300 goto reply;
10301 }
10302 if (!_have_pending_crush() &&
10303 _get_stable_crush().class_exists(device_class)) {
10304 ss << "class '" << device_class << "' already exists";
10305 goto reply;
10306 }
20effc67 10307 CrushWrapper newcrush = _get_pending_crush();
11fdf7f2
TL
10308 if (newcrush.class_exists(device_class)) {
10309 ss << "class '" << device_class << "' already exists";
10310 goto update;
10311 }
10312 int class_id = newcrush.get_or_create_class_id(device_class);
10313 pending_inc.crush.clear();
f67539c2 10314 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11fdf7f2
TL
10315 ss << "created class " << device_class << " with id " << class_id
10316 << " to crush map";
10317 goto update;
10318 } else if (prefix == "osd crush class rm") {
10319 string device_class;
9f95a23c 10320 if (!cmd_getval(cmdmap, "class", device_class)) {
11fdf7f2
TL
10321 err = -EINVAL; // no value!
10322 goto reply;
10323 }
9f95a23c 10324 if (osdmap.require_osd_release < ceph_release_t::luminous) {
11fdf7f2
TL
10325 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
10326 << "luminous' before using crush device classes";
10327 err = -EPERM;
10328 goto reply;
10329 }
10330
10331 if (!osdmap.crush->class_exists(device_class)) {
10332 err = 0;
10333 goto reply;
10334 }
10335
20effc67 10336 CrushWrapper newcrush = _get_pending_crush();
11fdf7f2
TL
10337 if (!newcrush.class_exists(device_class)) {
10338 err = 0; // make command idempotent
10339 goto wait;
10340 }
10341 int class_id = newcrush.get_class_id(device_class);
10342 stringstream ts;
10343 if (newcrush.class_is_in_use(class_id, &ts)) {
10344 err = -EBUSY;
10345 ss << "class '" << device_class << "' " << ts.str();
10346 goto reply;
10347 }
10348
10349 // check if class is used by any erasure-code-profiles
10350 mempool::osdmap::map<string,map<string,string>> old_ec_profiles =
10351 osdmap.get_erasure_code_profiles();
10352 auto ec_profiles = pending_inc.get_erasure_code_profiles();
10353#ifdef HAVE_STDLIB_MAP_SPLICING
10354 ec_profiles.merge(old_ec_profiles);
10355#else
10356 ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)),
10357 make_move_iterator(end(old_ec_profiles)));
10358#endif
10359 list<string> referenced_by;
10360 for (auto &i: ec_profiles) {
10361 for (auto &j: i.second) {
10362 if ("crush-device-class" == j.first && device_class == j.second) {
10363 referenced_by.push_back(i.first);
10364 }
10365 }
10366 }
10367 if (!referenced_by.empty()) {
10368 err = -EBUSY;
10369 ss << "class '" << device_class
10370 << "' is still referenced by erasure-code-profile(s): " << referenced_by;
10371 goto reply;
10372 }
10373
10374 set<int> osds;
10375 newcrush.get_devices_by_class(device_class, &osds);
10376 for (auto& p: osds) {
1e59de90 10377 err = newcrush.remove_device_class(cct, p, &ss);
11fdf7f2
TL
10378 if (err < 0) {
10379 // ss has reason for failure
10380 goto reply;
10381 }
10382 }
10383
10384 if (osds.empty()) {
10385 // empty class, remove directly
10386 err = newcrush.remove_class_name(device_class);
10387 if (err < 0) {
10388 ss << "class '" << device_class << "' cannot be removed '"
10389 << cpp_strerror(err) << "'";
10390 goto reply;
10391 }
10392 }
10393
10394 pending_inc.crush.clear();
f67539c2 10395 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11fdf7f2
TL
10396 ss << "removed class " << device_class << " with id " << class_id
10397 << " from crush map";
10398 goto update;
35e4c445
FG
10399 } else if (prefix == "osd crush class rename") {
10400 string srcname, dstname;
9f95a23c 10401 if (!cmd_getval(cmdmap, "srcname", srcname)) {
35e4c445
FG
10402 err = -EINVAL;
10403 goto reply;
10404 }
9f95a23c 10405 if (!cmd_getval(cmdmap, "dstname", dstname)) {
35e4c445
FG
10406 err = -EINVAL;
10407 goto reply;
10408 }
10409
20effc67 10410 CrushWrapper newcrush = _get_pending_crush();
181888fb
FG
10411 if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
10412 // suppose this is a replay and return success
10413 // so command is idempotent
10414 ss << "already renamed to '" << dstname << "'";
10415 err = 0;
35e4c445
FG
10416 goto reply;
10417 }
c07f9fc5 10418
35e4c445
FG
10419 err = newcrush.rename_class(srcname, dstname);
10420 if (err < 0) {
10421 ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
10422 << cpp_strerror(err);
10423 goto reply;
10424 }
10425
10426 pending_inc.crush.clear();
f67539c2 10427 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
35e4c445
FG
10428 ss << "rename class '" << srcname << "' to '" << dstname << "'";
10429 goto update;
7c673cae
FG
10430 } else if (prefix == "osd crush add-bucket") {
10431 // os crush add-bucket <name> <type>
10432 string name, typestr;
11fdf7f2 10433 vector<string> argvec;
9f95a23c
TL
10434 cmd_getval(cmdmap, "name", name);
10435 cmd_getval(cmdmap, "type", typestr);
10436 cmd_getval(cmdmap, "args", argvec);
11fdf7f2
TL
10437 map<string,string> loc;
10438 if (!argvec.empty()) {
10439 CrushWrapper::parse_loc_map(argvec, &loc);
10440 dout(0) << "will create and move bucket '" << name
10441 << "' to location " << loc << dendl;
10442 }
7c673cae
FG
10443
10444 if (!_have_pending_crush() &&
10445 _get_stable_crush().name_exists(name)) {
10446 ss << "bucket '" << name << "' already exists";
10447 goto reply;
10448 }
10449
20effc67 10450 CrushWrapper newcrush = _get_pending_crush();
7c673cae
FG
10451
10452 if (newcrush.name_exists(name)) {
10453 ss << "bucket '" << name << "' already exists";
10454 goto update;
10455 }
10456 int type = newcrush.get_type_id(typestr);
10457 if (type < 0) {
10458 ss << "type '" << typestr << "' does not exist";
10459 err = -EINVAL;
10460 goto reply;
10461 }
10462 if (type == 0) {
10463 ss << "type '" << typestr << "' is for devices, not buckets";
10464 err = -EINVAL;
10465 goto reply;
10466 }
10467 int bucketno;
10468 err = newcrush.add_bucket(0, 0,
10469 CRUSH_HASH_DEFAULT, type, 0, NULL,
10470 NULL, &bucketno);
10471 if (err < 0) {
10472 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
10473 goto reply;
10474 }
10475 err = newcrush.set_item_name(bucketno, name);
10476 if (err < 0) {
10477 ss << "error setting bucket name to '" << name << "'";
10478 goto reply;
10479 }
10480
11fdf7f2
TL
10481 if (!loc.empty()) {
10482 if (!newcrush.check_item_loc(cct, bucketno, loc,
10483 (int *)NULL)) {
10484 err = newcrush.move_bucket(cct, bucketno, loc);
10485 if (err < 0) {
10486 ss << "error moving bucket '" << name << "' to location " << loc;
10487 goto reply;
10488 }
10489 } else {
10490 ss << "no need to move item id " << bucketno << " name '" << name
10491 << "' to location " << loc << " in crush map";
10492 }
10493 }
10494
7c673cae 10495 pending_inc.crush.clear();
f67539c2 10496 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11fdf7f2
TL
10497 if (loc.empty()) {
10498 ss << "added bucket " << name << " type " << typestr
10499 << " to crush map";
10500 } else {
10501 ss << "added bucket " << name << " type " << typestr
10502 << " to location " << loc;
10503 }
7c673cae
FG
10504 goto update;
10505 } else if (prefix == "osd crush rename-bucket") {
10506 string srcname, dstname;
9f95a23c
TL
10507 cmd_getval(cmdmap, "srcname", srcname);
10508 cmd_getval(cmdmap, "dstname", dstname);
7c673cae
FG
10509
10510 err = crush_rename_bucket(srcname, dstname, &ss);
10511 if (err == -EALREADY) // equivalent to success for idempotency
10512 err = 0;
10513 if (err)
10514 goto reply;
10515 else
10516 goto update;
c07f9fc5
FG
10517 } else if (prefix == "osd crush weight-set create" ||
10518 prefix == "osd crush weight-set create-compat") {
20effc67
TL
10519 if (_have_pending_crush()) {
10520 dout(10) << " first waiting for pending crush changes to commit" << dendl;
10521 goto wait;
10522 }
10523 CrushWrapper newcrush = _get_pending_crush();
c07f9fc5
FG
10524 int64_t pool;
10525 int positions;
10526 if (newcrush.has_non_straw2_buckets()) {
10527 ss << "crush map contains one or more bucket(s) that are not straw2";
224ce89b
WB
10528 err = -EPERM;
10529 goto reply;
10530 }
c07f9fc5 10531 if (prefix == "osd crush weight-set create") {
9f95a23c
TL
10532 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
10533 osdmap.require_min_compat_client < ceph_release_t::luminous) {
c07f9fc5 10534 ss << "require_min_compat_client "
9f95a23c 10535 << osdmap.require_min_compat_client
c07f9fc5
FG
10536 << " < luminous, which is required for per-pool weight-sets. "
10537 << "Try 'ceph osd set-require-min-compat-client luminous' "
10538 << "before using the new interface";
10539 err = -EPERM;
10540 goto reply;
10541 }
10542 string poolname, mode;
9f95a23c 10543 cmd_getval(cmdmap, "pool", poolname);
c07f9fc5
FG
10544 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10545 if (pool < 0) {
10546 ss << "pool '" << poolname << "' not found";
10547 err = -ENOENT;
10548 goto reply;
10549 }
9f95a23c 10550 cmd_getval(cmdmap, "mode", mode);
c07f9fc5
FG
10551 if (mode != "flat" && mode != "positional") {
10552 ss << "unrecognized weight-set mode '" << mode << "'";
10553 err = -EINVAL;
10554 goto reply;
10555 }
10556 positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
10557 } else {
10558 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10559 positions = 1;
224ce89b 10560 }
11fdf7f2
TL
10561 if (!newcrush.create_choose_args(pool, positions)) {
10562 if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
10563 ss << "compat weight-set already created";
10564 } else {
10565 ss << "weight-set for pool '" << osdmap.get_pool_name(pool)
10566 << "' already created";
10567 }
10568 goto reply;
10569 }
c07f9fc5 10570 pending_inc.crush.clear();
f67539c2 10571 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
c07f9fc5 10572 goto update;
224ce89b 10573
c07f9fc5
FG
10574 } else if (prefix == "osd crush weight-set rm" ||
10575 prefix == "osd crush weight-set rm-compat") {
20effc67 10576 CrushWrapper newcrush = _get_pending_crush();
c07f9fc5
FG
10577 int64_t pool;
10578 if (prefix == "osd crush weight-set rm") {
10579 string poolname;
9f95a23c 10580 cmd_getval(cmdmap, "pool", poolname);
c07f9fc5
FG
10581 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10582 if (pool < 0) {
10583 ss << "pool '" << poolname << "' not found";
10584 err = -ENOENT;
10585 goto reply;
10586 }
10587 } else {
10588 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
224ce89b 10589 }
c07f9fc5
FG
10590 newcrush.rm_choose_args(pool);
10591 pending_inc.crush.clear();
f67539c2 10592 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
c07f9fc5 10593 goto update;
224ce89b 10594
c07f9fc5
FG
10595 } else if (prefix == "osd crush weight-set reweight" ||
10596 prefix == "osd crush weight-set reweight-compat") {
10597 string poolname, item;
10598 vector<double> weight;
9f95a23c
TL
10599 cmd_getval(cmdmap, "pool", poolname);
10600 cmd_getval(cmdmap, "item", item);
10601 cmd_getval(cmdmap, "weight", weight);
20effc67 10602 CrushWrapper newcrush = _get_pending_crush();
c07f9fc5
FG
10603 int64_t pool;
10604 if (prefix == "osd crush weight-set reweight") {
10605 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10606 if (pool < 0) {
10607 ss << "pool '" << poolname << "' not found";
10608 err = -ENOENT;
10609 goto reply;
10610 }
10611 if (!newcrush.have_choose_args(pool)) {
10612 ss << "no weight-set for pool '" << poolname << "'";
10613 err = -ENOENT;
10614 goto reply;
10615 }
10616 auto arg_map = newcrush.choose_args_get(pool);
10617 int positions = newcrush.get_choose_args_positions(arg_map);
10618 if (weight.size() != (size_t)positions) {
10619 ss << "must specify exact " << positions << " weight values";
10620 err = -EINVAL;
10621 goto reply;
10622 }
10623 } else {
10624 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10625 if (!newcrush.have_choose_args(pool)) {
10626 ss << "no backward-compatible weight-set";
10627 err = -ENOENT;
10628 goto reply;
10629 }
224ce89b 10630 }
c07f9fc5
FG
10631 if (!newcrush.name_exists(item)) {
10632 ss << "item '" << item << "' does not exist";
10633 err = -ENOENT;
224ce89b
WB
10634 goto reply;
10635 }
c07f9fc5 10636 err = newcrush.choose_args_adjust_item_weightf(
11fdf7f2 10637 cct,
c07f9fc5
FG
10638 newcrush.choose_args_get(pool),
10639 newcrush.get_item_id(item),
10640 weight,
10641 &ss);
224ce89b 10642 if (err < 0) {
224ce89b
WB
10643 goto reply;
10644 }
c07f9fc5 10645 err = 0;
224ce89b 10646 pending_inc.crush.clear();
f67539c2 10647 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
224ce89b 10648 goto update;
7c673cae
FG
10649 } else if (osdid_present &&
10650 (prefix == "osd crush set" || prefix == "osd crush add")) {
10651 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
10652 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
10653 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
10654
10655 if (!osdmap.exists(osdid)) {
10656 err = -ENOENT;
11fdf7f2
TL
10657 ss << osd_name
10658 << " does not exist. Create it before updating the crush map";
7c673cae
FG
10659 goto reply;
10660 }
10661
10662 double weight;
9f95a23c 10663 if (!cmd_getval(cmdmap, "weight", weight)) {
7c673cae 10664 ss << "unable to parse weight value '"
11fdf7f2 10665 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
10666 err = -EINVAL;
10667 goto reply;
10668 }
10669
10670 string args;
10671 vector<string> argvec;
9f95a23c 10672 cmd_getval(cmdmap, "args", argvec);
7c673cae
FG
10673 map<string,string> loc;
10674 CrushWrapper::parse_loc_map(argvec, &loc);
10675
10676 if (prefix == "osd crush set"
10677 && !_get_stable_crush().item_exists(osdid)) {
10678 err = -ENOENT;
11fdf7f2 10679 ss << "unable to set item id " << osdid << " name '" << osd_name
7c673cae
FG
10680 << "' weight " << weight << " at location " << loc
10681 << ": does not exist";
10682 goto reply;
10683 }
10684
10685 dout(5) << "adding/updating crush item id " << osdid << " name '"
11fdf7f2 10686 << osd_name << "' weight " << weight << " at location "
7c673cae 10687 << loc << dendl;
20effc67 10688 CrushWrapper newcrush = _get_pending_crush();
7c673cae
FG
10689
10690 string action;
10691 if (prefix == "osd crush set" ||
11fdf7f2 10692 newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) {
7c673cae 10693 action = "set";
11fdf7f2 10694 err = newcrush.update_item(cct, osdid, weight, osd_name, loc);
7c673cae
FG
10695 } else {
10696 action = "add";
11fdf7f2 10697 err = newcrush.insert_item(cct, osdid, weight, osd_name, loc);
7c673cae
FG
10698 if (err == 0)
10699 err = 1;
10700 }
10701
10702 if (err < 0)
10703 goto reply;
10704
10705 if (err == 0 && !_have_pending_crush()) {
11fdf7f2
TL
10706 ss << action << " item id " << osdid << " name '" << osd_name
10707 << "' weight " << weight << " at location " << loc << ": no change";
7c673cae
FG
10708 goto reply;
10709 }
10710
10711 pending_inc.crush.clear();
f67539c2 10712 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11fdf7f2
TL
10713 ss << action << " item id " << osdid << " name '" << osd_name << "' weight "
10714 << weight << " at location " << loc << " to crush map";
7c673cae
FG
10715 getline(ss, rs);
10716 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10717 get_last_committed() + 1));
10718 return true;
10719
10720 } else if (prefix == "osd crush create-or-move") {
10721 do {
10722 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
10723 if (!osdmap.exists(osdid)) {
10724 err = -ENOENT;
11fdf7f2
TL
10725 ss << osd_name
10726 << " does not exist. create it before updating the crush map";
7c673cae
FG
10727 goto reply;
10728 }
10729
10730 double weight;
9f95a23c 10731 if (!cmd_getval(cmdmap, "weight", weight)) {
7c673cae 10732 ss << "unable to parse weight value '"
11fdf7f2 10733 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
10734 err = -EINVAL;
10735 goto reply;
10736 }
10737
10738 string args;
10739 vector<string> argvec;
9f95a23c 10740 cmd_getval(cmdmap, "args", argvec);
7c673cae
FG
10741 map<string,string> loc;
10742 CrushWrapper::parse_loc_map(argvec, &loc);
10743
11fdf7f2
TL
10744 dout(0) << "create-or-move crush item name '" << osd_name
10745 << "' initial_weight " << weight << " at location " << loc
10746 << dendl;
7c673cae 10747
20effc67 10748 CrushWrapper newcrush = _get_pending_crush();
7c673cae 10749
11fdf7f2
TL
10750 err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc,
10751 g_conf()->osd_crush_update_weight_set);
7c673cae 10752 if (err == 0) {
11fdf7f2
TL
10753 ss << "create-or-move updated item name '" << osd_name
10754 << "' weight " << weight
7c673cae
FG
10755 << " at location " << loc << " to crush map";
10756 break;
10757 }
10758 if (err > 0) {
10759 pending_inc.crush.clear();
f67539c2 10760 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11fdf7f2
TL
10761 ss << "create-or-move updating item name '" << osd_name
10762 << "' weight " << weight
7c673cae
FG
10763 << " at location " << loc << " to crush map";
10764 getline(ss, rs);
10765 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10766 get_last_committed() + 1));
10767 return true;
10768 }
10769 } while (false);
10770
10771 } else if (prefix == "osd crush move") {
10772 do {
10773 // osd crush move <name> <loc1> [<loc2> ...]
11fdf7f2 10774 string name;
7c673cae 10775 vector<string> argvec;
9f95a23c
TL
10776 cmd_getval(cmdmap, "name", name);
10777 cmd_getval(cmdmap, "args", argvec);
7c673cae
FG
10778 map<string,string> loc;
10779 CrushWrapper::parse_loc_map(argvec, &loc);
10780
10781 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
20effc67 10782 CrushWrapper newcrush = _get_pending_crush();
7c673cae
FG
10783
10784 if (!newcrush.name_exists(name)) {
10785 err = -ENOENT;
10786 ss << "item " << name << " does not exist";
10787 break;
10788 }
10789 int id = newcrush.get_item_id(name);
10790
11fdf7f2 10791 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
7c673cae 10792 if (id >= 0) {
11fdf7f2
TL
10793 err = newcrush.create_or_move_item(
10794 cct, id, 0, name, loc,
10795 g_conf()->osd_crush_update_weight_set);
7c673cae 10796 } else {
11fdf7f2 10797 err = newcrush.move_bucket(cct, id, loc);
7c673cae
FG
10798 }
10799 if (err >= 0) {
10800 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10801 pending_inc.crush.clear();
f67539c2 10802 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7c673cae
FG
10803 getline(ss, rs);
10804 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10805 get_last_committed() + 1));
10806 return true;
10807 }
10808 } else {
10809 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10810 err = 0;
10811 }
10812 } while (false);
31f18b77 10813 } else if (prefix == "osd crush swap-bucket") {
11fdf7f2 10814 string source, dest;
9f95a23c
TL
10815 cmd_getval(cmdmap, "source", source);
10816 cmd_getval(cmdmap, "dest", dest);
11fdf7f2
TL
10817
10818 bool force = false;
9f95a23c 10819 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
11fdf7f2 10820
20effc67 10821 CrushWrapper newcrush = _get_pending_crush();
31f18b77
FG
10822 if (!newcrush.name_exists(source)) {
10823 ss << "source item " << source << " does not exist";
10824 err = -ENOENT;
10825 goto reply;
10826 }
10827 if (!newcrush.name_exists(dest)) {
10828 ss << "dest item " << dest << " does not exist";
10829 err = -ENOENT;
10830 goto reply;
10831 }
10832 int sid = newcrush.get_item_id(source);
10833 int did = newcrush.get_item_id(dest);
10834 int sparent;
11fdf7f2 10835 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) {
31f18b77
FG
10836 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
10837 err = -EPERM;
10838 goto reply;
10839 }
10840 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
11fdf7f2 10841 !force) {
31f18b77
FG
10842 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
10843 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
10844 << "; pass --yes-i-really-mean-it to proceed anyway";
10845 err = -EPERM;
10846 goto reply;
10847 }
11fdf7f2 10848 int r = newcrush.swap_bucket(cct, sid, did);
31f18b77
FG
10849 if (r < 0) {
10850 ss << "failed to swap bucket contents: " << cpp_strerror(r);
224ce89b 10851 err = r;
31f18b77
FG
10852 goto reply;
10853 }
10854 ss << "swapped bucket of " << source << " to " << dest;
10855 pending_inc.crush.clear();
f67539c2 10856 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
31f18b77
FG
10857 wait_for_finished_proposal(op,
10858 new Monitor::C_Command(mon, op, err, ss.str(),
10859 get_last_committed() + 1));
10860 return true;
10861 } else if (prefix == "osd crush link") {
10862 // osd crush link <name> <loc1> [<loc2> ...]
10863 string name;
9f95a23c 10864 cmd_getval(cmdmap, "name", name);
31f18b77 10865 vector<string> argvec;
9f95a23c 10866 cmd_getval(cmdmap, "args", argvec);
31f18b77
FG
10867 map<string,string> loc;
10868 CrushWrapper::parse_loc_map(argvec, &loc);
10869
10870 // Need an explicit check for name_exists because get_item_id returns
10871 // 0 on unfound.
10872 int id = osdmap.crush->get_item_id(name);
7c673cae
FG
10873 if (!osdmap.crush->name_exists(name)) {
10874 err = -ENOENT;
10875 ss << "item " << name << " does not exist";
10876 goto reply;
10877 } else {
10878 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
10879 }
11fdf7f2 10880 if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) {
7c673cae
FG
10881 ss << "no need to move item id " << id << " name '" << name
10882 << "' to location " << loc << " in crush map";
10883 err = 0;
10884 goto reply;
10885 }
10886
10887 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
20effc67 10888 CrushWrapper newcrush = _get_pending_crush();
7c673cae
FG
10889
10890 if (!newcrush.name_exists(name)) {
10891 err = -ENOENT;
10892 ss << "item " << name << " does not exist";
10893 goto reply;
10894 } else {
10895 int id = newcrush.get_item_id(name);
11fdf7f2
TL
10896 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
10897 err = newcrush.link_bucket(cct, id, loc);
7c673cae
FG
10898 if (err >= 0) {
10899 ss << "linked item id " << id << " name '" << name
10900 << "' to location " << loc << " in crush map";
10901 pending_inc.crush.clear();
f67539c2 10902 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7c673cae
FG
10903 } else {
10904 ss << "cannot link item id " << id << " name '" << name
10905 << "' to location " << loc;
10906 goto reply;
10907 }
10908 } else {
10909 ss << "no need to move item id " << id << " name '" << name
10910 << "' to location " << loc << " in crush map";
10911 err = 0;
10912 }
10913 }
10914 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
10915 get_last_committed() + 1));
10916 return true;
10917 } else if (prefix == "osd crush rm" ||
10918 prefix == "osd crush remove" ||
10919 prefix == "osd crush unlink") {
10920 do {
10921 // osd crush rm <id> [ancestor]
20effc67 10922 CrushWrapper newcrush = _get_pending_crush();
7c673cae
FG
10923
10924 string name;
9f95a23c 10925 cmd_getval(cmdmap, "name", name);
7c673cae
FG
10926
10927 if (!osdmap.crush->name_exists(name)) {
10928 err = 0;
10929 ss << "device '" << name << "' does not appear in the crush map";
10930 break;
10931 }
10932 if (!newcrush.name_exists(name)) {
10933 err = 0;
10934 ss << "device '" << name << "' does not appear in the crush map";
10935 getline(ss, rs);
10936 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10937 get_last_committed() + 1));
10938 return true;
10939 }
10940 int id = newcrush.get_item_id(name);
31f18b77
FG
10941 int ancestor = 0;
10942
7c673cae
FG
10943 bool unlink_only = prefix == "osd crush unlink";
10944 string ancestor_str;
9f95a23c 10945 if (cmd_getval(cmdmap, "ancestor", ancestor_str)) {
7c673cae
FG
10946 if (!newcrush.name_exists(ancestor_str)) {
10947 err = -ENOENT;
10948 ss << "ancestor item '" << ancestor_str
10949 << "' does not appear in the crush map";
10950 break;
10951 }
31f18b77 10952 ancestor = newcrush.get_item_id(ancestor_str);
7c673cae 10953 }
31f18b77
FG
10954
10955 err = prepare_command_osd_crush_remove(
10956 newcrush,
10957 id, ancestor,
10958 (ancestor < 0), unlink_only);
10959
7c673cae
FG
10960 if (err == -ENOENT) {
10961 ss << "item " << id << " does not appear in that position";
10962 err = 0;
10963 break;
10964 }
10965 if (err == 0) {
81eedcae
TL
10966 if (!unlink_only)
10967 pending_inc.new_crush_node_flags[id] = 0;
7c673cae
FG
10968 ss << "removed item id " << id << " name '" << name << "' from crush map";
10969 getline(ss, rs);
10970 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10971 get_last_committed() + 1));
10972 return true;
10973 }
10974 } while (false);
10975
10976 } else if (prefix == "osd crush reweight-all") {
20effc67 10977 CrushWrapper newcrush = _get_pending_crush();
7c673cae 10978
11fdf7f2 10979 newcrush.reweight(cct);
7c673cae 10980 pending_inc.crush.clear();
f67539c2 10981 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7c673cae
FG
10982 ss << "reweighted crush hierarchy";
10983 getline(ss, rs);
10984 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10985 get_last_committed() + 1));
10986 return true;
10987 } else if (prefix == "osd crush reweight") {
10988 // osd crush reweight <name> <weight>
20effc67 10989 CrushWrapper newcrush = _get_pending_crush();
7c673cae
FG
10990
10991 string name;
9f95a23c 10992 cmd_getval(cmdmap, "name", name);
7c673cae
FG
10993 if (!newcrush.name_exists(name)) {
10994 err = -ENOENT;
10995 ss << "device '" << name << "' does not appear in the crush map";
10996 goto reply;
10997 }
10998
10999 int id = newcrush.get_item_id(name);
11000 if (id < 0) {
11001 ss << "device '" << name << "' is not a leaf in the crush map";
11002 err = -EINVAL;
11003 goto reply;
11004 }
11005 double w;
9f95a23c 11006 if (!cmd_getval(cmdmap, "weight", w)) {
7c673cae 11007 ss << "unable to parse weight value '"
11fdf7f2 11008 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
11009 err = -EINVAL;
11010 goto reply;
11011 }
11012
11fdf7f2
TL
11013 err = newcrush.adjust_item_weightf(cct, id, w,
11014 g_conf()->osd_crush_update_weight_set);
7c673cae
FG
11015 if (err < 0)
11016 goto reply;
11017 pending_inc.crush.clear();
f67539c2 11018 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7c673cae
FG
11019 ss << "reweighted item id " << id << " name '" << name << "' to " << w
11020 << " in crush map";
11021 getline(ss, rs);
11022 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11023 get_last_committed() + 1));
11024 return true;
11025 } else if (prefix == "osd crush reweight-subtree") {
 11026 // osd crush reweight-subtree <name> <weight>
20effc67 11027 CrushWrapper newcrush = _get_pending_crush();
7c673cae
FG
11028
11029 string name;
9f95a23c 11030 cmd_getval(cmdmap, "name", name);
7c673cae
FG
11031 if (!newcrush.name_exists(name)) {
11032 err = -ENOENT;
11033 ss << "device '" << name << "' does not appear in the crush map";
11034 goto reply;
11035 }
11036
11037 int id = newcrush.get_item_id(name);
11038 if (id >= 0) {
11039 ss << "device '" << name << "' is not a subtree in the crush map";
11040 err = -EINVAL;
11041 goto reply;
11042 }
11043 double w;
9f95a23c 11044 if (!cmd_getval(cmdmap, "weight", w)) {
7c673cae 11045 ss << "unable to parse weight value '"
11fdf7f2 11046 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
11047 err = -EINVAL;
11048 goto reply;
11049 }
11050
11fdf7f2
TL
11051 err = newcrush.adjust_subtree_weightf(cct, id, w,
11052 g_conf()->osd_crush_update_weight_set);
7c673cae
FG
11053 if (err < 0)
11054 goto reply;
11055 pending_inc.crush.clear();
f67539c2 11056 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7c673cae
FG
11057 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
11058 << " in crush map";
11059 getline(ss, rs);
11060 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11061 get_last_committed() + 1));
11062 return true;
11063 } else if (prefix == "osd crush tunables") {
20effc67 11064 CrushWrapper newcrush = _get_pending_crush();
7c673cae
FG
11065
11066 err = 0;
11067 string profile;
9f95a23c 11068 cmd_getval(cmdmap, "profile", profile);
7c673cae
FG
11069 if (profile == "legacy" || profile == "argonaut") {
11070 newcrush.set_tunables_legacy();
11071 } else if (profile == "bobtail") {
11072 newcrush.set_tunables_bobtail();
11073 } else if (profile == "firefly") {
11074 newcrush.set_tunables_firefly();
11075 } else if (profile == "hammer") {
11076 newcrush.set_tunables_hammer();
11077 } else if (profile == "jewel") {
11078 newcrush.set_tunables_jewel();
11079 } else if (profile == "optimal") {
11080 newcrush.set_tunables_optimal();
11081 } else if (profile == "default") {
11082 newcrush.set_tunables_default();
11083 } else {
11084 ss << "unrecognized profile '" << profile << "'";
11085 err = -EINVAL;
11086 goto reply;
11087 }
11088
11089 if (!validate_crush_against_features(&newcrush, ss)) {
11090 err = -EINVAL;
11091 goto reply;
11092 }
11093
11094 pending_inc.crush.clear();
f67539c2 11095 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7c673cae
FG
11096 ss << "adjusted tunables profile to " << profile;
11097 getline(ss, rs);
11098 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11099 get_last_committed() + 1));
11100 return true;
11101 } else if (prefix == "osd crush set-tunable") {
20effc67 11102 CrushWrapper newcrush = _get_pending_crush();
7c673cae
FG
11103
11104 err = 0;
11105 string tunable;
9f95a23c 11106 cmd_getval(cmdmap, "tunable", tunable);
7c673cae
FG
11107
11108 int64_t value = -1;
9f95a23c 11109 if (!cmd_getval(cmdmap, "value", value)) {
7c673cae 11110 err = -EINVAL;
11fdf7f2
TL
11111 ss << "failed to parse integer value "
11112 << cmd_vartype_stringify(cmdmap.at("value"));
7c673cae
FG
11113 goto reply;
11114 }
11115
11116 if (tunable == "straw_calc_version") {
224ce89b 11117 if (value != 0 && value != 1) {
7c673cae
FG
11118 ss << "value must be 0 or 1; got " << value;
11119 err = -EINVAL;
11120 goto reply;
11121 }
11122 newcrush.set_straw_calc_version(value);
11123 } else {
11124 ss << "unrecognized tunable '" << tunable << "'";
11125 err = -EINVAL;
11126 goto reply;
11127 }
11128
11129 if (!validate_crush_against_features(&newcrush, ss)) {
11130 err = -EINVAL;
11131 goto reply;
11132 }
11133
11134 pending_inc.crush.clear();
f67539c2 11135 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7c673cae
FG
11136 ss << "adjusted tunable " << tunable << " to " << value;
11137 getline(ss, rs);
11138 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11139 get_last_committed() + 1));
11140 return true;
11141
11142 } else if (prefix == "osd crush rule create-simple") {
11143 string name, root, type, mode;
9f95a23c
TL
11144 cmd_getval(cmdmap, "name", name);
11145 cmd_getval(cmdmap, "root", root);
11146 cmd_getval(cmdmap, "type", type);
11147 cmd_getval(cmdmap, "mode", mode);
7c673cae
FG
11148 if (mode == "")
11149 mode = "firstn";
11150
11151 if (osdmap.crush->rule_exists(name)) {
31f18b77
FG
11152 // The name is uniquely associated to a ruleid and the rule it contains
 11153 // From the user point of view, the rule is more meaningful.
11154 ss << "rule " << name << " already exists";
7c673cae
FG
11155 err = 0;
11156 goto reply;
11157 }
11158
20effc67 11159 CrushWrapper newcrush = _get_pending_crush();
7c673cae
FG
11160
11161 if (newcrush.rule_exists(name)) {
31f18b77
FG
11162 // The name is uniquely associated to a ruleid and the rule it contains
 11163 // From the user point of view, the rule is more meaningful.
11164 ss << "rule " << name << " already exists";
7c673cae
FG
11165 err = 0;
11166 } else {
224ce89b 11167 int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
7c673cae
FG
11168 pg_pool_t::TYPE_REPLICATED, &ss);
11169 if (ruleno < 0) {
11170 err = ruleno;
11171 goto reply;
11172 }
11173
11174 pending_inc.crush.clear();
f67539c2 11175 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7c673cae
FG
11176 }
11177 getline(ss, rs);
11178 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11179 get_last_committed() + 1));
11180 return true;
11181
224ce89b
WB
11182 } else if (prefix == "osd crush rule create-replicated") {
11183 string name, root, type, device_class;
9f95a23c
TL
11184 cmd_getval(cmdmap, "name", name);
11185 cmd_getval(cmdmap, "root", root);
11186 cmd_getval(cmdmap, "type", type);
11187 cmd_getval(cmdmap, "class", device_class);
224ce89b
WB
11188
11189 if (osdmap.crush->rule_exists(name)) {
11190 // The name is uniquely associated to a ruleid and the rule it contains
 11191 // From the user point of view, the rule is more meaningful.
11192 ss << "rule " << name << " already exists";
11193 err = 0;
11194 goto reply;
11195 }
11196
20effc67 11197 CrushWrapper newcrush = _get_pending_crush();
224ce89b
WB
11198
11199 if (newcrush.rule_exists(name)) {
11200 // The name is uniquely associated to a ruleid and the rule it contains
 11201 // From the user point of view, the rule is more meaningful.
11202 ss << "rule " << name << " already exists";
11203 err = 0;
11204 } else {
11205 int ruleno = newcrush.add_simple_rule(
11206 name, root, type, device_class,
11207 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
11208 if (ruleno < 0) {
11209 err = ruleno;
11210 goto reply;
11211 }
11212
11213 pending_inc.crush.clear();
f67539c2 11214 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
224ce89b
WB
11215 }
11216 getline(ss, rs);
11217 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11218 get_last_committed() + 1));
11219 return true;
11220
7c673cae
FG
11221 } else if (prefix == "osd erasure-code-profile rm") {
11222 string name;
9f95a23c 11223 cmd_getval(cmdmap, "name", name);
7c673cae
FG
11224
11225 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
11226 goto wait;
11227
11228 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
11229 err = -EBUSY;
11230 goto reply;
11231 }
11232
11233 if (osdmap.has_erasure_code_profile(name) ||
11234 pending_inc.new_erasure_code_profiles.count(name)) {
11235 if (osdmap.has_erasure_code_profile(name)) {
11236 pending_inc.old_erasure_code_profiles.push_back(name);
11237 } else {
11238 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
11239 pending_inc.new_erasure_code_profiles.erase(name);
11240 }
11241
11242 getline(ss, rs);
11243 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11244 get_last_committed() + 1));
11245 return true;
11246 } else {
11247 ss << "erasure-code-profile " << name << " does not exist";
11248 err = 0;
11249 goto reply;
11250 }
11251
11252 } else if (prefix == "osd erasure-code-profile set") {
11253 string name;
9f95a23c 11254 cmd_getval(cmdmap, "name", name);
7c673cae 11255 vector<string> profile;
9f95a23c 11256 cmd_getval(cmdmap, "profile", profile);
11fdf7f2
TL
11257
11258 bool force = false;
9f95a23c 11259 cmd_getval(cmdmap, "force", force);
11fdf7f2 11260
7c673cae
FG
11261 map<string,string> profile_map;
11262 err = parse_erasure_code_profile(profile, &profile_map, &ss);
11263 if (err)
11264 goto reply;
adb31ebb
TL
11265 if (auto found = profile_map.find("crush-failure-domain");
11266 found != profile_map.end()) {
11267 const auto& failure_domain = found->second;
11268 int failure_domain_type = osdmap.crush->get_type_id(failure_domain);
11269 if (failure_domain_type < 0) {
11270 ss << "erasure-code-profile " << profile_map
11271 << " contains an invalid failure-domain " << std::quoted(failure_domain);
11272 err = -EINVAL;
11273 goto reply;
11274 }
11275 }
11276
7c673cae
FG
11277 if (profile_map.find("plugin") == profile_map.end()) {
11278 ss << "erasure-code-profile " << profile_map
11279 << " must contain a plugin entry" << std::endl;
11280 err = -EINVAL;
11281 goto reply;
11282 }
11283 string plugin = profile_map["plugin"];
11284
11285 if (pending_inc.has_erasure_code_profile(name)) {
11286 dout(20) << "erasure code profile " << name << " try again" << dendl;
11287 goto wait;
11288 } else {
7c673cae
FG
11289 err = normalize_profile(name, profile_map, force, &ss);
11290 if (err)
11291 goto reply;
11292
11293 if (osdmap.has_erasure_code_profile(name)) {
11294 ErasureCodeProfile existing_profile_map =
11295 osdmap.get_erasure_code_profile(name);
11296 err = normalize_profile(name, existing_profile_map, force, &ss);
11297 if (err)
11298 goto reply;
11299
11300 if (existing_profile_map == profile_map) {
11301 err = 0;
11302 goto reply;
11303 }
11304 if (!force) {
11305 err = -EPERM;
11306 ss << "will not override erasure code profile " << name
11307 << " because the existing profile "
11308 << existing_profile_map
11309 << " is different from the proposed profile "
11310 << profile_map;
11311 goto reply;
11312 }
11313 }
11314
11315 dout(20) << "erasure code profile set " << name << "="
11316 << profile_map << dendl;
11317 pending_inc.set_erasure_code_profile(name, profile_map);
11318 }
11319
11320 getline(ss, rs);
11321 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11322 get_last_committed() + 1));
11323 return true;
11324
11325 } else if (prefix == "osd crush rule create-erasure") {
11326 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
11327 if (err == -EAGAIN)
11328 goto wait;
11329 if (err)
11330 goto reply;
11331 string name, poolstr;
9f95a23c 11332 cmd_getval(cmdmap, "name", name);
7c673cae 11333 string profile;
9f95a23c 11334 cmd_getval(cmdmap, "profile", profile);
7c673cae
FG
11335 if (profile == "")
11336 profile = "default";
11337 if (profile == "default") {
11338 if (!osdmap.has_erasure_code_profile(profile)) {
11339 if (pending_inc.has_erasure_code_profile(profile)) {
11340 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
11341 goto wait;
11342 }
11343
11344 map<string,string> profile_map;
11fdf7f2 11345 err = osdmap.get_erasure_code_profile_default(cct,
7c673cae
FG
11346 profile_map,
11347 &ss);
11348 if (err)
11349 goto reply;
11350 err = normalize_profile(name, profile_map, true, &ss);
11351 if (err)
11352 goto reply;
11353 dout(20) << "erasure code profile set " << profile << "="
11354 << profile_map << dendl;
11355 pending_inc.set_erasure_code_profile(profile, profile_map);
11356 goto wait;
11357 }
11358 }
11359
31f18b77
FG
11360 int rule;
11361 err = crush_rule_create_erasure(name, profile, &rule, &ss);
7c673cae
FG
11362 if (err < 0) {
11363 switch(err) {
11364 case -EEXIST: // return immediately
11365 ss << "rule " << name << " already exists";
11366 err = 0;
11367 goto reply;
11368 break;
11369 case -EALREADY: // wait for pending to be proposed
11370 ss << "rule " << name << " already exists";
11371 err = 0;
11372 break;
11373 default: // non recoverable error
11374 goto reply;
11375 break;
11376 }
11377 } else {
31f18b77 11378 ss << "created rule " << name << " at " << rule;
7c673cae
FG
11379 }
11380
11381 getline(ss, rs);
11382 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11383 get_last_committed() + 1));
11384 return true;
11385
11386 } else if (prefix == "osd crush rule rm") {
11387 string name;
9f95a23c 11388 cmd_getval(cmdmap, "name", name);
7c673cae
FG
11389
11390 if (!osdmap.crush->rule_exists(name)) {
11391 ss << "rule " << name << " does not exist";
11392 err = 0;
11393 goto reply;
11394 }
11395
20effc67 11396 CrushWrapper newcrush = _get_pending_crush();
7c673cae
FG
11397
11398 if (!newcrush.rule_exists(name)) {
11399 ss << "rule " << name << " does not exist";
11400 err = 0;
11401 } else {
11402 int ruleno = newcrush.get_rule_id(name);
11fdf7f2 11403 ceph_assert(ruleno >= 0);
7c673cae
FG
11404
11405 // make sure it is not in use.
11406 // FIXME: this is ok in some situations, but let's not bother with that
11407 // complexity now.
20effc67
TL
11408 if (osdmap.crush_rule_in_use(ruleno)) {
11409 ss << "crush rule " << name << " (" << ruleno << ") is in use";
7c673cae
FG
11410 err = -EBUSY;
11411 goto reply;
11412 }
11413
11414 err = newcrush.remove_rule(ruleno);
11415 if (err < 0) {
11416 goto reply;
11417 }
11418
11419 pending_inc.crush.clear();
f67539c2 11420 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7c673cae
FG
11421 }
11422 getline(ss, rs);
11423 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11424 get_last_committed() + 1));
11425 return true;
11426
b5b8bbf5
FG
11427 } else if (prefix == "osd crush rule rename") {
11428 string srcname;
11429 string dstname;
9f95a23c
TL
11430 cmd_getval(cmdmap, "srcname", srcname);
11431 cmd_getval(cmdmap, "dstname", dstname);
b5b8bbf5
FG
11432 if (srcname.empty() || dstname.empty()) {
11433 ss << "must specify both source rule name and destination rule name";
11434 err = -EINVAL;
11435 goto reply;
11436 }
11437 if (srcname == dstname) {
11438 ss << "destination rule name is equal to source rule name";
11439 err = 0;
11440 goto reply;
11441 }
11442
20effc67 11443 CrushWrapper newcrush = _get_pending_crush();
181888fb
FG
11444 if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
11445 // srcname does not exist and dstname already exists
11446 // suppose this is a replay and return success
11447 // (so this command is idempotent)
11448 ss << "already renamed to '" << dstname << "'";
11449 err = 0;
11450 goto reply;
11451 }
11452
b5b8bbf5
FG
11453 err = newcrush.rename_rule(srcname, dstname, &ss);
11454 if (err < 0) {
11455 // ss has reason for failure
11456 goto reply;
11457 }
11458 pending_inc.crush.clear();
f67539c2 11459 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
b5b8bbf5
FG
11460 getline(ss, rs);
11461 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11462 get_last_committed() + 1));
11463 return true;
11464
7c673cae
FG
11465 } else if (prefix == "osd setmaxosd") {
11466 int64_t newmax;
9f95a23c 11467 if (!cmd_getval(cmdmap, "newmax", newmax)) {
7c673cae 11468 ss << "unable to parse 'newmax' value '"
11fdf7f2 11469 << cmd_vartype_stringify(cmdmap.at("newmax")) << "'";
7c673cae
FG
11470 err = -EINVAL;
11471 goto reply;
11472 }
11473
11fdf7f2 11474 if (newmax > g_conf()->mon_max_osd) {
7c673cae
FG
11475 err = -ERANGE;
11476 ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
11fdf7f2 11477 << g_conf()->mon_max_osd << ")";
7c673cae
FG
11478 goto reply;
11479 }
11480
11481 // Don't allow shrinking OSD number as this will cause data loss
11482 // and may cause kernel crashes.
11483 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
11484 if (newmax < osdmap.get_max_osd()) {
11485 // Check if the OSDs exist between current max and new value.
11486 // If there are any OSDs exist, then don't allow shrinking number
11487 // of OSDs.
11488 for (int i = newmax; i < osdmap.get_max_osd(); i++) {
11489 if (osdmap.exists(i)) {
11490 err = -EBUSY;
11491 ss << "cannot shrink max_osd to " << newmax
11492 << " because osd." << i << " (and possibly others) still in use";
11493 goto reply;
11494 }
11495 }
11496 }
11497
11498 pending_inc.new_max_osd = newmax;
11499 ss << "set new max_osd = " << pending_inc.new_max_osd;
11500 getline(ss, rs);
11501 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11502 get_last_committed() + 1));
11503 return true;
11504
11505 } else if (prefix == "osd set-full-ratio" ||
11506 prefix == "osd set-backfillfull-ratio" ||
11507 prefix == "osd set-nearfull-ratio") {
7c673cae 11508 double n;
9f95a23c 11509 if (!cmd_getval(cmdmap, "ratio", n)) {
7c673cae 11510 ss << "unable to parse 'ratio' value '"
11fdf7f2 11511 << cmd_vartype_stringify(cmdmap.at("ratio")) << "'";
7c673cae
FG
11512 err = -EINVAL;
11513 goto reply;
11514 }
11515 if (prefix == "osd set-full-ratio")
11516 pending_inc.new_full_ratio = n;
11517 else if (prefix == "osd set-backfillfull-ratio")
11518 pending_inc.new_backfillfull_ratio = n;
11519 else if (prefix == "osd set-nearfull-ratio")
11520 pending_inc.new_nearfull_ratio = n;
11521 ss << prefix << " " << n;
11522 getline(ss, rs);
11523 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11524 get_last_committed() + 1));
11525 return true;
11526 } else if (prefix == "osd set-require-min-compat-client") {
7c673cae 11527 string v;
9f95a23c
TL
11528 cmd_getval(cmdmap, "version", v);
11529 ceph_release_t vno = ceph_release_from_name(v);
11530 if (!vno) {
7c673cae
FG
11531 ss << "version " << v << " is not recognized";
11532 err = -EINVAL;
11533 goto reply;
11534 }
11535 OSDMap newmap;
11536 newmap.deepish_copy_from(osdmap);
11537 newmap.apply_incremental(pending_inc);
31f18b77
FG
11538 newmap.require_min_compat_client = vno;
11539 auto mvno = newmap.get_min_compat_client();
11540 if (vno < mvno) {
9f95a23c
TL
11541 ss << "osdmap current utilizes features that require " << mvno
11542 << "; cannot set require_min_compat_client below that to " << vno;
7c673cae
FG
11543 err = -EPERM;
11544 goto reply;
11545 }
11fdf7f2 11546 bool sure = false;
9f95a23c 11547 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11fdf7f2 11548 if (!sure) {
31f18b77 11549 FeatureMap m;
f67539c2
TL
11550 mon.get_combined_feature_map(&m);
11551 uint64_t features = ceph_release_features(to_integer<int>(vno));
31f18b77
FG
11552 bool first = true;
11553 bool ok = true;
11554 for (int type : {
11555 CEPH_ENTITY_TYPE_CLIENT,
11556 CEPH_ENTITY_TYPE_MDS,
11557 CEPH_ENTITY_TYPE_MGR }) {
11558 auto p = m.m.find(type);
11559 if (p == m.m.end()) {
11560 continue;
11561 }
11562 for (auto& q : p->second) {
11563 uint64_t missing = ~q.first & features;
11564 if (missing) {
11565 if (first) {
11566 ss << "cannot set require_min_compat_client to " << v << ": ";
11567 } else {
11568 ss << "; ";
11569 }
11570 first = false;
11571 ss << q.second << " connected " << ceph_entity_type_name(type)
11572 << "(s) look like " << ceph_release_name(
11573 ceph_release_from_features(q.first))
11574 << " (missing 0x" << std::hex << missing << std::dec << ")";
11575 ok = false;
11576 }
11577 }
11578 }
11579 if (!ok) {
11580 ss << "; add --yes-i-really-mean-it to do it anyway";
11581 err = -EPERM;
11582 goto reply;
11583 }
11584 }
9f95a23c 11585 ss << "set require_min_compat_client to " << vno;
31f18b77 11586 pending_inc.new_require_min_compat_client = vno;
7c673cae
FG
11587 getline(ss, rs);
11588 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11589 get_last_committed() + 1));
11590 return true;
11591 } else if (prefix == "osd pause") {
11592 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11593
11594 } else if (prefix == "osd unpause") {
11595 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11596
11597 } else if (prefix == "osd set") {
11fdf7f2 11598 bool sure = false;
9f95a23c 11599 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11fdf7f2 11600
7c673cae 11601 string key;
9f95a23c
TL
11602 cmd_getval(cmdmap, "key", key);
11603 if (key == "pause")
7c673cae
FG
11604 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11605 else if (key == "noup")
11606 return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
11607 else if (key == "nodown")
11608 return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
11609 else if (key == "noout")
11610 return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
11611 else if (key == "noin")
11612 return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
11613 else if (key == "nobackfill")
11614 return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
11615 else if (key == "norebalance")
11616 return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
11617 else if (key == "norecover")
11618 return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
11619 else if (key == "noscrub")
11620 return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
11621 else if (key == "nodeep-scrub")
11622 return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11623 else if (key == "notieragent")
11624 return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11fdf7f2
TL
11625 else if (key == "nosnaptrim")
11626 return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
11627 else if (key == "pglog_hardlimit") {
11628 if (!osdmap.get_num_up_osds() && !sure) {
f64942e4
AA
11629 ss << "Not advisable to continue since no OSDs are up. Pass "
11630 << "--yes-i-really-mean-it if you really wish to continue.";
11631 err = -EPERM;
11632 goto reply;
11633 }
11634 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
11635 // we are reusing a jewel feature bit that was retired in luminous.
9f95a23c 11636 if (osdmap.require_osd_release >= ceph_release_t::luminous &&
f64942e4 11637 (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
11fdf7f2 11638 || sure)) {
f64942e4
AA
11639 return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
11640 } else {
11641 ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
11642 err = -EPERM;
11643 goto reply;
11644 }
7c673cae
FG
11645 } else {
11646 ss << "unrecognized flag '" << key << "'";
11647 err = -EINVAL;
11648 }
11649
11650 } else if (prefix == "osd unset") {
11651 string key;
9f95a23c
TL
11652 cmd_getval(cmdmap, "key", key);
11653 if (key == "pause")
7c673cae
FG
11654 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11655 else if (key == "noup")
11656 return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
11657 else if (key == "nodown")
11658 return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
11659 else if (key == "noout")
11660 return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
11661 else if (key == "noin")
11662 return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
11663 else if (key == "nobackfill")
11664 return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
11665 else if (key == "norebalance")
11666 return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
11667 else if (key == "norecover")
11668 return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
11669 else if (key == "noscrub")
11670 return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
11671 else if (key == "nodeep-scrub")
11672 return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11673 else if (key == "notieragent")
11674 return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11fdf7f2
TL
11675 else if (key == "nosnaptrim")
11676 return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
224ce89b 11677 else {
7c673cae
FG
11678 ss << "unrecognized flag '" << key << "'";
11679 err = -EINVAL;
11680 }
11681
31f18b77
FG
11682 } else if (prefix == "osd require-osd-release") {
11683 string release;
9f95a23c 11684 cmd_getval(cmdmap, "release", release);
11fdf7f2 11685 bool sure = false;
9f95a23c
TL
11686 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11687 ceph_release_t rel = ceph_release_from_name(release.c_str());
11688 if (!rel) {
31f18b77
FG
11689 ss << "unrecognized release " << release;
11690 err = -EINVAL;
11691 goto reply;
11692 }
d2e6a577
FG
11693 if (rel == osdmap.require_osd_release) {
11694 // idempotent
11695 err = 0;
11696 goto reply;
11697 }
1e59de90
TL
11698 if (osdmap.require_osd_release < ceph_release_t::pacific && !sure) {
11699 ss << "Not advisable to continue since current 'require_osd_release' "
11700 << "refers to a very old Ceph release. Pass "
11701 << "--yes-i-really-mean-it if you really wish to continue.";
11702 err = -EPERM;
11703 goto reply;
11704 }
11fdf7f2
TL
11705 if (!osdmap.get_num_up_osds() && !sure) {
11706 ss << "Not advisable to continue since no OSDs are up. Pass "
11707 << "--yes-i-really-mean-it if you really wish to continue.";
11708 err = -EPERM;
11709 goto reply;
11710 }
1e59de90 11711 if (rel == ceph_release_t::pacific) {
f67539c2
TL
11712 if (!mon.monmap->get_required_features().contains_all(
11713 ceph::features::mon::FEATURE_PACIFIC)) {
11714 ss << "not all mons are pacific";
11715 err = -EPERM;
11716 goto reply;
11717 }
11718 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_PACIFIC))
11719 && !sure) {
11720 ss << "not all up OSDs have CEPH_FEATURE_SERVER_PACIFIC feature";
11721 err = -EPERM;
11722 goto reply;
11723 }
20effc67
TL
11724 } else if (rel == ceph_release_t::quincy) {
11725 if (!mon.monmap->get_required_features().contains_all(
11726 ceph::features::mon::FEATURE_QUINCY)) {
11727 ss << "not all mons are quincy";
11728 err = -EPERM;
11729 goto reply;
11730 }
11731 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_QUINCY))
11732 && !sure) {
11733 ss << "not all up OSDs have CEPH_FEATURE_SERVER_QUINCY feature";
11734 err = -EPERM;
11735 goto reply;
11736 }
1e59de90
TL
11737 } else if (rel == ceph_release_t::reef) {
11738 if (!mon.monmap->get_required_features().contains_all(
11739 ceph::features::mon::FEATURE_REEF)) {
11740 ss << "not all mons are reef";
11741 err = -EPERM;
11742 goto reply;
11743 }
11744 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_REEF))
11745 && !sure) {
11746 ss << "not all up OSDs have CEPH_FEATURE_SERVER_REEF feature";
11747 err = -EPERM;
11748 goto reply;
11749 }
31f18b77 11750 } else {
20effc67 11751 ss << "not supported for this release";
31f18b77
FG
11752 err = -EPERM;
11753 goto reply;
11754 }
11755 if (rel < osdmap.require_osd_release) {
11756 ss << "require_osd_release cannot be lowered once it has been set";
11757 err = -EPERM;
11758 goto reply;
11759 }
11760 pending_inc.new_require_osd_release = rel;
11761 goto update;
7c673cae 11762 } else if (prefix == "osd down" ||
9f95a23c
TL
11763 prefix == "osd out" ||
11764 prefix == "osd in" ||
11765 prefix == "osd rm" ||
11766 prefix == "osd stop") {
7c673cae
FG
11767
11768 bool any = false;
31f18b77
FG
11769 bool stop = false;
11770 bool verbose = true;
9f95a23c 11771 bool definitely_dead = false;
7c673cae
FG
11772
11773 vector<string> idvec;
9f95a23c
TL
11774 cmd_getval(cmdmap, "ids", idvec);
11775 cmd_getval(cmdmap, "definitely_dead", definitely_dead);
11776 derr << "definitely_dead " << (int)definitely_dead << dendl;
31f18b77
FG
11777 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
11778 set<int> osds;
11779
11780 // wildcard?
11781 if (j == 0 &&
11782 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
11783 if (prefix == "osd in") {
11784 // touch out osds only
81eedcae 11785 osdmap.get_out_existing_osds(osds);
31f18b77
FG
11786 } else {
11787 osdmap.get_all_osds(osds);
11788 }
11789 stop = true;
11790 verbose = false; // so the output is less noisy.
11791 } else {
11792 long osd = parse_osd_id(idvec[j].c_str(), &ss);
11793 if (osd < 0) {
11794 ss << "invalid osd id" << osd;
11795 err = -EINVAL;
11796 continue;
11797 } else if (!osdmap.exists(osd)) {
11798 ss << "osd." << osd << " does not exist. ";
11799 continue;
11800 }
11801
11802 osds.insert(osd);
7c673cae 11803 }
31f18b77
FG
11804
11805 for (auto &osd : osds) {
11806 if (prefix == "osd down") {
11807 if (osdmap.is_down(osd)) {
11808 if (verbose)
11809 ss << "osd." << osd << " is already down. ";
11810 } else {
11811 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
11812 ss << "marked down osd." << osd << ". ";
11813 any = true;
11814 }
9f95a23c
TL
11815 if (definitely_dead) {
11816 if (!pending_inc.new_xinfo.count(osd)) {
11817 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11818 }
11819 if (pending_inc.new_xinfo[osd].dead_epoch < pending_inc.epoch) {
11820 any = true;
11821 }
11822 pending_inc.new_xinfo[osd].dead_epoch = pending_inc.epoch;
11823 }
31f18b77
FG
11824 } else if (prefix == "osd out") {
11825 if (osdmap.is_out(osd)) {
11826 if (verbose)
11827 ss << "osd." << osd << " is already out. ";
11828 } else {
11829 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
11830 if (osdmap.osd_weight[osd]) {
11831 if (pending_inc.new_xinfo.count(osd) == 0) {
11832 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11833 }
11834 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
7c673cae 11835 }
31f18b77 11836 ss << "marked out osd." << osd << ". ";
224ce89b
WB
11837 std::ostringstream msg;
11838 msg << "Client " << op->get_session()->entity_name
11839 << " marked osd." << osd << " out";
11840 if (osdmap.is_up(osd)) {
11841 msg << ", while it was still marked up";
11842 } else {
3efd9988
FG
11843 auto period = ceph_clock_now() - down_pending_out[osd];
11844 msg << ", after it was down for " << int(period.sec())
224ce89b
WB
11845 << " seconds";
11846 }
11847
f67539c2 11848 mon.clog->info() << msg.str();
31f18b77 11849 any = true;
7c673cae 11850 }
31f18b77
FG
11851 } else if (prefix == "osd in") {
11852 if (osdmap.is_in(osd)) {
11853 if (verbose)
11854 ss << "osd." << osd << " is already in. ";
11855 } else {
11856 if (osdmap.osd_xinfo[osd].old_weight > 0) {
11857 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
11858 if (pending_inc.new_xinfo.count(osd) == 0) {
11859 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11860 }
11861 pending_inc.new_xinfo[osd].old_weight = 0;
11862 } else {
11863 pending_inc.new_weight[osd] = CEPH_OSD_IN;
7c673cae 11864 }
31f18b77
FG
11865 ss << "marked in osd." << osd << ". ";
11866 any = true;
11867 }
11868 } else if (prefix == "osd rm") {
11869 err = prepare_command_osd_remove(osd);
11870
11871 if (err == -EBUSY) {
11872 if (any)
11873 ss << ", ";
11874 ss << "osd." << osd << " is still up; must be down before removal. ";
7c673cae 11875 } else {
11fdf7f2 11876 ceph_assert(err == 0);
31f18b77
FG
11877 if (any) {
11878 ss << ", osd." << osd;
11879 } else {
11880 ss << "removed osd." << osd;
11881 }
11882 any = true;
7c673cae 11883 }
9f95a23c
TL
11884 } else if (prefix == "osd stop") {
11885 if (osdmap.is_stop(osd)) {
11886 if (verbose)
11887 ss << "osd." << osd << " is already stopped. ";
11888 } else if (osdmap.is_down(osd)) {
11889 pending_inc.pending_osd_state_set(osd, CEPH_OSD_STOP);
11890 ss << "stop down osd." << osd << ". ";
11891 any = true;
11892 } else {
11893 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP | CEPH_OSD_STOP);
11894 ss << "stop osd." << osd << ". ";
11895 any = true;
11896 }
31f18b77
FG
11897 }
11898 }
11899 }
11900 if (any) {
11901 getline(ss, rs);
11902 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11903 get_last_committed() + 1));
11904 return true;
11905 }
81eedcae
TL
11906 } else if (prefix == "osd set-group" ||
11907 prefix == "osd unset-group" ||
11908 prefix == "osd add-noup" ||
31f18b77
FG
11909 prefix == "osd add-nodown" ||
11910 prefix == "osd add-noin" ||
81eedcae
TL
11911 prefix == "osd add-noout" ||
11912 prefix == "osd rm-noup" ||
11913 prefix == "osd rm-nodown" ||
11914 prefix == "osd rm-noin" ||
11915 prefix == "osd rm-noout") {
11916 bool do_set = prefix == "osd set-group" ||
11917 prefix.find("add") != string::npos;
11918 string flag_str;
11919 unsigned flags = 0;
11920 vector<string> who;
11921 if (prefix == "osd set-group" || prefix == "osd unset-group") {
9f95a23c
TL
11922 cmd_getval(cmdmap, "flags", flag_str);
11923 cmd_getval(cmdmap, "who", who);
81eedcae
TL
11924 vector<string> raw_flags;
11925 boost::split(raw_flags, flag_str, boost::is_any_of(","));
11926 for (auto& f : raw_flags) {
11927 if (f == "noup")
11928 flags |= CEPH_OSD_NOUP;
11929 else if (f == "nodown")
11930 flags |= CEPH_OSD_NODOWN;
11931 else if (f == "noin")
11932 flags |= CEPH_OSD_NOIN;
11933 else if (f == "noout")
11934 flags |= CEPH_OSD_NOOUT;
11935 else {
11936 ss << "unrecognized flag '" << f << "', must be one of "
11937 << "{noup,nodown,noin,noout}";
11938 err = -EINVAL;
11939 goto reply;
11940 }
11941 }
31f18b77 11942 } else {
9f95a23c 11943 cmd_getval(cmdmap, "ids", who);
81eedcae
TL
11944 if (prefix.find("noup") != string::npos)
11945 flags = CEPH_OSD_NOUP;
11946 else if (prefix.find("nodown") != string::npos)
11947 flags = CEPH_OSD_NODOWN;
11948 else if (prefix.find("noin") != string::npos)
11949 flags = CEPH_OSD_NOIN;
11950 else if (prefix.find("noout") != string::npos)
11951 flags = CEPH_OSD_NOOUT;
11952 else
11953 ceph_assert(0 == "Unreachable!");
31f18b77 11954 }
81eedcae
TL
11955 if (flags == 0) {
11956 ss << "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11957 err = -EINVAL;
11958 goto reply;
11959 }
11960 if (who.empty()) {
11961 ss << "must specify at least one or more targets to set/unset";
11962 err = -EINVAL;
11963 goto reply;
11964 }
11965 set<int> osds;
11966 set<int> crush_nodes;
11967 set<int> device_classes;
11968 for (auto& w : who) {
11969 if (w == "any" || w == "all" || w == "*") {
31f18b77 11970 osdmap.get_all_osds(osds);
81eedcae 11971 break;
31f18b77 11972 }
81eedcae
TL
11973 std::stringstream ts;
11974 if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) {
11975 osds.insert(osd);
11976 } else if (osdmap.crush->name_exists(w)) {
11977 crush_nodes.insert(osdmap.crush->get_item_id(w));
11978 } else if (osdmap.crush->class_exists(w)) {
11979 device_classes.insert(osdmap.crush->get_class_id(w));
11980 } else {
11981 ss << "unable to parse osd id or crush node or device class: "
11982 << "\"" << w << "\". ";
7c673cae
FG
11983 }
11984 }
81eedcae
TL
11985 if (osds.empty() && crush_nodes.empty() && device_classes.empty()) {
11986 // ss has reason for failure
11987 err = -EINVAL;
11988 goto reply;
31f18b77 11989 }
31f18b77 11990 bool any = false;
81eedcae
TL
11991 for (auto osd : osds) {
11992 if (!osdmap.exists(osd)) {
11993 ss << "osd." << osd << " does not exist. ";
11994 continue;
11995 }
11996 if (do_set) {
11997 if (flags & CEPH_OSD_NOUP) {
11998 any |= osdmap.is_noup_by_osd(osd) ?
11999 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) :
12000 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
31f18b77 12001 }
81eedcae
TL
12002 if (flags & CEPH_OSD_NODOWN) {
12003 any |= osdmap.is_nodown_by_osd(osd) ?
12004 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) :
12005 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
12006 }
12007 if (flags & CEPH_OSD_NOIN) {
12008 any |= osdmap.is_noin_by_osd(osd) ?
12009 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) :
12010 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
12011 }
12012 if (flags & CEPH_OSD_NOOUT) {
12013 any |= osdmap.is_noout_by_osd(osd) ?
12014 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) :
12015 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
31f18b77 12016 }
31f18b77 12017 } else {
81eedcae
TL
12018 if (flags & CEPH_OSD_NOUP) {
12019 any |= osdmap.is_noup_by_osd(osd) ?
12020 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) :
12021 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP);
31f18b77 12022 }
81eedcae
TL
12023 if (flags & CEPH_OSD_NODOWN) {
12024 any |= osdmap.is_nodown_by_osd(osd) ?
12025 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) :
12026 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
31f18b77 12027 }
81eedcae
TL
12028 if (flags & CEPH_OSD_NOIN) {
12029 any |= osdmap.is_noin_by_osd(osd) ?
12030 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) :
12031 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN);
12032 }
12033 if (flags & CEPH_OSD_NOOUT) {
12034 any |= osdmap.is_noout_by_osd(osd) ?
12035 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) :
12036 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
31f18b77
FG
12037 }
12038 }
12039 }
81eedcae
TL
12040 for (auto& id : crush_nodes) {
12041 auto old_flags = osdmap.get_crush_node_flags(id);
12042 auto& pending_flags = pending_inc.new_crush_node_flags[id];
12043 pending_flags |= old_flags; // adopt existing flags first!
12044 if (do_set) {
12045 pending_flags |= flags;
12046 } else {
12047 pending_flags &= ~flags;
12048 }
12049 any = true;
12050 }
12051 for (auto& id : device_classes) {
12052 auto old_flags = osdmap.get_device_class_flags(id);
12053 auto& pending_flags = pending_inc.new_device_class_flags[id];
12054 pending_flags |= old_flags;
12055 if (do_set) {
12056 pending_flags |= flags;
12057 } else {
12058 pending_flags &= ~flags;
12059 }
12060 any = true;
12061 }
31f18b77
FG
12062 if (any) {
12063 getline(ss, rs);
12064 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
12065 get_last_committed() + 1));
7c673cae
FG
12066 return true;
12067 }
12068 } else if (prefix == "osd pg-temp") {
7c673cae 12069 pg_t pgid;
1e59de90
TL
12070 err = parse_pgid(cmdmap, ss, pgid);
12071 if (err < 0)
7c673cae 12072 goto reply;
7c673cae
FG
12073 if (pending_inc.new_pg_temp.count(pgid)) {
12074 dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
12075 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12076 return true;
12077 }
12078
12079 vector<int64_t> id_vec;
12080 vector<int32_t> new_pg_temp;
9f95a23c 12081 cmd_getval(cmdmap, "id", id_vec);
11fdf7f2
TL
12082 if (id_vec.empty()) {
12083 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>();
12084 ss << "done cleaning up pg_temp of " << pgid;
12085 goto update;
7c673cae
FG
12086 }
12087 for (auto osd : id_vec) {
12088 if (!osdmap.exists(osd)) {
12089 ss << "osd." << osd << " does not exist";
12090 err = -ENOENT;
12091 goto reply;
12092 }
12093 new_pg_temp.push_back(osd);
12094 }
12095
224ce89b
WB
12096 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
12097 if ((int)new_pg_temp.size() < pool_min_size) {
12098 ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
12099 << pool_min_size << ")";
12100 err = -EINVAL;
12101 goto reply;
12102 }
12103
12104 int pool_size = osdmap.get_pg_pool_size(pgid);
12105 if ((int)new_pg_temp.size() > pool_size) {
12106 ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
12107 << pool_size << ")";
12108 err = -EINVAL;
12109 goto reply;
12110 }
12111
7c673cae
FG
12112 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
12113 new_pg_temp.begin(), new_pg_temp.end());
12114 ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
12115 goto update;
1e59de90
TL
12116 } else if (prefix == "osd primary-temp" ||
12117 prefix == "osd rm-primary-temp") {
7c673cae 12118 pg_t pgid;
1e59de90
TL
12119 err = parse_pgid(cmdmap, ss, pgid);
12120 if (err < 0)
7c673cae 12121 goto reply;
7c673cae
FG
12122
12123 int64_t osd;
1e59de90
TL
12124 if (prefix == "osd primary-temp") {
12125 if (!cmd_getval(cmdmap, "id", osd)) {
12126 ss << "unable to parse 'id' value '"
12127 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12128 err = -EINVAL;
12129 goto reply;
12130 }
12131 if (!osdmap.exists(osd)) {
12132 ss << "osd." << osd << " does not exist";
12133 err = -ENOENT;
12134 goto reply;
12135 }
7c673cae 12136 }
1e59de90
TL
12137 else if (prefix == "osd rm-primary-temp") {
12138 osd = -1;
12139 }
12140 else {
12141 ceph_assert(0 == "Unreachable!");
7c673cae
FG
12142 }
12143
9f95a23c
TL
12144 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
12145 osdmap.require_min_compat_client < ceph_release_t::firefly) {
31f18b77 12146 ss << "require_min_compat_client "
9f95a23c 12147 << osdmap.require_min_compat_client
7c673cae
FG
12148 << " < firefly, which is required for primary-temp";
12149 err = -EPERM;
12150 goto reply;
7c673cae
FG
12151 }
12152
12153 pending_inc.new_primary_temp[pgid] = osd;
12154 ss << "set " << pgid << " primary_temp mapping to " << osd;
12155 goto update;
11fdf7f2
TL
12156 } else if (prefix == "pg repeer") {
12157 pg_t pgid;
1e59de90
TL
12158 err = parse_pgid(cmdmap, ss, pgid);
12159 if (err < 0)
11fdf7f2 12160 goto reply;
11fdf7f2
TL
12161 vector<int> acting;
12162 int primary;
12163 osdmap.pg_to_acting_osds(pgid, &acting, &primary);
12164 if (primary < 0) {
12165 err = -EAGAIN;
12166 ss << "pg currently has no primary";
12167 goto reply;
12168 }
12169 if (acting.size() > 1) {
12170 // map to just primary; it will map back to what it wants
12171 pending_inc.new_pg_temp[pgid] = { primary };
12172 } else {
12173 // hmm, pick another arbitrary osd to induce a change. Note
12174 // that this won't work if there is only one suitable OSD in the cluster.
12175 int i;
12176 bool done = false;
12177 for (i = 0; i < osdmap.get_max_osd(); ++i) {
12178 if (i == primary || !osdmap.is_up(i) || !osdmap.exists(i)) {
12179 continue;
12180 }
12181 pending_inc.new_pg_temp[pgid] = { primary, i };
12182 done = true;
12183 break;
12184 }
12185 if (!done) {
12186 err = -EAGAIN;
12187 ss << "not enough up OSDs in the cluster to force repeer";
12188 goto reply;
12189 }
12190 }
12191 goto update;
224ce89b
WB
12192 } else if (prefix == "osd pg-upmap" ||
12193 prefix == "osd rm-pg-upmap" ||
12194 prefix == "osd pg-upmap-items" ||
1e59de90
TL
12195 prefix == "osd rm-pg-upmap-items" ||
12196 prefix == "osd pg-upmap-primary" ||
12197 prefix == "osd rm-pg-upmap-primary") {
12198 enum {
12199 OP_PG_UPMAP,
12200 OP_RM_PG_UPMAP,
12201 OP_PG_UPMAP_ITEMS,
12202 OP_RM_PG_UPMAP_ITEMS,
12203 OP_PG_UPMAP_PRIMARY,
12204 OP_RM_PG_UPMAP_PRIMARY,
12205 } upmap_option;
12206
12207 if (prefix == "osd pg-upmap") {
12208 upmap_option = OP_PG_UPMAP;
12209 } else if (prefix == "osd rm-pg-upmap") {
12210 upmap_option = OP_RM_PG_UPMAP;
12211 } else if (prefix == "osd pg-upmap-items") {
12212 upmap_option = OP_PG_UPMAP_ITEMS;
12213 } else if (prefix == "osd rm-pg-upmap-items") {
12214 upmap_option = OP_RM_PG_UPMAP_ITEMS;
12215 } else if (prefix == "osd pg-upmap-primary") {
12216 upmap_option = OP_PG_UPMAP_PRIMARY;
12217 } else if (prefix == "osd rm-pg-upmap-primary") {
12218 upmap_option = OP_RM_PG_UPMAP_PRIMARY;
12219 } else {
12220 ceph_abort_msg("invalid upmap option");
12221 }
12222
12223 ceph_release_t min_release = ceph_release_t::unknown;
12224 string feature_name = "unknown";
12225 switch (upmap_option) {
12226 case OP_PG_UPMAP: // fall through
12227 case OP_RM_PG_UPMAP: // fall through
12228 case OP_PG_UPMAP_ITEMS: // fall through
12229 case OP_RM_PG_UPMAP_ITEMS:
12230 min_release = ceph_release_t::luminous;
12231 feature_name = "pg-upmap";
12232 break;
12233
12234 case OP_PG_UPMAP_PRIMARY: // fall through
12235 case OP_RM_PG_UPMAP_PRIMARY:
12236 min_release = ceph_release_t::reef;
12237 feature_name = "pg-upmap-primary";
12238 break;
12239
12240 default:
12241 ceph_abort_msg("invalid upmap option");
12242 }
12243 uint64_t min_feature = CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
12244 string min_release_name = ceph_release_name(static_cast<int>(min_release));
12245
12246 if (osdmap.require_min_compat_client < min_release) {
31f18b77 12247 ss << "min_compat_client "
9f95a23c 12248 << osdmap.require_min_compat_client
1e59de90
TL
12249 << " < " << min_release_name << ", which is required for " << feature_name << ". "
12250 << "Try 'ceph osd set-require-min-compat-client " << min_release_name << "' "
224ce89b 12251 << "before using the new interface";
7c673cae
FG
12252 err = -EPERM;
12253 goto reply;
12254 }
1e59de90
TL
12255
12256 //TODO: Should I add feature and test for upmap-primary?
12257 err = check_cluster_features(min_feature, ss);
7c673cae
FG
12258 if (err == -EAGAIN)
12259 goto wait;
12260 if (err < 0)
12261 goto reply;
7c673cae 12262 pg_t pgid;
1e59de90
TL
12263 err = parse_pgid(cmdmap, ss, pgid);
12264 if (err < 0)
7c673cae 12265 goto reply;
94b18763
FG
12266 if (pending_inc.old_pools.count(pgid.pool())) {
12267 ss << "pool of " << pgid << " is pending removal";
12268 err = -ENOENT;
12269 getline(ss, rs);
12270 wait_for_finished_proposal(op,
12271 new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
12272 return true;
12273 }
224ce89b 12274
224ce89b 12275 // check pending upmap changes
1e59de90 12276 switch (upmap_option) {
224ce89b
WB
12277 case OP_PG_UPMAP: // fall through
12278 case OP_RM_PG_UPMAP:
12279 if (pending_inc.new_pg_upmap.count(pgid) ||
12280 pending_inc.old_pg_upmap.count(pgid)) {
12281 dout(10) << __func__ << " waiting for pending update on "
12282 << pgid << dendl;
12283 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12284 return true;
7c673cae 12285 }
224ce89b 12286 break;
7c673cae 12287
1e59de90
TL
12288 case OP_PG_UPMAP_PRIMARY: // fall through
12289 case OP_RM_PG_UPMAP_PRIMARY:
12290 {
12291 const pg_pool_t *pt = osdmap.get_pg_pool(pgid.pool());
12292 if (! pt->is_replicated()) {
12293 ss << "pg-upmap-primary is only supported for replicated pools";
12294 err = -EINVAL;
12295 goto reply;
12296 }
12297 }
12298 // fall through
12299 case OP_PG_UPMAP_ITEMS: // fall through
12300 case OP_RM_PG_UPMAP_ITEMS: // fall through
224ce89b
WB
12301 if (pending_inc.new_pg_upmap_items.count(pgid) ||
12302 pending_inc.old_pg_upmap_items.count(pgid)) {
12303 dout(10) << __func__ << " waiting for pending update on "
12304 << pgid << dendl;
12305 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12306 return true;
12307 }
12308 break;
7c673cae 12309
224ce89b 12310 default:
1e59de90 12311 ceph_abort_msg("invalid upmap option");
7c673cae 12312 }
224ce89b 12313
1e59de90 12314 switch (upmap_option) {
224ce89b
WB
12315 case OP_PG_UPMAP:
12316 {
12317 vector<int64_t> id_vec;
9f95a23c 12318 if (!cmd_getval(cmdmap, "id", id_vec)) {
224ce89b 12319 ss << "unable to parse 'id' value(s) '"
11fdf7f2 12320 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
224ce89b
WB
12321 err = -EINVAL;
12322 goto reply;
12323 }
12324
12325 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
12326 if ((int)id_vec.size() < pool_min_size) {
12327 ss << "num of osds (" << id_vec.size() <<") < pool min size ("
12328 << pool_min_size << ")";
12329 err = -EINVAL;
12330 goto reply;
12331 }
12332
12333 int pool_size = osdmap.get_pg_pool_size(pgid);
12334 if ((int)id_vec.size() > pool_size) {
12335 ss << "num of osds (" << id_vec.size() <<") > pool size ("
12336 << pool_size << ")";
12337 err = -EINVAL;
12338 goto reply;
12339 }
12340
12341 vector<int32_t> new_pg_upmap;
12342 for (auto osd : id_vec) {
12343 if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
12344 ss << "osd." << osd << " does not exist";
12345 err = -ENOENT;
12346 goto reply;
12347 }
12348 auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
12349 if (it != new_pg_upmap.end()) {
12350 ss << "osd." << osd << " already exists, ";
12351 continue;
12352 }
12353 new_pg_upmap.push_back(osd);
12354 }
12355
12356 if (new_pg_upmap.empty()) {
12357 ss << "no valid upmap items(pairs) is specified";
12358 err = -EINVAL;
12359 goto reply;
12360 }
12361
12362 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
12363 new_pg_upmap.begin(), new_pg_upmap.end());
12364 ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
7c673cae 12365 }
224ce89b
WB
12366 break;
12367
12368 case OP_RM_PG_UPMAP:
12369 {
12370 pending_inc.old_pg_upmap.insert(pgid);
12371 ss << "clear " << pgid << " pg_upmap mapping";
7c673cae 12372 }
224ce89b 12373 break;
7c673cae 12374
224ce89b
WB
12375 case OP_PG_UPMAP_ITEMS:
12376 {
12377 vector<int64_t> id_vec;
9f95a23c 12378 if (!cmd_getval(cmdmap, "id", id_vec)) {
224ce89b 12379 ss << "unable to parse 'id' value(s) '"
11fdf7f2 12380 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
224ce89b
WB
12381 err = -EINVAL;
12382 goto reply;
12383 }
12384
12385 if (id_vec.size() % 2) {
12386 ss << "you must specify pairs of osd ids to be remapped";
12387 err = -EINVAL;
12388 goto reply;
12389 }
12390
12391 int pool_size = osdmap.get_pg_pool_size(pgid);
12392 if ((int)(id_vec.size() / 2) > pool_size) {
12393 ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
12394 << pool_size << ")";
12395 err = -EINVAL;
12396 goto reply;
12397 }
12398
12399 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
12400 ostringstream items;
12401 items << "[";
12402 for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
12403 int from = *p++;
12404 int to = *p;
12405 if (from == to) {
12406 ss << "from osd." << from << " == to osd." << to << ", ";
12407 continue;
12408 }
12409 if (!osdmap.exists(from)) {
12410 ss << "osd." << from << " does not exist";
12411 err = -ENOENT;
12412 goto reply;
12413 }
12414 if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
12415 ss << "osd." << to << " does not exist";
12416 err = -ENOENT;
12417 goto reply;
12418 }
c07f9fc5
FG
12419 pair<int32_t,int32_t> entry = make_pair(from, to);
12420 auto it = std::find(new_pg_upmap_items.begin(),
12421 new_pg_upmap_items.end(), entry);
12422 if (it != new_pg_upmap_items.end()) {
12423 ss << "osd." << from << " -> osd." << to << " already exists, ";
12424 continue;
12425 }
12426 new_pg_upmap_items.push_back(entry);
224ce89b
WB
12427 items << from << "->" << to << ",";
12428 }
12429 string out(items.str());
12430 out.resize(out.size() - 1); // drop last ','
12431 out += "]";
12432
12433 if (new_pg_upmap_items.empty()) {
12434 ss << "no valid upmap items(pairs) is specified";
12435 err = -EINVAL;
12436 goto reply;
12437 }
12438
12439 pending_inc.new_pg_upmap_items[pgid] =
12440 mempool::osdmap::vector<pair<int32_t,int32_t>>(
12441 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
12442 ss << "set " << pgid << " pg_upmap_items mapping to " << out;
12443 }
12444 break;
12445
12446 case OP_RM_PG_UPMAP_ITEMS:
12447 {
12448 pending_inc.old_pg_upmap_items.insert(pgid);
12449 ss << "clear " << pgid << " pg_upmap_items mapping";
12450 }
12451 break;
12452
1e59de90
TL
12453 case OP_PG_UPMAP_PRIMARY:
12454 {
12455 int64_t id;
12456 if (!cmd_getval(cmdmap, "id", id)) {
12457 ss << "invalid osd id value '"
12458 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12459 err = -EINVAL;
12460 goto reply;
12461 }
12462 if (id != CRUSH_ITEM_NONE && !osdmap.exists(id)) {
12463 ss << "osd." << id << " does not exist";
12464 err = -ENOENT;
12465 goto reply;
12466 }
12467 vector<int> acting;
12468 int primary;
12469 osdmap.pg_to_acting_osds(pgid, &acting, &primary);
12470 if (id == primary) {
12471 ss << "osd." << id << " is already primary for pg " << pgid;
12472 err = -EINVAL;
12473 goto reply;
12474 }
12475 int found_idx = 0;
12476 for (int i = 1 ; i < (int)acting.size(); i++) { // skip 0 on purpose
12477 if (acting[i] == id) {
12478 found_idx = i;
12479 break;
12480 }
12481 }
12482 if (found_idx == 0) {
12483 ss << "osd." << id << " is not in acting set for pg " << pgid;
12484 err = -EINVAL;
12485 goto reply;
12486 }
12487 vector<int> new_acting(acting);
12488 new_acting[found_idx] = new_acting[0];
12489 new_acting[0] = id;
12490 int pool_size = osdmap.get_pg_pool_size(pgid);
12491 if (osdmap.crush->verify_upmap(cct, osdmap.get_pg_pool_crush_rule(pgid),
12492 pool_size, new_acting) >= 0) {
12493 ss << "change primary for pg " << pgid << " to osd." << id;
12494 }
12495 else {
12496 ss << "can't change primary for pg " << pgid << " to osd." << id
12497 << " - illegal pg after the change";
12498 err = -EINVAL;
12499 goto reply;
12500 }
12501 pending_inc.new_pg_upmap_primary[pgid] = id;
12502 //TO-REMOVE:
12503 ldout(cct, 20) << "pg " << pgid << ": set pg_upmap_primary to " << id << dendl;
12504 }
12505 break;
12506
12507 case OP_RM_PG_UPMAP_PRIMARY:
12508 {
12509 pending_inc.old_pg_upmap_primary.insert(pgid);
12510 ss << "clear " << pgid << " pg_upmap_primary mapping";
12511 }
12512 break;
12513
224ce89b 12514 default:
1e59de90 12515 ceph_abort_msg("invalid upmap option");
7c673cae
FG
12516 }
12517
7c673cae
FG
12518 goto update;
12519 } else if (prefix == "osd primary-affinity") {
12520 int64_t id;
9f95a23c 12521 if (!cmd_getval(cmdmap, "id", id)) {
7c673cae 12522 ss << "invalid osd id value '"
11fdf7f2 12523 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
7c673cae
FG
12524 err = -EINVAL;
12525 goto reply;
12526 }
12527 double w;
9f95a23c 12528 if (!cmd_getval(cmdmap, "weight", w)) {
7c673cae 12529 ss << "unable to parse 'weight' value '"
11fdf7f2 12530 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
12531 err = -EINVAL;
12532 goto reply;
12533 }
12534 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
12535 if (ww < 0L) {
12536 ss << "weight must be >= 0";
12537 err = -EINVAL;
12538 goto reply;
12539 }
9f95a23c
TL
12540 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
12541 osdmap.require_min_compat_client < ceph_release_t::firefly) {
31f18b77 12542 ss << "require_min_compat_client "
9f95a23c 12543 << osdmap.require_min_compat_client
7c673cae
FG
12544 << " < firefly, which is required for primary-affinity";
12545 err = -EPERM;
12546 goto reply;
7c673cae 12547 }
7c673cae
FG
12548 if (osdmap.exists(id)) {
12549 pending_inc.new_primary_affinity[id] = ww;
f67539c2 12550 ss << "set osd." << id << " primary-affinity to " << w << " (" << std::ios::hex << ww << std::ios::dec << ")";
7c673cae
FG
12551 getline(ss, rs);
12552 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12553 get_last_committed() + 1));
12554 return true;
12555 } else {
12556 ss << "osd." << id << " does not exist";
12557 err = -ENOENT;
12558 goto reply;
12559 }
12560 } else if (prefix == "osd reweight") {
12561 int64_t id;
9f95a23c 12562 if (!cmd_getval(cmdmap, "id", id)) {
7c673cae 12563 ss << "unable to parse osd id value '"
11fdf7f2 12564 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
7c673cae
FG
12565 err = -EINVAL;
12566 goto reply;
12567 }
12568 double w;
9f95a23c 12569 if (!cmd_getval(cmdmap, "weight", w)) {
7c673cae 12570 ss << "unable to parse weight value '"
11fdf7f2 12571 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
12572 err = -EINVAL;
12573 goto reply;
12574 }
12575 long ww = (int)((double)CEPH_OSD_IN*w);
12576 if (ww < 0L) {
12577 ss << "weight must be >= 0";
12578 err = -EINVAL;
12579 goto reply;
12580 }
12581 if (osdmap.exists(id)) {
12582 pending_inc.new_weight[id] = ww;
12583 ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
12584 getline(ss, rs);
12585 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12586 get_last_committed() + 1));
12587 return true;
12588 } else {
12589 ss << "osd." << id << " does not exist";
12590 err = -ENOENT;
12591 goto reply;
12592 }
12593 } else if (prefix == "osd reweightn") {
12594 map<int32_t, uint32_t> weights;
11fdf7f2 12595 err = parse_reweights(cct, cmdmap, osdmap, &weights);
7c673cae
FG
12596 if (err) {
12597 ss << "unable to parse 'weights' value '"
11fdf7f2 12598 << cmd_vartype_stringify(cmdmap.at("weights")) << "'";
7c673cae
FG
12599 goto reply;
12600 }
12601 pending_inc.new_weight.insert(weights.begin(), weights.end());
12602 wait_for_finished_proposal(
12603 op,
12604 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
224ce89b 12605 return true;
7c673cae
FG
12606 } else if (prefix == "osd lost") {
12607 int64_t id;
9f95a23c 12608 if (!cmd_getval(cmdmap, "id", id)) {
7c673cae 12609 ss << "unable to parse osd id value '"
11fdf7f2 12610 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
7c673cae
FG
12611 err = -EINVAL;
12612 goto reply;
12613 }
11fdf7f2 12614 bool sure = false;
9f95a23c 12615 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11fdf7f2 12616 if (!sure) {
7c673cae
FG
12617 ss << "are you SURE? this might mean real, permanent data loss. pass "
12618 "--yes-i-really-mean-it if you really do.";
12619 err = -EPERM;
12620 goto reply;
12621 } else if (!osdmap.exists(id)) {
12622 ss << "osd." << id << " does not exist";
12623 err = -ENOENT;
12624 goto reply;
12625 } else if (!osdmap.is_down(id)) {
12626 ss << "osd." << id << " is not down";
12627 err = -EBUSY;
12628 goto reply;
12629 } else {
12630 epoch_t e = osdmap.get_info(id).down_at;
12631 pending_inc.new_lost[id] = e;
12632 ss << "marked osd lost in epoch " << e;
12633 getline(ss, rs);
12634 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12635 get_last_committed() + 1));
12636 return true;
12637 }
12638
11fdf7f2
TL
12639 } else if (prefix == "osd destroy-actual" ||
12640 prefix == "osd purge-actual" ||
12641 prefix == "osd purge-new") {
31f18b77
FG
12642 /* Destroying an OSD means that we don't expect to further make use of
12643 * the OSDs data (which may even become unreadable after this operation),
12644 * and that we are okay with scrubbing all its cephx keys and config-key
12645 * data (which may include lockbox keys, thus rendering the osd's data
12646 * unreadable).
12647 *
12648 * The OSD will not be removed. Instead, we will mark it as destroyed,
12649 * such that a subsequent call to `create` will not reuse the osd id.
12650 * This will play into being able to recreate the OSD, at the same
12651 * crush location, with minimal data movement.
12652 */
12653
12654 // make sure authmon is writeable.
f67539c2 12655 if (!mon.authmon()->is_writeable()) {
31f18b77
FG
12656 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12657 << "osd destroy" << dendl;
f67539c2 12658 mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
31f18b77
FG
12659 return false;
12660 }
12661
12662 int64_t id;
9f95a23c 12663 if (!cmd_getval(cmdmap, "id", id)) {
11fdf7f2
TL
12664 auto p = cmdmap.find("id");
12665 if (p == cmdmap.end()) {
12666 ss << "no osd id specified";
12667 } else {
12668 ss << "unable to parse osd id value '"
12669 << cmd_vartype_stringify(cmdmap.at("id")) << "";
12670 }
31f18b77
FG
12671 err = -EINVAL;
12672 goto reply;
12673 }
12674
11fdf7f2 12675 bool is_destroy = (prefix == "osd destroy-actual");
31f18b77 12676 if (!is_destroy) {
11fdf7f2
TL
12677 ceph_assert("osd purge-actual" == prefix ||
12678 "osd purge-new" == prefix);
31f18b77
FG
12679 }
12680
11fdf7f2 12681 bool sure = false;
9f95a23c 12682 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11fdf7f2
TL
12683 if (!sure) {
12684 ss << "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
12685 << "This will mean real, permanent data loss, as well "
12686 << "as deletion of cephx and lockbox keys. "
12687 << "Pass --yes-i-really-mean-it if you really do.";
31f18b77
FG
12688 err = -EPERM;
12689 goto reply;
d2e6a577 12690 } else if (!osdmap.exists(id)) {
31f18b77 12691 ss << "osd." << id << " does not exist";
d2e6a577 12692 err = 0; // idempotent
31f18b77
FG
12693 goto reply;
12694 } else if (osdmap.is_up(id)) {
12695 ss << "osd." << id << " is not `down`.";
12696 err = -EBUSY;
12697 goto reply;
12698 } else if (is_destroy && osdmap.is_destroyed(id)) {
12699 ss << "destroyed osd." << id;
12700 err = 0;
12701 goto reply;
12702 }
12703
11fdf7f2
TL
12704 if (prefix == "osd purge-new" &&
12705 (osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
12706 ss << "osd." << id << " is not new";
12707 err = -EPERM;
12708 goto reply;
12709 }
12710
31f18b77
FG
12711 bool goto_reply = false;
12712
f67539c2 12713 paxos.plug();
31f18b77
FG
12714 if (is_destroy) {
12715 err = prepare_command_osd_destroy(id, ss);
12716 // we checked above that it should exist.
11fdf7f2 12717 ceph_assert(err != -ENOENT);
31f18b77
FG
12718 } else {
12719 err = prepare_command_osd_purge(id, ss);
12720 if (err == -ENOENT) {
12721 err = 0;
12722 ss << "osd." << id << " does not exist.";
12723 goto_reply = true;
12724 }
12725 }
f67539c2 12726 paxos.unplug();
31f18b77
FG
12727
12728 if (err < 0 || goto_reply) {
12729 goto reply;
12730 }
12731
12732 if (is_destroy) {
12733 ss << "destroyed osd." << id;
12734 } else {
12735 ss << "purged osd." << id;
12736 }
12737
12738 getline(ss, rs);
12739 wait_for_finished_proposal(op,
12740 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
12741 force_immediate_propose();
12742 return true;
12743
12744 } else if (prefix == "osd new") {
12745
12746 // make sure authmon is writeable.
f67539c2 12747 if (!mon.authmon()->is_writeable()) {
31f18b77 12748 dout(10) << __func__ << " waiting for auth mon to be writeable for "
224ce89b 12749 << "osd new" << dendl;
f67539c2 12750 mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
31f18b77
FG
12751 return false;
12752 }
12753
2a845540
TL
12754 // make sure kvmon is writeable.
12755 if (!mon.kvmon()->is_writeable()) {
12756 dout(10) << __func__ << " waiting for kv mon to be writeable for "
12757 << "osd new" << dendl;
12758 mon.kvmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12759 return false;
12760 }
12761
3a9019d9 12762 map<string,string> param_map;
31f18b77
FG
12763
12764 bufferlist bl = m->get_data();
3a9019d9
FG
12765 string param_json = bl.to_str();
12766 dout(20) << __func__ << " osd new json = " << param_json << dendl;
31f18b77 12767
3a9019d9 12768 err = get_json_str_map(param_json, ss, &param_map);
31f18b77
FG
12769 if (err < 0)
12770 goto reply;
12771
3a9019d9 12772 dout(20) << __func__ << " osd new params " << param_map << dendl;
31f18b77 12773
f67539c2 12774 paxos.plug();
3a9019d9 12775 err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
f67539c2 12776 paxos.unplug();
31f18b77
FG
12777
12778 if (err < 0) {
12779 goto reply;
12780 }
12781
12782 if (f) {
12783 f->flush(rdata);
12784 } else {
12785 rdata.append(ss);
12786 }
12787
12788 if (err == EEXIST) {
12789 // idempotent operation
12790 err = 0;
12791 goto reply;
12792 }
12793
12794 wait_for_finished_proposal(op,
12795 new Monitor::C_Command(mon, op, 0, rs, rdata,
12796 get_last_committed() + 1));
12797 force_immediate_propose();
12798 return true;
12799
7c673cae 12800 } else if (prefix == "osd create") {
7c673cae
FG
12801
12802 // optional id provided?
31f18b77 12803 int64_t id = -1, cmd_id = -1;
9f95a23c 12804 if (cmd_getval(cmdmap, "id", cmd_id)) {
31f18b77
FG
12805 if (cmd_id < 0) {
12806 ss << "invalid osd id value '" << cmd_id << "'";
7c673cae
FG
12807 err = -EINVAL;
12808 goto reply;
12809 }
31f18b77 12810 dout(10) << " osd create got id " << cmd_id << dendl;
7c673cae
FG
12811 }
12812
7c673cae
FG
12813 uuid_d uuid;
12814 string uuidstr;
9f95a23c 12815 if (cmd_getval(cmdmap, "uuid", uuidstr)) {
7c673cae 12816 if (!uuid.parse(uuidstr.c_str())) {
31f18b77
FG
12817 ss << "invalid uuid value '" << uuidstr << "'";
12818 err = -EINVAL;
12819 goto reply;
7c673cae 12820 }
31f18b77
FG
12821 // we only care about the id if we also have the uuid, to
12822 // ensure the operation's idempotency.
12823 id = cmd_id;
7c673cae
FG
12824 }
12825
31f18b77
FG
12826 int32_t new_id = -1;
12827 err = prepare_command_osd_create(id, uuid, &new_id, ss);
12828 if (err < 0) {
12829 if (err == -EAGAIN) {
12830 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12831 return true;
12832 }
12833 // a check has failed; reply to the user.
12834 goto reply;
12835
12836 } else if (err == EEXIST) {
12837 // this is an idempotent operation; we can go ahead and reply.
12838 if (f) {
12839 f->open_object_section("created_osd");
12840 f->dump_int("osdid", new_id);
12841 f->close_section();
12842 f->flush(rdata);
12843 } else {
12844 ss << new_id;
12845 rdata.append(ss);
7c673cae 12846 }
31f18b77
FG
12847 err = 0;
12848 goto reply;
7c673cae
FG
12849 }
12850
3a9019d9
FG
12851 string empty_device_class;
12852 do_osd_create(id, uuid, empty_device_class, &new_id);
31f18b77 12853
7c673cae
FG
12854 if (f) {
12855 f->open_object_section("created_osd");
31f18b77 12856 f->dump_int("osdid", new_id);
7c673cae
FG
12857 f->close_section();
12858 f->flush(rdata);
12859 } else {
31f18b77 12860 ss << new_id;
7c673cae
FG
12861 rdata.append(ss);
12862 }
31f18b77
FG
12863 wait_for_finished_proposal(op,
12864 new Monitor::C_Command(mon, op, 0, rs, rdata,
12865 get_last_committed() + 1));
7c673cae
FG
12866 return true;
12867
f67539c2
TL
12868 } else if (prefix == "osd blocklist clear" ||
12869 prefix == "osd blacklist clear") {
12870 pending_inc.new_blocklist.clear();
12871 std::list<std::pair<entity_addr_t,utime_t > > blocklist;
33c7a0ef
TL
12872 std::list<std::pair<entity_addr_t,utime_t > > range_b;
12873 osdmap.get_blocklist(&blocklist, &range_b);
f67539c2
TL
12874 for (const auto &entry : blocklist) {
12875 pending_inc.old_blocklist.push_back(entry.first);
7c673cae 12876 }
33c7a0ef
TL
12877 for (const auto &entry : range_b) {
12878 pending_inc.old_range_blocklist.push_back(entry.first);
12879 }
f67539c2 12880 ss << " removed all blocklist entries";
7c673cae
FG
12881 getline(ss, rs);
12882 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12883 get_last_committed() + 1));
12884 return true;
f67539c2
TL
12885 } else if (prefix == "osd blocklist" ||
12886 prefix == "osd blacklist") {
33c7a0ef
TL
12887 string addrstr, rangestr;
12888 bool range = false;
9f95a23c 12889 cmd_getval(cmdmap, "addr", addrstr);
33c7a0ef
TL
12890 if (cmd_getval(cmdmap, "range", rangestr)) {
12891 if (rangestr == "range") {
12892 range = true;
12893 } else {
12894 ss << "Did you mean to specify \"osd blocklist range\"?";
12895 err = -EINVAL;
12896 goto reply;
12897 }
12898 }
7c673cae 12899 entity_addr_t addr;
20effc67 12900 if (!addr.parse(addrstr)) {
7c673cae
FG
12901 ss << "unable to parse address " << addrstr;
12902 err = -EINVAL;
12903 goto reply;
12904 }
12905 else {
33c7a0ef
TL
12906 if (range) {
12907 if (!addr.maybe_cidr()) {
12908 ss << "You specified a range command, but " << addr
12909 << " does not parse as a CIDR range";
12910 err = -EINVAL;
12911 goto reply;
12912 }
12913 addr.type = entity_addr_t::TYPE_CIDR;
12914 err = check_cluster_features(CEPH_FEATUREMASK_RANGE_BLOCKLIST, ss);
12915 if (err) {
12916 goto reply;
12917 }
12918 if ((addr.is_ipv4() && addr.get_nonce() > 32) ||
12919 (addr.is_ipv6() && addr.get_nonce() > 128)) {
12920 ss << "Too many bits in range for that protocol!";
12921 err = -EINVAL;
12922 goto reply;
12923 }
11fdf7f2 12924 } else {
33c7a0ef
TL
12925 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
12926 // always blocklist type ANY
12927 addr.set_type(entity_addr_t::TYPE_ANY);
12928 } else {
12929 addr.set_type(entity_addr_t::TYPE_LEGACY);
12930 }
11fdf7f2
TL
12931 }
12932
f67539c2
TL
12933 string blocklistop;
12934 if (!cmd_getval(cmdmap, "blocklistop", blocklistop)) {
12935 cmd_getval(cmdmap, "blacklistop", blocklistop);
12936 }
12937 if (blocklistop == "add") {
7c673cae 12938 utime_t expires = ceph_clock_now();
7c673cae 12939 // default one hour
20effc67 12940 double d = cmd_getval_or<double>(cmdmap, "expire",
f67539c2 12941 g_conf()->mon_osd_blocklist_default_expire);
7c673cae
FG
12942 expires += d;
12943
33c7a0ef
TL
12944 auto add_to_pending_blocklists = [](auto& nb, auto& ob,
12945 const auto& addr,
12946 const auto& expires) {
12947 nb[addr] = expires;
12948 // cancel any pending un-blocklisting request too
12949 auto it = std::find(ob.begin(),
12950 ob.end(), addr);
12951 if (it != ob.end()) {
12952 ob.erase(it);
12953 }
12954 };
12955 if (range) {
12956 add_to_pending_blocklists(pending_inc.new_range_blocklist,
12957 pending_inc.old_range_blocklist,
12958 addr, expires);
224ce89b 12959
33c7a0ef
TL
12960 } else {
12961 add_to_pending_blocklists(pending_inc.new_blocklist,
12962 pending_inc.old_blocklist,
12963 addr, expires);
12964 }
224ce89b 12965
f67539c2 12966 ss << "blocklisting " << addr << " until " << expires << " (" << d << " sec)";
7c673cae
FG
12967 getline(ss, rs);
12968 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12969 get_last_committed() + 1));
12970 return true;
f67539c2 12971 } else if (blocklistop == "rm") {
33c7a0ef
TL
12972 auto rm_from_pending_blocklists = [](const auto& addr,
12973 auto& blocklist,
12974 auto& ob, auto& pb) {
12975 if (blocklist.count(addr)) {
12976 ob.push_back(addr);
12977 return true;
12978 } else if (pb.count(addr)) {
12979 pb.erase(addr);
12980 return true;
12981 }
12982 return false;
12983 };
12984 if ((!range && rm_from_pending_blocklists(addr, osdmap.blocklist,
12985 pending_inc.old_blocklist,
12986 pending_inc.new_blocklist)) ||
12987 (range && rm_from_pending_blocklists(addr, osdmap.range_blocklist,
12988 pending_inc.old_range_blocklist,
12989 pending_inc.new_range_blocklist))) {
f67539c2 12990 ss << "un-blocklisting " << addr;
7c673cae
FG
12991 getline(ss, rs);
12992 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12993 get_last_committed() + 1));
12994 return true;
12995 }
f67539c2 12996 ss << addr << " isn't blocklisted";
7c673cae
FG
12997 err = 0;
12998 goto reply;
12999 }
13000 }
13001 } else if (prefix == "osd pool mksnap") {
13002 string poolstr;
9f95a23c 13003 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
13004 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
13005 if (pool < 0) {
13006 ss << "unrecognized pool '" << poolstr << "'";
13007 err = -ENOENT;
13008 goto reply;
13009 }
13010 string snapname;
9f95a23c 13011 cmd_getval(cmdmap, "snap", snapname);
7c673cae
FG
13012 const pg_pool_t *p = osdmap.get_pg_pool(pool);
13013 if (p->is_unmanaged_snaps_mode()) {
13014 ss << "pool " << poolstr << " is in unmanaged snaps mode";
13015 err = -EINVAL;
13016 goto reply;
13017 } else if (p->snap_exists(snapname.c_str())) {
13018 ss << "pool " << poolstr << " snap " << snapname << " already exists";
13019 err = 0;
13020 goto reply;
13021 } else if (p->is_tier()) {
13022 ss << "pool " << poolstr << " is a cache tier";
13023 err = -EINVAL;
13024 goto reply;
13025 }
13026 pg_pool_t *pp = 0;
13027 if (pending_inc.new_pools.count(pool))
13028 pp = &pending_inc.new_pools[pool];
13029 if (!pp) {
13030 pp = &pending_inc.new_pools[pool];
13031 *pp = *p;
13032 }
13033 if (pp->snap_exists(snapname.c_str())) {
13034 ss << "pool " << poolstr << " snap " << snapname << " already exists";
13035 } else {
13036 pp->add_snap(snapname.c_str(), ceph_clock_now());
13037 pp->set_snap_epoch(pending_inc.epoch);
13038 ss << "created pool " << poolstr << " snap " << snapname;
13039 }
13040 getline(ss, rs);
13041 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13042 get_last_committed() + 1));
13043 return true;
13044 } else if (prefix == "osd pool rmsnap") {
13045 string poolstr;
9f95a23c 13046 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
13047 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
13048 if (pool < 0) {
13049 ss << "unrecognized pool '" << poolstr << "'";
13050 err = -ENOENT;
13051 goto reply;
13052 }
13053 string snapname;
9f95a23c 13054 cmd_getval(cmdmap, "snap", snapname);
7c673cae
FG
13055 const pg_pool_t *p = osdmap.get_pg_pool(pool);
13056 if (p->is_unmanaged_snaps_mode()) {
13057 ss << "pool " << poolstr << " is in unmanaged snaps mode";
13058 err = -EINVAL;
13059 goto reply;
13060 } else if (!p->snap_exists(snapname.c_str())) {
13061 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
13062 err = 0;
13063 goto reply;
13064 }
13065 pg_pool_t *pp = 0;
13066 if (pending_inc.new_pools.count(pool))
13067 pp = &pending_inc.new_pools[pool];
13068 if (!pp) {
13069 pp = &pending_inc.new_pools[pool];
13070 *pp = *p;
13071 }
13072 snapid_t sn = pp->snap_exists(snapname.c_str());
13073 if (sn) {
13074 pp->remove_snap(sn);
13075 pp->set_snap_epoch(pending_inc.epoch);
13076 ss << "removed pool " << poolstr << " snap " << snapname;
13077 } else {
13078 ss << "already removed pool " << poolstr << " snap " << snapname;
13079 }
13080 getline(ss, rs);
13081 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13082 get_last_committed() + 1));
13083 return true;
13084 } else if (prefix == "osd pool create") {
20effc67
TL
13085 int64_t pg_num = cmd_getval_or<int64_t>(cmdmap, "pg_num", 0);
13086 int64_t pg_num_min = cmd_getval_or<int64_t>(cmdmap, "pg_num_min", 0);
13087 int64_t pg_num_max = cmd_getval_or<int64_t>(cmdmap, "pg_num_max", 0);
13088 int64_t pgp_num = cmd_getval_or<int64_t>(cmdmap, "pgp_num", pg_num);
7c673cae 13089 string pool_type_str;
9f95a23c 13090 cmd_getval(cmdmap, "pool_type", pool_type_str);
7c673cae 13091 if (pool_type_str.empty())
11fdf7f2 13092 pool_type_str = g_conf().get_val<string>("osd_pool_default_type");
7c673cae
FG
13093
13094 string poolstr;
9f95a23c 13095 cmd_getval(cmdmap, "pool", poolstr);
1e59de90
TL
13096 bool confirm = false;
13097 //confirmation may be set to true only by internal operations.
13098 cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
13099 if (poolstr[0] == '.' && !confirm) {
13100 ss << "pool names beginning with . are not allowed";
13101 err = 0;
13102 goto reply;
13103 }
7c673cae
FG
13104 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13105 if (pool_id >= 0) {
13106 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13107 if (pool_type_str != p->get_type_name()) {
13108 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
13109 err = -EINVAL;
13110 } else {
13111 ss << "pool '" << poolstr << "' already exists";
13112 err = 0;
13113 }
13114 goto reply;
13115 }
13116
13117 int pool_type;
13118 if (pool_type_str == "replicated") {
13119 pool_type = pg_pool_t::TYPE_REPLICATED;
13120 } else if (pool_type_str == "erasure") {
7c673cae
FG
13121 pool_type = pg_pool_t::TYPE_ERASURE;
13122 } else {
13123 ss << "unknown pool type '" << pool_type_str << "'";
13124 err = -EINVAL;
13125 goto reply;
13126 }
13127
31f18b77 13128 bool implicit_rule_creation = false;
94b18763 13129 int64_t expected_num_objects = 0;
31f18b77 13130 string rule_name;
9f95a23c 13131 cmd_getval(cmdmap, "rule", rule_name);
7c673cae 13132 string erasure_code_profile;
9f95a23c 13133 cmd_getval(cmdmap, "erasure_code_profile", erasure_code_profile);
7c673cae
FG
13134
13135 if (pool_type == pg_pool_t::TYPE_ERASURE) {
13136 if (erasure_code_profile == "")
13137 erasure_code_profile = "default";
13138 //handle the erasure code profile
13139 if (erasure_code_profile == "default") {
13140 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
13141 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
13142 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
13143 goto wait;
13144 }
13145
13146 map<string,string> profile_map;
11fdf7f2 13147 err = osdmap.get_erasure_code_profile_default(cct,
7c673cae
FG
13148 profile_map,
13149 &ss);
13150 if (err)
13151 goto reply;
13152 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
13153 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
13154 goto wait;
13155 }
13156 }
31f18b77
FG
13157 if (rule_name == "") {
13158 implicit_rule_creation = true;
7c673cae 13159 if (erasure_code_profile == "default") {
31f18b77 13160 rule_name = "erasure-code";
7c673cae 13161 } else {
31f18b77 13162 dout(1) << "implicitly use rule named after the pool: "
7c673cae 13163 << poolstr << dendl;
31f18b77 13164 rule_name = poolstr;
7c673cae
FG
13165 }
13166 }
20effc67
TL
13167 expected_num_objects =
13168 cmd_getval_or<int64_t>(cmdmap, "expected_num_objects", 0);
7c673cae 13169 } else {
31f18b77 13170 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
94b18763
FG
13171 // and put expected_num_objects to rule field
13172 if (erasure_code_profile != "") { // cmd is from CLI
13173 if (rule_name != "") {
13174 string interr;
13175 expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
13176 if (interr.length()) {
13177 ss << "error parsing integer value '" << rule_name << "': " << interr;
13178 err = -EINVAL;
13179 goto reply;
13180 }
13181 }
13182 rule_name = erasure_code_profile;
13183 } else { // cmd is well-formed
20effc67
TL
13184 expected_num_objects =
13185 cmd_getval_or<int64_t>(cmdmap, "expected_num_objects", 0);
94b18763 13186 }
7c673cae
FG
13187 }
13188
31f18b77
FG
13189 if (!implicit_rule_creation && rule_name != "") {
13190 int rule;
13191 err = get_crush_rule(rule_name, &rule, &ss);
7c673cae
FG
13192 if (err == -EAGAIN) {
13193 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13194 return true;
13195 }
13196 if (err)
13197 goto reply;
13198 }
13199
7c673cae
FG
13200 if (expected_num_objects < 0) {
13201 ss << "'expected_num_objects' must be non-negative";
13202 err = -EINVAL;
13203 goto reply;
13204 }
13205
f6b5b4d7
TL
13206 set<int32_t> osds;
13207 osdmap.get_all_osds(osds);
13208 bool has_filestore_osd = std::any_of(osds.begin(), osds.end(), [this](int osd) {
13209 string type;
13210 if (!get_osd_objectstore_type(osd, &type)) {
13211 return type == "filestore";
13212 } else {
13213 return false;
13214 }
13215 });
13216
13217 if (has_filestore_osd &&
13218 expected_num_objects > 0 &&
13219 cct->_conf->filestore_merge_threshold > 0) {
91327a77
AA
13220 ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
13221 err = -EINVAL;
13222 goto reply;
13223 }
13224
f6b5b4d7
TL
13225 if (has_filestore_osd &&
13226 expected_num_objects == 0 &&
13227 cct->_conf->filestore_merge_threshold < 0) {
91327a77 13228 int osds = osdmap.get_num_osds();
f6b5b4d7
TL
13229 bool sure = false;
13230 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13231 if (!sure && osds && (pg_num >= 1024 || pg_num / osds >= 100)) {
91327a77 13232 ss << "For better initial performance on pools expected to store a "
f6b5b4d7
TL
13233 << "large number of objects, consider supplying the "
13234 << "expected_num_objects parameter when creating the pool."
13235 << " Pass --yes-i-really-mean-it to ignore it";
13236 err = -EPERM;
13237 goto reply;
91327a77
AA
13238 }
13239 }
13240
20effc67 13241 int64_t fast_read_param = cmd_getval_or<int64_t>(cmdmap, "fast_read", -1);
7c673cae
FG
13242 FastReadType fast_read = FAST_READ_DEFAULT;
13243 if (fast_read_param == 0)
13244 fast_read = FAST_READ_OFF;
13245 else if (fast_read_param > 0)
13246 fast_read = FAST_READ_ON;
11fdf7f2
TL
13247
13248 int64_t repl_size = 0;
9f95a23c 13249 cmd_getval(cmdmap, "size", repl_size);
11fdf7f2
TL
13250 int64_t target_size_bytes = 0;
13251 double target_size_ratio = 0.0;
9f95a23c
TL
13252 cmd_getval(cmdmap, "target_size_bytes", target_size_bytes);
13253 cmd_getval(cmdmap, "target_size_ratio", target_size_ratio);
13254
13255 string pg_autoscale_mode;
13256 cmd_getval(cmdmap, "autoscale_mode", pg_autoscale_mode);
11fdf7f2 13257
20effc67 13258 bool bulk = cmd_getval_or<bool>(cmdmap, "bulk", 0);
1e59de90
TL
13259
13260 bool crimson = cmd_getval_or<bool>(cmdmap, "crimson", false) ||
13261 cct->_conf.get_val<bool>("osd_pool_default_crimson");
13262
11fdf7f2 13263 err = prepare_new_pool(poolstr,
7c673cae 13264 -1, // default crush rule
31f18b77 13265 rule_name,
20effc67 13266 pg_num, pgp_num, pg_num_min, pg_num_max,
11fdf7f2 13267 repl_size, target_size_bytes, target_size_ratio,
7c673cae
FG
13268 erasure_code_profile, pool_type,
13269 (uint64_t)expected_num_objects,
13270 fast_read,
9f95a23c 13271 pg_autoscale_mode,
20effc67 13272 bulk,
1e59de90 13273 crimson,
7c673cae
FG
13274 &ss);
13275 if (err < 0) {
13276 switch(err) {
13277 case -EEXIST:
13278 ss << "pool '" << poolstr << "' already exists";
13279 break;
13280 case -EAGAIN:
13281 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13282 return true;
13283 case -ERANGE:
13284 goto reply;
13285 default:
13286 goto reply;
13287 break;
13288 }
13289 } else {
13290 ss << "pool '" << poolstr << "' created";
13291 }
13292 getline(ss, rs);
13293 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13294 get_last_committed() + 1));
13295 return true;
13296
13297 } else if (prefix == "osd pool delete" ||
13298 prefix == "osd pool rm") {
13299 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
13300 string poolstr, poolstr2, sure;
9f95a23c
TL
13301 cmd_getval(cmdmap, "pool", poolstr);
13302 cmd_getval(cmdmap, "pool2", poolstr2);
7c673cae
FG
13303 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
13304 if (pool < 0) {
13305 ss << "pool '" << poolstr << "' does not exist";
13306 err = 0;
13307 goto reply;
13308 }
13309
11fdf7f2 13310 bool force_no_fake = false;
9f95a23c 13311 cmd_getval(cmdmap, "yes_i_really_really_mean_it", force_no_fake);
11fdf7f2 13312 bool force = false;
9f95a23c 13313 cmd_getval(cmdmap, "yes_i_really_really_mean_it_not_faking", force);
7c673cae 13314 if (poolstr2 != poolstr ||
11fdf7f2 13315 (!force && !force_no_fake)) {
7c673cae
FG
13316 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
13317 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
13318 << "followed by --yes-i-really-really-mean-it.";
13319 err = -EPERM;
13320 goto reply;
13321 }
13322 err = _prepare_remove_pool(pool, &ss, force_no_fake);
13323 if (err == -EAGAIN) {
13324 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13325 return true;
13326 }
13327 if (err < 0)
13328 goto reply;
13329 goto update;
13330 } else if (prefix == "osd pool rename") {
13331 string srcpoolstr, destpoolstr;
9f95a23c
TL
13332 cmd_getval(cmdmap, "srcpool", srcpoolstr);
13333 cmd_getval(cmdmap, "destpool", destpoolstr);
7c673cae
FG
13334 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
13335 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
1e59de90
TL
13336 bool confirm = false;
13337 //confirmation may be set to true only by internal operations.
13338 cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
13339 if (destpoolstr[0] == '.' && !confirm) {
13340 ss << "pool names beginning with . are not allowed";
13341 err = 0;
13342 goto reply;
13343 }
7c673cae
FG
13344 if (pool_src < 0) {
13345 if (pool_dst >= 0) {
13346 // src pool doesn't exist, dst pool does exist: to ensure idempotency
13347 // of operations, assume this rename succeeded, as it is not changing
13348 // the current state. Make sure we output something understandable
13349 // for whoever is issuing the command, if they are paying attention,
13350 // in case it was not intentional; or to avoid a "wtf?" and a bug
13351 // report in case it was intentional, while expecting a failure.
13352 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
13353 << destpoolstr << "' does -- assuming successful rename";
13354 err = 0;
13355 } else {
13356 ss << "unrecognized pool '" << srcpoolstr << "'";
13357 err = -ENOENT;
13358 }
13359 goto reply;
13360 } else if (pool_dst >= 0) {
13361 // source pool exists and so does the destination pool
13362 ss << "pool '" << destpoolstr << "' already exists";
13363 err = -EEXIST;
13364 goto reply;
13365 }
13366
13367 int ret = _prepare_rename_pool(pool_src, destpoolstr);
13368 if (ret == 0) {
13369 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
13370 } else {
13371 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
13372 << cpp_strerror(ret);
13373 }
13374 getline(ss, rs);
13375 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
13376 get_last_committed() + 1));
13377 return true;
13378
13379 } else if (prefix == "osd pool set") {
13380 err = prepare_command_pool_set(cmdmap, ss);
13381 if (err == -EAGAIN)
13382 goto wait;
13383 if (err < 0)
13384 goto reply;
13385
13386 getline(ss, rs);
13387 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13388 get_last_committed() + 1));
13389 return true;
13390 } else if (prefix == "osd tier add") {
13391 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13392 if (err == -EAGAIN)
13393 goto wait;
13394 if (err)
13395 goto reply;
13396 string poolstr;
9f95a23c 13397 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
13398 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13399 if (pool_id < 0) {
13400 ss << "unrecognized pool '" << poolstr << "'";
13401 err = -ENOENT;
13402 goto reply;
13403 }
13404 string tierpoolstr;
9f95a23c 13405 cmd_getval(cmdmap, "tierpool", tierpoolstr);
7c673cae
FG
13406 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13407 if (tierpool_id < 0) {
13408 ss << "unrecognized pool '" << tierpoolstr << "'";
13409 err = -ENOENT;
13410 goto reply;
13411 }
13412 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 13413 ceph_assert(p);
7c673cae 13414 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11fdf7f2 13415 ceph_assert(tp);
7c673cae
FG
13416
13417 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13418 goto reply;
13419 }
13420
13421 // make sure new tier is empty
20effc67
TL
13422 bool force_nonempty = false;
13423 cmd_getval_compat_cephbool(cmdmap, "force_nonempty", force_nonempty);
f67539c2 13424 const pool_stat_t *pstats = mon.mgrstatmon()->get_pool_stat(tierpool_id);
31f18b77 13425 if (pstats && pstats->stats.sum.num_objects != 0 &&
20effc67 13426 !force_nonempty) {
7c673cae
FG
13427 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
13428 err = -ENOTEMPTY;
13429 goto reply;
13430 }
11fdf7f2 13431 if (tp->is_erasure()) {
7c673cae
FG
13432 ss << "tier pool '" << tierpoolstr
13433 << "' is an ec pool, which cannot be a tier";
13434 err = -ENOTSUP;
13435 goto reply;
13436 }
13437 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
20effc67
TL
13438 (!force_nonempty ||
13439 !g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps)) {
7c673cae
FG
13440 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
13441 err = -ENOTEMPTY;
13442 goto reply;
13443 }
13444 // go
13445 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13446 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13447 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13448 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13449 return true;
13450 }
13451 np->tiers.insert(tierpool_id);
13452 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13453 ntp->tier_of = pool_id;
13454 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
13455 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13456 get_last_committed() + 1));
13457 return true;
13458 } else if (prefix == "osd tier remove" ||
13459 prefix == "osd tier rm") {
13460 string poolstr;
9f95a23c 13461 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
13462 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13463 if (pool_id < 0) {
13464 ss << "unrecognized pool '" << poolstr << "'";
13465 err = -ENOENT;
13466 goto reply;
13467 }
13468 string tierpoolstr;
9f95a23c 13469 cmd_getval(cmdmap, "tierpool", tierpoolstr);
7c673cae
FG
13470 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13471 if (tierpool_id < 0) {
13472 ss << "unrecognized pool '" << tierpoolstr << "'";
13473 err = -ENOENT;
13474 goto reply;
13475 }
13476 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 13477 ceph_assert(p);
7c673cae 13478 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11fdf7f2 13479 ceph_assert(tp);
7c673cae
FG
13480
13481 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
13482 goto reply;
13483 }
13484
13485 if (p->tiers.count(tierpool_id) == 0) {
13486 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
13487 err = 0;
13488 goto reply;
13489 }
13490 if (tp->tier_of != pool_id) {
13491 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
13492 << osdmap.get_pool_name(tp->tier_of) << "': "
13493 // be scary about it; this is an inconsistency and bells must go off
13494 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
13495 err = -EINVAL;
13496 goto reply;
13497 }
13498 if (p->read_tier == tierpool_id) {
13499 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
13500 err = -EBUSY;
13501 goto reply;
13502 }
13503 // go
13504 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13505 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13506 if (np->tiers.count(tierpool_id) == 0 ||
13507 ntp->tier_of != pool_id ||
13508 np->read_tier == tierpool_id) {
13509 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13510 return true;
13511 }
13512 np->tiers.erase(tierpool_id);
13513 ntp->clear_tier();
13514 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
13515 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13516 get_last_committed() + 1));
13517 return true;
13518 } else if (prefix == "osd tier set-overlay") {
13519 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13520 if (err == -EAGAIN)
13521 goto wait;
13522 if (err)
13523 goto reply;
13524 string poolstr;
9f95a23c 13525 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
13526 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13527 if (pool_id < 0) {
13528 ss << "unrecognized pool '" << poolstr << "'";
13529 err = -ENOENT;
13530 goto reply;
13531 }
13532 string overlaypoolstr;
9f95a23c 13533 cmd_getval(cmdmap, "overlaypool", overlaypoolstr);
7c673cae
FG
13534 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
13535 if (overlaypool_id < 0) {
13536 ss << "unrecognized pool '" << overlaypoolstr << "'";
13537 err = -ENOENT;
13538 goto reply;
13539 }
13540 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 13541 ceph_assert(p);
7c673cae 13542 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
11fdf7f2 13543 ceph_assert(overlay_p);
7c673cae
FG
13544 if (p->tiers.count(overlaypool_id) == 0) {
13545 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
13546 err = -EINVAL;
13547 goto reply;
13548 }
13549 if (p->read_tier == overlaypool_id) {
13550 err = 0;
13551 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
13552 goto reply;
13553 }
13554 if (p->has_read_tier()) {
13555 ss << "pool '" << poolstr << "' has overlay '"
13556 << osdmap.get_pool_name(p->read_tier)
13557 << "'; please remove-overlay first";
13558 err = -EINVAL;
13559 goto reply;
13560 }
13561
13562 // go
13563 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13564 np->read_tier = overlaypool_id;
13565 np->write_tier = overlaypool_id;
13566 np->set_last_force_op_resend(pending_inc.epoch);
13567 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
13568 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
13569 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
13570 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
13571 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
13572 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13573 get_last_committed() + 1));
13574 return true;
13575 } else if (prefix == "osd tier remove-overlay" ||
13576 prefix == "osd tier rm-overlay") {
13577 string poolstr;
9f95a23c 13578 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
13579 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13580 if (pool_id < 0) {
13581 ss << "unrecognized pool '" << poolstr << "'";
13582 err = -ENOENT;
13583 goto reply;
13584 }
13585 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 13586 ceph_assert(p);
7c673cae
FG
13587 if (!p->has_read_tier()) {
13588 err = 0;
13589 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
13590 goto reply;
13591 }
13592
13593 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
13594 goto reply;
13595 }
13596
13597 // go
13598 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13599 if (np->has_read_tier()) {
13600 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
13601 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
13602 nop->set_last_force_op_resend(pending_inc.epoch);
13603 }
13604 if (np->has_write_tier()) {
13605 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
13606 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
13607 nop->set_last_force_op_resend(pending_inc.epoch);
13608 }
13609 np->clear_read_tier();
13610 np->clear_write_tier();
13611 np->set_last_force_op_resend(pending_inc.epoch);
13612 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
13613 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13614 get_last_committed() + 1));
13615 return true;
13616 } else if (prefix == "osd tier cache-mode") {
13617 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13618 if (err == -EAGAIN)
13619 goto wait;
13620 if (err)
13621 goto reply;
13622 string poolstr;
9f95a23c 13623 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
13624 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13625 if (pool_id < 0) {
13626 ss << "unrecognized pool '" << poolstr << "'";
13627 err = -ENOENT;
13628 goto reply;
13629 }
13630 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 13631 ceph_assert(p);
7c673cae
FG
13632 if (!p->is_tier()) {
13633 ss << "pool '" << poolstr << "' is not a tier";
13634 err = -EINVAL;
13635 goto reply;
13636 }
13637 string modestr;
9f95a23c 13638 cmd_getval(cmdmap, "mode", modestr);
7c673cae 13639 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
9f95a23c 13640 if (int(mode) < 0) {
7c673cae
FG
13641 ss << "'" << modestr << "' is not a valid cache mode";
13642 err = -EINVAL;
13643 goto reply;
13644 }
13645
11fdf7f2 13646 bool sure = false;
9f95a23c 13647 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11fdf7f2 13648
9f95a23c
TL
13649 if (mode == pg_pool_t::CACHEMODE_FORWARD ||
13650 mode == pg_pool_t::CACHEMODE_READFORWARD) {
13651 ss << "'" << modestr << "' is no longer a supported cache mode";
13652 err = -EPERM;
13653 goto reply;
13654 }
7c673cae
FG
13655 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13656 mode != pg_pool_t::CACHEMODE_NONE &&
13657 mode != pg_pool_t::CACHEMODE_PROXY &&
13658 mode != pg_pool_t::CACHEMODE_READPROXY) &&
11fdf7f2 13659 !sure) {
7c673cae
FG
13660 ss << "'" << modestr << "' is not a well-supported cache mode and may "
13661 << "corrupt your data. pass --yes-i-really-mean-it to force.";
13662 err = -EPERM;
13663 goto reply;
13664 }
13665
13666 // pool already has this cache-mode set and there are no pending changes
13667 if (p->cache_mode == mode &&
13668 (pending_inc.new_pools.count(pool_id) == 0 ||
13669 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
13670 ss << "set cache-mode for pool '" << poolstr << "'"
13671 << " to " << pg_pool_t::get_cache_mode_name(mode);
13672 err = 0;
13673 goto reply;
13674 }
13675
13676 /* Mode description:
13677 *
13678 * none: No cache-mode defined
9f95a23c 13679 * forward: Forward all reads and writes to base pool [removed]
7c673cae
FG
13680 * writeback: Cache writes, promote reads from base pool
13681 * readonly: Forward writes to base pool
9f95a23c 13682 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
7c673cae
FG
13683 * proxy: Proxy all reads and writes to base pool
13684 * readproxy: Writes are in writeback mode, Reads are in proxy mode
13685 *
13686 * Hence, these are the allowed transitions:
13687 *
13688 * none -> any
13689 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
9f95a23c 13690 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
7c673cae 13691 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
9f95a23c
TL
13692 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
13693 * writeback -> readproxy || proxy
7c673cae
FG
13694 * readonly -> any
13695 */
13696
13697 // We check if the transition is valid against the current pool mode, as
13698 // it is the only committed state thus far. We will blantly squash
13699 // whatever mode is on the pending state.
13700
13701 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
9f95a23c 13702 (mode != pg_pool_t::CACHEMODE_PROXY &&
7c673cae
FG
13703 mode != pg_pool_t::CACHEMODE_READPROXY)) {
13704 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
13705 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
13706 << "' pool; only '"
1e59de90
TL
13707 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY)
13708 << "','"
7c673cae
FG
13709 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
13710 << "' allowed.";
13711 err = -EINVAL;
13712 goto reply;
13713 }
13714 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
13715 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
7c673cae
FG
13716 mode != pg_pool_t::CACHEMODE_PROXY &&
13717 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13718
13719 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
13720 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
7c673cae
FG
13721 mode != pg_pool_t::CACHEMODE_PROXY)) ||
13722
13723 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
13724 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
7c673cae
FG
13725 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13726
13727 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
13728 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
7c673cae
FG
13729 mode != pg_pool_t::CACHEMODE_PROXY &&
13730 mode != pg_pool_t::CACHEMODE_READPROXY))) {
13731
31f18b77 13732 const pool_stat_t* pstats =
f67539c2 13733 mon.mgrstatmon()->get_pool_stat(pool_id);
7c673cae 13734
31f18b77 13735 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
7c673cae
FG
13736 ss << "unable to set cache-mode '"
13737 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
13738 << "': dirty objects found";
13739 err = -EBUSY;
13740 goto reply;
13741 }
13742 }
13743 // go
13744 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13745 np->cache_mode = mode;
13746 // set this both when moving to and from cache_mode NONE. this is to
13747 // capture legacy pools that were set up before this flag existed.
13748 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
13749 ss << "set cache-mode for pool '" << poolstr
13750 << "' to " << pg_pool_t::get_cache_mode_name(mode);
13751 if (mode == pg_pool_t::CACHEMODE_NONE) {
13752 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
11fdf7f2 13753 ceph_assert(base_pool);
7c673cae
FG
13754 if (base_pool->read_tier == pool_id ||
13755 base_pool->write_tier == pool_id)
13756 ss <<" (WARNING: pool is still configured as read or write tier)";
13757 }
13758 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13759 get_last_committed() + 1));
13760 return true;
13761 } else if (prefix == "osd tier add-cache") {
13762 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13763 if (err == -EAGAIN)
13764 goto wait;
13765 if (err)
13766 goto reply;
13767 string poolstr;
9f95a23c 13768 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
13769 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13770 if (pool_id < 0) {
13771 ss << "unrecognized pool '" << poolstr << "'";
13772 err = -ENOENT;
13773 goto reply;
13774 }
13775 string tierpoolstr;
9f95a23c 13776 cmd_getval(cmdmap, "tierpool", tierpoolstr);
7c673cae
FG
13777 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13778 if (tierpool_id < 0) {
13779 ss << "unrecognized pool '" << tierpoolstr << "'";
13780 err = -ENOENT;
13781 goto reply;
13782 }
13783 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 13784 ceph_assert(p);
7c673cae 13785 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11fdf7f2 13786 ceph_assert(tp);
7c673cae
FG
13787
13788 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13789 goto reply;
13790 }
13791
13792 int64_t size = 0;
9f95a23c 13793 if (!cmd_getval(cmdmap, "size", size)) {
7c673cae 13794 ss << "unable to parse 'size' value '"
11fdf7f2 13795 << cmd_vartype_stringify(cmdmap.at("size")) << "'";
7c673cae
FG
13796 err = -EINVAL;
13797 goto reply;
13798 }
13799 // make sure new tier is empty
31f18b77 13800 const pool_stat_t *pstats =
f67539c2 13801 mon.mgrstatmon()->get_pool_stat(tierpool_id);
31f18b77 13802 if (pstats && pstats->stats.sum.num_objects != 0) {
7c673cae
FG
13803 ss << "tier pool '" << tierpoolstr << "' is not empty";
13804 err = -ENOTEMPTY;
13805 goto reply;
13806 }
11fdf7f2 13807 auto& modestr = g_conf().get_val<string>("osd_tier_default_cache_mode");
7c673cae 13808 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
9f95a23c 13809 if (int(mode) < 0) {
7c673cae
FG
13810 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
13811 err = -EINVAL;
13812 goto reply;
13813 }
13814 HitSet::Params hsp;
11fdf7f2
TL
13815 auto& cache_hit_set_type =
13816 g_conf().get_val<string>("osd_tier_default_cache_hit_set_type");
13817 if (cache_hit_set_type == "bloom") {
7c673cae 13818 BloomHitSet::Params *bsp = new BloomHitSet::Params;
11fdf7f2 13819 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
7c673cae 13820 hsp = HitSet::Params(bsp);
11fdf7f2 13821 } else if (cache_hit_set_type == "explicit_hash") {
7c673cae 13822 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
11fdf7f2 13823 } else if (cache_hit_set_type == "explicit_object") {
7c673cae
FG
13824 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
13825 } else {
11fdf7f2
TL
13826 ss << "osd tier cache default hit set type '"
13827 << cache_hit_set_type << "' is not a known type";
7c673cae
FG
13828 err = -EINVAL;
13829 goto reply;
13830 }
13831 // go
13832 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13833 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13834 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13835 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13836 return true;
13837 }
13838 np->tiers.insert(tierpool_id);
13839 np->read_tier = np->write_tier = tierpool_id;
13840 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13841 np->set_last_force_op_resend(pending_inc.epoch);
13842 ntp->set_last_force_op_resend(pending_inc.epoch);
13843 ntp->tier_of = pool_id;
13844 ntp->cache_mode = mode;
11fdf7f2
TL
13845 ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count");
13846 ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period");
13847 ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13848 ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13849 ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13850 ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
7c673cae
FG
13851 ntp->hit_set_params = hsp;
13852 ntp->target_max_bytes = size;
13853 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
13854 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13855 get_last_committed() + 1));
13856 return true;
13857 } else if (prefix == "osd pool set-quota") {
13858 string poolstr;
9f95a23c 13859 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
13860 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13861 if (pool_id < 0) {
13862 ss << "unrecognized pool '" << poolstr << "'";
13863 err = -ENOENT;
13864 goto reply;
13865 }
13866
13867 string field;
9f95a23c 13868 cmd_getval(cmdmap, "field", field);
7c673cae
FG
13869 if (field != "max_objects" && field != "max_bytes") {
13870 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
13871 err = -EINVAL;
13872 goto reply;
13873 }
13874
13875 // val could contain unit designations, so we treat as a string
13876 string val;
9f95a23c 13877 cmd_getval(cmdmap, "val", val);
1adf2230
AA
13878 string tss;
13879 int64_t value;
13880 if (field == "max_objects") {
20effc67 13881 value = strict_si_cast<uint64_t>(val, &tss);
1adf2230 13882 } else if (field == "max_bytes") {
20effc67 13883 value = strict_iecstrtoll(val, &tss);
1adf2230 13884 } else {
11fdf7f2 13885 ceph_abort_msg("unrecognized option");
1adf2230
AA
13886 }
13887 if (!tss.empty()) {
13888 ss << "error parsing value '" << val << "': " << tss;
13889 err = -EINVAL;
7c673cae
FG
13890 goto reply;
13891 }
13892
13893 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
13894 if (field == "max_objects") {
13895 pi->quota_max_objects = value;
13896 } else if (field == "max_bytes") {
13897 pi->quota_max_bytes = value;
13898 } else {
11fdf7f2 13899 ceph_abort_msg("unrecognized option");
7c673cae
FG
13900 }
13901 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
13902 rs = ss.str();
13903 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13904 get_last_committed() + 1));
13905 return true;
c07f9fc5
FG
13906 } else if (prefix == "osd pool application enable" ||
13907 prefix == "osd pool application disable" ||
13908 prefix == "osd pool application set" ||
13909 prefix == "osd pool application rm") {
13910 err = prepare_command_pool_application(prefix, cmdmap, ss);
11fdf7f2 13911 if (err == -EAGAIN) {
c07f9fc5 13912 goto wait;
11fdf7f2 13913 } else if (err < 0) {
7c673cae 13914 goto reply;
7c673cae 13915 } else {
11fdf7f2 13916 goto update;
7c673cae 13917 }
c07f9fc5
FG
13918 } else if (prefix == "osd force-create-pg") {
13919 pg_t pgid;
13920 string pgidstr;
1e59de90
TL
13921 err = parse_pgid(cmdmap, ss, pgid, pgidstr);
13922 if (err < 0)
94b18763 13923 goto reply;
11fdf7f2 13924 bool sure = false;
9f95a23c 13925 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11fdf7f2
TL
13926 if (!sure) {
13927 ss << "This command will recreate a lost (as in data lost) PG with data in it, such "
13928 << "that the cluster will give up ever trying to recover the lost data. Do this "
13929 << "only if you are certain that all copies of the PG are in fact lost and you are "
13930 << "willing to accept that the data is permanently destroyed. Pass "
13931 << "--yes-i-really-mean-it to proceed.";
13932 err = -EPERM;
13933 goto reply;
13934 }
c07f9fc5
FG
13935 bool creating_now;
13936 {
13937 std::lock_guard<std::mutex> l(creating_pgs_lock);
9f95a23c
TL
13938 auto emplaced = creating_pgs.pgs.emplace(
13939 pgid,
13940 creating_pgs_t::pg_create_info(osdmap.get_epoch(),
13941 ceph_clock_now()));
c07f9fc5
FG
13942 creating_now = emplaced.second;
13943 }
13944 if (creating_now) {
13945 ss << "pg " << pgidstr << " now creating, ok";
11fdf7f2
TL
13946 // set the pool's CREATING flag so that (1) the osd won't ignore our
13947 // create message and (2) we won't propose any future pg_num changes
13948 // until after the PG has been instantiated.
13949 if (pending_inc.new_pools.count(pgid.pool()) == 0) {
13950 pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool());
13951 }
13952 pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING;
c07f9fc5
FG
13953 err = 0;
13954 goto update;
13955 } else {
13956 ss << "pg " << pgid << " already creating";
13957 err = 0;
13958 goto reply;
13959 }
f67539c2
TL
13960 } else if (prefix == "osd force_healthy_stretch_mode") {
13961 bool sure = false;
13962 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13963 if (!sure) {
13964 ss << "This command will require peering across multiple CRUSH buckets "
13965 "(probably two data centers or availability zones?) and may result in PGs "
13966 "going inactive until backfilling is complete. Pass --yes-i-really-mean-it to proceed.";
13967 err = -EPERM;
13968 goto reply;
13969 }
13970 try_end_recovery_stretch_mode(true);
13971 ss << "Triggering healthy stretch mode";
13972 err = 0;
13973 goto reply;
13974 } else if (prefix == "osd force_recovery_stretch_mode") {
13975 bool sure = false;
13976 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13977 if (!sure) {
13978 ss << "This command will increase pool sizes to try and spread them "
13979 "across multiple CRUSH buckets (probably two data centers or "
13980 "availability zones?) and should have happened automatically"
13981 "Pass --yes-i-really-mean-it to proceed.";
13982 err = -EPERM;
13983 goto reply;
13984 }
13985 mon.go_recovery_stretch_mode();
13986 ss << "Triggering recovery stretch mode";
13987 err = 0;
13988 goto reply;
1e59de90
TL
13989 } else if (prefix == "osd set-allow-crimson") {
13990
13991 bool sure = false;
13992 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13993
13994 bool experimental_enabled =
13995 g_ceph_context->check_experimental_feature_enabled("crimson");
13996 if (!sure || !experimental_enabled) {
13997 ss << "This command will allow usage of crimson-osd osd daemons. "
13998 << "crimson-osd is not considered stable and will likely cause "
13999 << "crashes or data corruption. At this time, crimson-osd is mainly "
14000 << "useful for performance evaluation, testing, and development. "
14001 << "If you are sure, add --yes-i-really-mean-it and add 'crimson' to "
14002 << "the experimental features config. This setting is irrevocable.";
14003 err = -EPERM;
14004 goto reply;
14005 }
14006
14007 err = 0;
14008 if (osdmap.get_allow_crimson()) {
14009 goto reply;
14010 } else {
14011 pending_inc.set_allow_crimson();
14012 goto update;
14013 }
7c673cae
FG
14014 } else {
14015 err = -EINVAL;
14016 }
14017
14018 reply:
14019 getline(ss, rs);
14020 if (err < 0 && rs.length() == 0)
14021 rs = cpp_strerror(err);
f67539c2 14022 mon.reply_command(op, err, rs, rdata, get_last_committed());
7c673cae
FG
14023 return ret;
14024
14025 update:
14026 getline(ss, rs);
14027 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
14028 get_last_committed() + 1));
14029 return true;
14030
14031 wait:
14032 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
14033 return true;
14034}
14035
28e407b8 14036bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
7c673cae
FG
14037{
14038 op->mark_osdmon_event(__func__);
28e407b8 14039
9f95a23c 14040 auto m = op->get_req<MPoolOp>();
11fdf7f2 14041 MonSession *session = op->get_session();
28e407b8
AA
14042 if (!session) {
14043 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
14044 return true;
14045 }
14046
14047 switch (m->op) {
14048 case POOL_OP_CREATE_UNMANAGED_SNAP:
14049 case POOL_OP_DELETE_UNMANAGED_SNAP:
14050 {
14051 const std::string* pool_name = nullptr;
14052 const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
14053 if (pg_pool != nullptr) {
14054 pool_name = &osdmap.get_pool_name(m->pool);
14055 }
14056
f67539c2 14057 if (!is_unmanaged_snap_op_permitted(cct, mon.key_server,
28e407b8 14058 session->entity_name, session->caps,
11fdf7f2 14059 session->get_peer_socket_addr(),
28e407b8
AA
14060 pool_name)) {
14061 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
14062 << "privileges. message: " << *m << std::endl
14063 << "caps: " << session->caps << dendl;
14064 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
14065 return true;
14066 }
14067 }
14068 break;
14069 default:
14070 if (!session->is_capable("osd", MON_CAP_W)) {
14071 dout(0) << "got pool op from entity with insufficient privileges. "
14072 << "message: " << *m << std::endl
14073 << "caps: " << session->caps << dendl;
14074 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
14075 return true;
14076 }
14077 break;
14078 }
14079
14080 return false;
14081}
14082
14083bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
14084{
14085 op->mark_osdmon_event(__func__);
9f95a23c 14086 auto m = op->get_req<MPoolOp>();
28e407b8
AA
14087
14088 if (enforce_pool_op_caps(op)) {
14089 return true;
14090 }
14091
f67539c2 14092 if (m->fsid != mon.monmap->fsid) {
7c673cae 14093 dout(0) << __func__ << " drop message on fsid " << m->fsid
f67539c2 14094 << " != " << mon.monmap->fsid << " for " << *m << dendl;
7c673cae
FG
14095 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
14096 return true;
14097 }
14098
14099 if (m->op == POOL_OP_CREATE)
14100 return preprocess_pool_op_create(op);
14101
11fdf7f2
TL
14102 const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
14103 if (p == nullptr) {
7c673cae 14104 dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
11fdf7f2
TL
14105 if (m->op == POOL_OP_DELETE) {
14106 _pool_op_reply(op, 0, osdmap.get_epoch());
14107 } else {
14108 _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
14109 }
7c673cae
FG
14110 return true;
14111 }
14112
14113 // check if the snap and snapname exist
14114 bool snap_exists = false;
7c673cae
FG
14115 if (p->snap_exists(m->name.c_str()))
14116 snap_exists = true;
14117
14118 switch (m->op) {
14119 case POOL_OP_CREATE_SNAP:
14120 if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
14121 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
14122 return true;
14123 }
14124 if (snap_exists) {
14125 _pool_op_reply(op, 0, osdmap.get_epoch());
14126 return true;
14127 }
14128 return false;
14129 case POOL_OP_CREATE_UNMANAGED_SNAP:
14130 if (p->is_pool_snaps_mode()) {
14131 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
14132 return true;
14133 }
14134 return false;
14135 case POOL_OP_DELETE_SNAP:
14136 if (p->is_unmanaged_snaps_mode()) {
14137 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
14138 return true;
14139 }
14140 if (!snap_exists) {
14141 _pool_op_reply(op, 0, osdmap.get_epoch());
14142 return true;
14143 }
14144 return false;
14145 case POOL_OP_DELETE_UNMANAGED_SNAP:
14146 if (p->is_pool_snaps_mode()) {
14147 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
14148 return true;
14149 }
9f95a23c 14150 if (_is_removed_snap(m->pool, m->snapid)) {
7c673cae
FG
14151 _pool_op_reply(op, 0, osdmap.get_epoch());
14152 return true;
14153 }
14154 return false;
14155 case POOL_OP_DELETE:
14156 if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
14157 _pool_op_reply(op, 0, osdmap.get_epoch());
14158 return true;
14159 }
14160 return false;
14161 case POOL_OP_AUID_CHANGE:
14162 return false;
14163 default:
14164 ceph_abort();
14165 break;
14166 }
14167
14168 return false;
14169}
14170
9f95a23c
TL
14171bool OSDMonitor::_is_removed_snap(int64_t pool, snapid_t snap)
14172{
14173 if (!osdmap.have_pg_pool(pool)) {
14174 dout(10) << __func__ << " pool " << pool << " snap " << snap
14175 << " - pool dne" << dendl;
14176 return true;
14177 }
14178 if (osdmap.in_removed_snaps_queue(pool, snap)) {
14179 dout(10) << __func__ << " pool " << pool << " snap " << snap
14180 << " - in osdmap removed_snaps_queue" << dendl;
14181 return true;
14182 }
14183 snapid_t begin, end;
14184 int r = lookup_purged_snap(pool, snap, &begin, &end);
14185 if (r == 0) {
14186 dout(10) << __func__ << " pool " << pool << " snap " << snap
14187 << " - purged, [" << begin << "," << end << ")" << dendl;
14188 return true;
14189 }
14190 return false;
14191}
14192
14193bool OSDMonitor::_is_pending_removed_snap(int64_t pool, snapid_t snap)
14194{
14195 if (pending_inc.old_pools.count(pool)) {
14196 dout(10) << __func__ << " pool " << pool << " snap " << snap
14197 << " - pool pending deletion" << dendl;
14198 return true;
14199 }
14200 if (pending_inc.in_new_removed_snaps(pool, snap)) {
14201 dout(10) << __func__ << " pool " << pool << " snap " << snap
14202 << " - in pending new_removed_snaps" << dendl;
14203 return true;
14204 }
14205 return false;
14206}
14207
7c673cae
FG
14208bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
14209{
14210 op->mark_osdmon_event(__func__);
9f95a23c 14211 auto m = op->get_req<MPoolOp>();
7c673cae
FG
14212 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
14213 if (pool >= 0) {
14214 _pool_op_reply(op, 0, osdmap.get_epoch());
14215 return true;
14216 }
14217
14218 return false;
14219}
14220
14221bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
14222{
14223 op->mark_osdmon_event(__func__);
9f95a23c 14224 auto m = op->get_req<MPoolOp>();
7c673cae
FG
14225 dout(10) << "prepare_pool_op " << *m << dendl;
14226 if (m->op == POOL_OP_CREATE) {
14227 return prepare_pool_op_create(op);
14228 } else if (m->op == POOL_OP_DELETE) {
14229 return prepare_pool_op_delete(op);
14230 }
14231
14232 int ret = 0;
14233 bool changed = false;
14234
14235 if (!osdmap.have_pg_pool(m->pool)) {
14236 _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
14237 return false;
14238 }
14239
14240 const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);
14241
39ae355f
TL
14242 if (m->op == POOL_OP_CREATE_SNAP ||
14243 m->op == POOL_OP_CREATE_UNMANAGED_SNAP) {
14244 if (const auto& fsmap = mon.mdsmon()->get_fsmap(); fsmap.pool_in_use(m->pool)) {
14245 dout(20) << "monitor-managed snapshots have been disabled for pools "
14246 " attached to an fs - pool:" << m->pool << dendl;
14247 _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
14248 return false;
14249 }
14250 }
14251
7c673cae
FG
14252 switch (m->op) {
14253 case POOL_OP_CREATE_SNAP:
14254 if (pool->is_tier()) {
14255 ret = -EINVAL;
14256 _pool_op_reply(op, ret, osdmap.get_epoch());
14257 return false;
14258 } // else, fall through
14259 case POOL_OP_DELETE_SNAP:
14260 if (!pool->is_unmanaged_snaps_mode()) {
14261 bool snap_exists = pool->snap_exists(m->name.c_str());
14262 if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
14263 || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
14264 ret = 0;
14265 } else {
14266 break;
14267 }
14268 } else {
14269 ret = -EINVAL;
14270 }
14271 _pool_op_reply(op, ret, osdmap.get_epoch());
14272 return false;
14273
14274 case POOL_OP_DELETE_UNMANAGED_SNAP:
14275 // we won't allow removal of an unmanaged snapshot from a pool
14276 // not in unmanaged snaps mode.
14277 if (!pool->is_unmanaged_snaps_mode()) {
14278 _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
14279 return false;
14280 }
14281 /* fall-thru */
14282 case POOL_OP_CREATE_UNMANAGED_SNAP:
14283 // but we will allow creating an unmanaged snapshot on any pool
14284 // as long as it is not in 'pool' snaps mode.
14285 if (pool->is_pool_snaps_mode()) {
14286 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
14287 return false;
14288 }
14289 }
14290
14291 // projected pool info
14292 pg_pool_t pp;
14293 if (pending_inc.new_pools.count(m->pool))
14294 pp = pending_inc.new_pools[m->pool];
14295 else
14296 pp = *osdmap.get_pg_pool(m->pool);
14297
14298 bufferlist reply_data;
14299
14300 // pool snaps vs unmanaged snaps are mutually exclusive
14301 switch (m->op) {
14302 case POOL_OP_CREATE_SNAP:
14303 case POOL_OP_DELETE_SNAP:
14304 if (pp.is_unmanaged_snaps_mode()) {
14305 ret = -EINVAL;
14306 goto out;
14307 }
14308 break;
14309
14310 case POOL_OP_CREATE_UNMANAGED_SNAP:
14311 case POOL_OP_DELETE_UNMANAGED_SNAP:
14312 if (pp.is_pool_snaps_mode()) {
14313 ret = -EINVAL;
14314 goto out;
14315 }
14316 }
14317
14318 switch (m->op) {
14319 case POOL_OP_CREATE_SNAP:
14320 if (!pp.snap_exists(m->name.c_str())) {
14321 pp.add_snap(m->name.c_str(), ceph_clock_now());
11fdf7f2
TL
14322 dout(10) << "create snap in pool " << m->pool << " " << m->name
14323 << " seq " << pp.get_snap_epoch() << dendl;
7c673cae
FG
14324 changed = true;
14325 }
14326 break;
14327
14328 case POOL_OP_DELETE_SNAP:
14329 {
14330 snapid_t s = pp.snap_exists(m->name.c_str());
14331 if (s) {
14332 pp.remove_snap(s);
11fdf7f2 14333 pending_inc.new_removed_snaps[m->pool].insert(s);
7c673cae
FG
14334 changed = true;
14335 }
14336 }
14337 break;
14338
14339 case POOL_OP_CREATE_UNMANAGED_SNAP:
14340 {
9f95a23c
TL
14341 uint64_t snapid = pp.add_unmanaged_snap(
14342 osdmap.require_osd_release < ceph_release_t::octopus);
11fdf7f2 14343 encode(snapid, reply_data);
7c673cae
FG
14344 changed = true;
14345 }
14346 break;
14347
14348 case POOL_OP_DELETE_UNMANAGED_SNAP:
9f95a23c
TL
14349 if (!_is_removed_snap(m->pool, m->snapid) &&
14350 !_is_pending_removed_snap(m->pool, m->snapid)) {
28e407b8
AA
14351 if (m->snapid > pp.get_snap_seq()) {
14352 _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
14353 return false;
14354 }
9f95a23c
TL
14355 pp.remove_unmanaged_snap(
14356 m->snapid,
14357 osdmap.require_osd_release < ceph_release_t::octopus);
11fdf7f2 14358 pending_inc.new_removed_snaps[m->pool].insert(m->snapid);
9f95a23c
TL
14359 // also record the new seq as purged: this avoids a discontinuity
14360 // after all of the snaps have been purged, since the seq assigned
14361 // during removal lives in the same namespace as the actual snaps.
14362 pending_pseudo_purged_snaps[m->pool].insert(pp.get_snap_seq());
7c673cae
FG
14363 changed = true;
14364 }
14365 break;
14366
14367 case POOL_OP_AUID_CHANGE:
11fdf7f2
TL
14368 _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
14369 return false;
7c673cae
FG
14370
14371 default:
14372 ceph_abort();
14373 break;
14374 }
14375
14376 if (changed) {
14377 pp.set_snap_epoch(pending_inc.epoch);
14378 pending_inc.new_pools[m->pool] = pp;
14379 }
14380
14381 out:
14382 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
14383 return true;
14384}
14385
14386bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
14387{
14388 op->mark_osdmon_event(__func__);
14389 int err = prepare_new_pool(op);
14390 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
14391 return true;
14392}
14393
14394int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
14395 ostream *ss)
14396{
14397 const string& poolstr = osdmap.get_pool_name(pool_id);
14398
14399 // If the Pool is in use by CephFS, refuse to delete it
f67539c2 14400 FSMap const &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
7c673cae
FG
14401 if (pending_fsmap.pool_in_use(pool_id)) {
14402 *ss << "pool '" << poolstr << "' is in use by CephFS";
14403 return -EBUSY;
14404 }
14405
14406 if (pool.tier_of >= 0) {
14407 *ss << "pool '" << poolstr << "' is a tier of '"
14408 << osdmap.get_pool_name(pool.tier_of) << "'";
14409 return -EBUSY;
14410 }
14411 if (!pool.tiers.empty()) {
14412 *ss << "pool '" << poolstr << "' has tiers";
14413 for(auto tier : pool.tiers) {
14414 *ss << " " << osdmap.get_pool_name(tier);
14415 }
14416 return -EBUSY;
14417 }
14418
11fdf7f2 14419 if (!g_conf()->mon_allow_pool_delete) {
7c673cae
FG
14420 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
14421 return -EPERM;
14422 }
14423
14424 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
14425 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
14426 return -EPERM;
14427 }
14428
14429 *ss << "pool '" << poolstr << "' removed";
14430 return 0;
14431}
14432
14433/**
14434 * Check if it is safe to add a tier to a base pool
14435 *
14436 * @return
14437 * True if the operation should proceed, false if we should abort here
14438 * (abort doesn't necessarily mean error, could be idempotency)
14439 */
14440bool OSDMonitor::_check_become_tier(
14441 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
14442 const int64_t base_pool_id, const pg_pool_t *base_pool,
14443 int *err,
14444 ostream *ss) const
14445{
14446 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
14447 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
14448
1e59de90
TL
14449 if (tier_pool->is_crimson()) {
14450 *ss << "pool '" << tier_pool_name << "' is a crimson pool, tiering "
14451 << "features are not supported";
14452 *err = -EINVAL;
14453 return false;
14454 }
14455 if (base_pool->is_crimson()) {
14456 *ss << "pool '" << base_pool_name << "' is a crimson pool, tiering "
14457 << "features are not supported";
14458 *err = -EINVAL;
14459 return false;
14460 }
14461
f67539c2 14462 const FSMap &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
7c673cae
FG
14463 if (pending_fsmap.pool_in_use(tier_pool_id)) {
14464 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
14465 *err = -EBUSY;
14466 return false;
14467 }
14468
14469 if (base_pool->tiers.count(tier_pool_id)) {
11fdf7f2 14470 ceph_assert(tier_pool->tier_of == base_pool_id);
7c673cae
FG
14471 *err = 0;
14472 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
14473 << base_pool_name << "'";
14474 return false;
14475 }
14476
14477 if (base_pool->is_tier()) {
14478 *ss << "pool '" << base_pool_name << "' is already a tier of '"
14479 << osdmap.get_pool_name(base_pool->tier_of) << "', "
14480 << "multiple tiers are not yet supported.";
14481 *err = -EINVAL;
14482 return false;
14483 }
14484
14485 if (tier_pool->has_tiers()) {
14486 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
14487 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
14488 it != tier_pool->tiers.end(); ++it)
14489 *ss << "'" << osdmap.get_pool_name(*it) << "',";
14490 *ss << " multiple tiers are not yet supported.";
14491 *err = -EINVAL;
14492 return false;
14493 }
14494
14495 if (tier_pool->is_tier()) {
14496 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
14497 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
14498 *err = -EINVAL;
14499 return false;
14500 }
14501
14502 *err = 0;
14503 return true;
14504}
14505
14506
14507/**
14508 * Check if it is safe to remove a tier from this base pool
14509 *
14510 * @return
14511 * True if the operation should proceed, false if we should abort here
14512 * (abort doesn't necessarily mean error, could be idempotency)
14513 */
14514bool OSDMonitor::_check_remove_tier(
14515 const int64_t base_pool_id, const pg_pool_t *base_pool,
14516 const pg_pool_t *tier_pool,
14517 int *err, ostream *ss) const
14518{
14519 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
14520
14521 // Apply CephFS-specific checks
f67539c2 14522 const FSMap &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
7c673cae 14523 if (pending_fsmap.pool_in_use(base_pool_id)) {
94b18763
FG
14524 if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
14525 // If the underlying pool is erasure coded and does not allow EC
14526 // overwrites, we can't permit the removal of the replicated tier that
14527 // CephFS relies on to access it
14528 *ss << "pool '" << base_pool_name <<
14529 "' does not allow EC overwrites and is in use by CephFS"
14530 " via its tier";
7c673cae
FG
14531 *err = -EBUSY;
14532 return false;
14533 }
14534
14535 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
14536 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
14537 "tier is still in use as a writeback cache. Change the cache "
14538 "mode and flush the cache before removing it";
14539 *err = -EBUSY;
14540 return false;
14541 }
14542 }
14543
14544 *err = 0;
14545 return true;
14546}
14547
14548int OSDMonitor::_prepare_remove_pool(
14549 int64_t pool, ostream *ss, bool no_fake)
14550{
224ce89b 14551 dout(10) << __func__ << " " << pool << dendl;
7c673cae
FG
14552 const pg_pool_t *p = osdmap.get_pg_pool(pool);
14553 int r = _check_remove_pool(pool, *p, ss);
14554 if (r < 0)
14555 return r;
14556
14557 auto new_pool = pending_inc.new_pools.find(pool);
14558 if (new_pool != pending_inc.new_pools.end()) {
14559 // if there is a problem with the pending info, wait and retry
14560 // this op.
14561 const auto& p = new_pool->second;
14562 int r = _check_remove_pool(pool, p, ss);
14563 if (r < 0)
14564 return -EAGAIN;
14565 }
14566
14567 if (pending_inc.old_pools.count(pool)) {
224ce89b 14568 dout(10) << __func__ << " " << pool << " already pending removal"
7c673cae
FG
14569 << dendl;
14570 return 0;
14571 }
14572
11fdf7f2 14573 if (g_conf()->mon_fake_pool_delete && !no_fake) {
7c673cae
FG
14574 string old_name = osdmap.get_pool_name(pool);
14575 string new_name = old_name + "." + stringify(pool) + ".DELETED";
14576 dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
14577 << old_name << " -> " << new_name << dendl;
14578 pending_inc.new_pool_names[pool] = new_name;
14579 return 0;
14580 }
14581
14582 // remove
14583 pending_inc.old_pools.insert(pool);
14584
224ce89b 14585 // remove any pg_temp mappings for this pool
7c673cae
FG
14586 for (auto p = osdmap.pg_temp->begin();
14587 p != osdmap.pg_temp->end();
14588 ++p) {
11fdf7f2 14589 if (p->first.pool() == pool) {
224ce89b 14590 dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
7c673cae
FG
14591 << p->first << dendl;
14592 pending_inc.new_pg_temp[p->first].clear();
14593 }
14594 }
224ce89b 14595 // remove any primary_temp mappings for this pool
7c673cae
FG
14596 for (auto p = osdmap.primary_temp->begin();
14597 p != osdmap.primary_temp->end();
14598 ++p) {
11fdf7f2 14599 if (p->first.pool() == pool) {
224ce89b 14600 dout(10) << __func__ << " " << pool
7c673cae
FG
14601 << " removing obsolete primary_temp" << p->first << dendl;
14602 pending_inc.new_primary_temp[p->first] = -1;
14603 }
14604 }
224ce89b
WB
14605 // remove any pg_upmap mappings for this pool
14606 for (auto& p : osdmap.pg_upmap) {
11fdf7f2 14607 if (p.first.pool() == pool) {
224ce89b
WB
14608 dout(10) << __func__ << " " << pool
14609 << " removing obsolete pg_upmap "
14610 << p.first << dendl;
14611 pending_inc.old_pg_upmap.insert(p.first);
14612 }
14613 }
94b18763
FG
14614 // remove any pending pg_upmap mappings for this pool
14615 {
14616 auto it = pending_inc.new_pg_upmap.begin();
14617 while (it != pending_inc.new_pg_upmap.end()) {
11fdf7f2 14618 if (it->first.pool() == pool) {
94b18763
FG
14619 dout(10) << __func__ << " " << pool
14620 << " removing pending pg_upmap "
14621 << it->first << dendl;
14622 it = pending_inc.new_pg_upmap.erase(it);
14623 } else {
14624 it++;
14625 }
14626 }
14627 }
224ce89b
WB
14628 // remove any pg_upmap_items mappings for this pool
14629 for (auto& p : osdmap.pg_upmap_items) {
11fdf7f2 14630 if (p.first.pool() == pool) {
224ce89b
WB
14631 dout(10) << __func__ << " " << pool
14632 << " removing obsolete pg_upmap_items " << p.first
14633 << dendl;
14634 pending_inc.old_pg_upmap_items.insert(p.first);
14635 }
14636 }
94b18763
FG
14637 // remove any pending pg_upmap mappings for this pool
14638 {
14639 auto it = pending_inc.new_pg_upmap_items.begin();
14640 while (it != pending_inc.new_pg_upmap_items.end()) {
11fdf7f2 14641 if (it->first.pool() == pool) {
94b18763
FG
14642 dout(10) << __func__ << " " << pool
14643 << " removing pending pg_upmap_items "
14644 << it->first << dendl;
14645 it = pending_inc.new_pg_upmap_items.erase(it);
14646 } else {
14647 it++;
14648 }
14649 }
14650 }
35e4c445
FG
14651
14652 // remove any choose_args for this pool
20effc67 14653 CrushWrapper newcrush = _get_pending_crush();
35e4c445
FG
14654 if (newcrush.have_choose_args(pool)) {
14655 dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
14656 newcrush.rm_choose_args(pool);
14657 pending_inc.crush.clear();
f67539c2 14658 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
35e4c445 14659 }
7c673cae
FG
14660 return 0;
14661}
14662
14663int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
14664{
14665 dout(10) << "_prepare_rename_pool " << pool << dendl;
14666 if (pending_inc.old_pools.count(pool)) {
14667 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
14668 return -ENOENT;
14669 }
14670 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
14671 p != pending_inc.new_pool_names.end();
14672 ++p) {
14673 if (p->second == newname && p->first != pool) {
14674 return -EEXIST;
14675 }
14676 }
14677
14678 pending_inc.new_pool_names[pool] = newname;
14679 return 0;
14680}
14681
14682bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
14683{
14684 op->mark_osdmon_event(__func__);
9f95a23c 14685 auto m = op->get_req<MPoolOp>();
7c673cae
FG
14686 ostringstream ss;
14687 int ret = _prepare_remove_pool(m->pool, &ss, false);
14688 if (ret == -EAGAIN) {
14689 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
14690 return true;
14691 }
14692 if (ret < 0)
14693 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
14694 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
14695 pending_inc.epoch));
14696 return true;
14697}
14698
14699void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
14700 int ret, epoch_t epoch, bufferlist *blp)
14701{
14702 op->mark_osdmon_event(__func__);
9f95a23c 14703 auto m = op->get_req<MPoolOp>();
7c673cae
FG
14704 dout(20) << "_pool_op_reply " << ret << dendl;
14705 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
14706 ret, epoch, get_last_committed(), blp);
f67539c2 14707 mon.send_reply(op, reply);
7c673cae 14708}
81eedcae
TL
14709
14710void OSDMonitor::convert_pool_priorities(void)
14711{
14712 pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key;
14713 int64_t max_prio = 0;
14714 int64_t min_prio = 0;
14715 for (const auto &i : osdmap.get_pools()) {
14716 const auto &pool = i.second;
14717
14718 if (pool.opts.is_set(key)) {
9f95a23c 14719 int64_t prio = 0;
81eedcae
TL
14720 pool.opts.get(key, &prio);
14721 if (prio > max_prio)
14722 max_prio = prio;
14723 if (prio < min_prio)
14724 min_prio = prio;
14725 }
14726 }
14727 if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) {
14728 dout(20) << __func__ << " nothing to fix" << dendl;
14729 return;
14730 }
14731 // Current pool priorities exceeds new maximum
14732 for (const auto &i : osdmap.get_pools()) {
14733 const auto pool_id = i.first;
14734 pg_pool_t pool = i.second;
14735
14736 int64_t prio = 0;
14737 pool.opts.get(key, &prio);
14738 int64_t n;
14739
14740 if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario
14741 // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
14742 n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX;
14743 } else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) {
14744 // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
14745 n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN;
14746 } else {
14747 continue;
14748 }
14749 if (n == 0) {
14750 pool.opts.unset(key);
14751 } else {
14752 pool.opts.set(key, static_cast<int64_t>(n));
14753 }
14754 dout(10) << __func__ << " pool " << pool_id
14755 << " recovery_priority adjusted "
14756 << prio << " to " << n << dendl;
14757 pool.last_change = pending_inc.epoch;
14758 pending_inc.new_pools[pool_id] = pool;
14759 }
14760}
f67539c2
TL
14761
14762void OSDMonitor::try_enable_stretch_mode_pools(stringstream& ss, bool *okay,
14763 int *errcode,
14764 set<pg_pool_t*>* pools,
14765 const string& new_crush_rule)
14766{
14767 dout(20) << __func__ << dendl;
14768 *okay = false;
14769 int new_crush_rule_result = osdmap.crush->get_rule_id(new_crush_rule);
14770 if (new_crush_rule_result < 0) {
14771 ss << "unrecognized crush rule " << new_crush_rule_result;
14772 *errcode = new_crush_rule_result;
14773 return;
14774 }
14775 __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
14776 for (const auto& pooli : osdmap.pools) {
14777 int64_t poolid = pooli.first;
14778 const pg_pool_t *p = &pooli.second;
14779 if (!p->is_replicated()) {
14780 ss << "stretched pools must be replicated; '" << osdmap.pool_name[poolid] << "' is erasure-coded";
14781 *errcode = -EINVAL;
14782 return;
14783 }
14784 uint8_t default_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
14785 if ((p->get_size() != default_size ||
14786 (p->get_min_size() != g_conf().get_osd_pool_default_min_size(default_size))) &&
14787 (p->get_crush_rule() != new_rule)) {
14788 ss << "we currently require stretch mode pools start out with the"
14789 " default size/min_size, which '" << osdmap.pool_name[poolid] << "' does not";
14790 *errcode = -EINVAL;
14791 return;
14792 }
14793 pg_pool_t *pp = pending_inc.get_new_pool(poolid, p);
14794 // TODO: The part where we unconditionally copy the pools into pending_inc is bad
14795 // the attempt may fail and then we have these pool updates...but they won't do anything
14796 // if there is a failure, so if it's hard to change the interface, no need to bother
14797 pools->insert(pp);
14798 }
14799 *okay = true;
14800 return;
14801}
14802
14803void OSDMonitor::try_enable_stretch_mode(stringstream& ss, bool *okay,
14804 int *errcode, bool commit,
14805 const string& dividing_bucket,
14806 uint32_t bucket_count,
14807 const set<pg_pool_t*>& pools,
14808 const string& new_crush_rule)
14809{
14810 dout(20) << __func__ << dendl;
14811 *okay = false;
20effc67
TL
14812 CrushWrapper crush = _get_pending_crush();
14813 int dividing_id = -1;
14814 if (auto type_id = crush.get_validated_type_id(dividing_bucket);
14815 !type_id.has_value()) {
f67539c2
TL
14816 ss << dividing_bucket << " is not a valid crush bucket type";
14817 *errcode = -ENOENT;
20effc67 14818 ceph_assert(!commit);
f67539c2 14819 return;
20effc67
TL
14820 } else {
14821 dividing_id = *type_id;
f67539c2
TL
14822 }
14823 vector<int> subtrees;
14824 crush.get_subtree_of_type(dividing_id, &subtrees);
14825 if (subtrees.size() != 2) {
14826 ss << "there are " << subtrees.size() << dividing_bucket
14827 << "'s in the cluster but stretch mode currently only works with 2!";
14828 *errcode = -EINVAL;
14829 ceph_assert(!commit || subtrees.size() == 2);
14830 return;
14831 }
14832
14833 int new_crush_rule_result = crush.get_rule_id(new_crush_rule);
14834 if (new_crush_rule_result < 0) {
14835 ss << "unrecognized crush rule " << new_crush_rule;
14836 *errcode = new_crush_rule_result;
14837 ceph_assert(!commit || (new_crush_rule_result > 0));
14838 return;
14839 }
14840 __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
14841
14842 int weight1 = crush.get_item_weight(subtrees[0]);
14843 int weight2 = crush.get_item_weight(subtrees[1]);
14844 if (weight1 != weight2) {
14845 // TODO: I'm really not sure this is a good idea?
14846 ss << "the 2 " << dividing_bucket
14847 << "instances in the cluster have differing weights "
14848 << weight1 << " and " << weight2
14849 <<" but stretch mode currently requires they be the same!";
14850 *errcode = -EINVAL;
14851 ceph_assert(!commit || (weight1 == weight2));
14852 return;
14853 }
14854 if (bucket_count != 2) {
14855 ss << "currently we only support 2-site stretch clusters!";
14856 *errcode = -EINVAL;
14857 ceph_assert(!commit || bucket_count == 2);
14858 return;
14859 }
14860 // TODO: check CRUSH rules for pools so that we are appropriately divided
14861 if (commit) {
14862 for (auto pool : pools) {
14863 pool->crush_rule = new_rule;
14864 pool->peering_crush_bucket_count = bucket_count;
14865 pool->peering_crush_bucket_target = bucket_count;
14866 pool->peering_crush_bucket_barrier = dividing_id;
14867 pool->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
14868 pool->size = g_conf().get_val<uint64_t>("mon_stretch_pool_size");
14869 pool->min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
14870 }
14871 pending_inc.change_stretch_mode = true;
14872 pending_inc.stretch_mode_enabled = true;
14873 pending_inc.new_stretch_bucket_count = bucket_count;
14874 pending_inc.new_degraded_stretch_mode = 0;
14875 pending_inc.new_stretch_mode_bucket = dividing_id;
14876 }
14877 *okay = true;
14878 return;
14879}
14880
14881bool OSDMonitor::check_for_dead_crush_zones(const map<string,set<string>>& dead_buckets,
14882 set<int> *really_down_buckets,
14883 set<string> *really_down_mons)
14884{
14885 dout(20) << __func__ << " with dead mon zones " << dead_buckets << dendl;
14886 ceph_assert(is_readable());
14887 if (dead_buckets.empty()) return false;
14888 set<int> down_cache;
14889 bool really_down = false;
14890 for (auto dbi : dead_buckets) {
14891 const string& bucket_name = dbi.first;
14892 ceph_assert(osdmap.crush->name_exists(bucket_name));
14893 int bucket_id = osdmap.crush->get_item_id(bucket_name);
14894 dout(20) << "Checking " << bucket_name << " id " << bucket_id
14895 << " to see if OSDs are also down" << dendl;
14896 bool subtree_down = osdmap.subtree_is_down(bucket_id, &down_cache);
14897 if (subtree_down) {
14898 dout(20) << "subtree is down!" << dendl;
14899 really_down = true;
14900 really_down_buckets->insert(bucket_id);
14901 really_down_mons->insert(dbi.second.begin(), dbi.second.end());
14902 }
14903 }
14904 dout(10) << "We determined CRUSH buckets " << *really_down_buckets
14905 << " and mons " << *really_down_mons << " are really down" << dendl;
14906 return really_down;
14907}
14908
14909void OSDMonitor::trigger_degraded_stretch_mode(const set<int>& dead_buckets,
14910 const set<string>& live_zones)
14911{
14912 dout(20) << __func__ << dendl;
14913 stretch_recovery_triggered.set_from_double(0); // reset this; we can't go clean now!
14914 // update the general OSDMap changes
14915 pending_inc.change_stretch_mode = true;
14916 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
14917 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
14918 int new_site_count = osdmap.stretch_bucket_count - dead_buckets.size();
14919 ceph_assert(new_site_count == 1); // stretch count 2!
14920 pending_inc.new_degraded_stretch_mode = new_site_count;
14921 pending_inc.new_recovering_stretch_mode = 0;
14922 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
14923
14924 // and then apply them to all the pg_pool_ts
14925 ceph_assert(live_zones.size() == 1); // only support 2 zones now
14926 const string& remaining_site_name = *(live_zones.begin());
14927 ceph_assert(osdmap.crush->name_exists(remaining_site_name));
14928 int remaining_site = osdmap.crush->get_item_id(remaining_site_name);
14929 for (auto pgi : osdmap.pools) {
14930 if (pgi.second.peering_crush_bucket_count) {
14931 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
14932 newp.peering_crush_bucket_count = new_site_count;
14933 newp.peering_crush_mandatory_member = remaining_site;
14934 newp.min_size = pgi.second.min_size / 2; // only support 2 zones now
33c7a0ef 14935 newp.set_last_force_op_resend(pending_inc.epoch);
f67539c2
TL
14936 }
14937 }
14938 propose_pending();
14939}
14940
14941void OSDMonitor::trigger_recovery_stretch_mode()
14942{
14943 dout(20) << __func__ << dendl;
14944 stretch_recovery_triggered.set_from_double(0); // reset this so we don't go full-active prematurely
14945 pending_inc.change_stretch_mode = true;
14946 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
14947 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
14948 pending_inc.new_degraded_stretch_mode = osdmap.degraded_stretch_mode;
14949 pending_inc.new_recovering_stretch_mode = 1;
14950 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
14951
14952 for (auto pgi : osdmap.pools) {
14953 if (pgi.second.peering_crush_bucket_count) {
14954 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
33c7a0ef 14955 newp.set_last_force_op_resend(pending_inc.epoch);
f67539c2
TL
14956 }
14957 }
14958 propose_pending();
14959}
14960
b3b6e05e
TL
14961void OSDMonitor::set_degraded_stretch_mode()
14962{
14963 stretch_recovery_triggered.set_from_double(0);
14964}
14965
14966void OSDMonitor::set_recovery_stretch_mode()
14967{
14968 if (stretch_recovery_triggered.is_zero()) {
14969 stretch_recovery_triggered = ceph_clock_now();
14970 }
14971}
14972
14973void OSDMonitor::set_healthy_stretch_mode()
14974{
14975 stretch_recovery_triggered.set_from_double(0);
14976}
14977
f67539c2
TL
14978void OSDMonitor::notify_new_pg_digest()
14979{
14980 dout(20) << __func__ << dendl;
14981 if (!stretch_recovery_triggered.is_zero()) {
14982 try_end_recovery_stretch_mode(false);
14983 }
14984}
14985
14986struct CMonExitRecovery : public Context {
14987 OSDMonitor *m;
14988 bool force;
14989 CMonExitRecovery(OSDMonitor *mon, bool f) : m(mon), force(f) {}
14990 void finish(int r) {
14991 m->try_end_recovery_stretch_mode(force);
14992 }
14993};
14994
14995void OSDMonitor::try_end_recovery_stretch_mode(bool force)
14996{
14997 dout(20) << __func__ << dendl;
14998 if (!mon.is_leader()) return;
14999 if (!mon.is_degraded_stretch_mode()) return;
15000 if (!mon.is_recovering_stretch_mode()) return;
15001 if (!is_readable()) {
15002 wait_for_readable_ctx(new CMonExitRecovery(this, force));
15003 return;
15004 }
15005
15006 if (osdmap.recovering_stretch_mode &&
15007 ((!stretch_recovery_triggered.is_zero() &&
15008 ceph_clock_now() - g_conf().get_val<double>("mon_stretch_recovery_min_wait") >
15009 stretch_recovery_triggered) ||
15010 force)) {
15011 if (!mon.mgrstatmon()->is_readable()) {
15012 mon.mgrstatmon()->wait_for_readable_ctx(new CMonExitRecovery(this, force));
15013 return;
15014 }
15015 const PGMapDigest& pgd = mon.mgrstatmon()->get_digest();
15016 double misplaced, degraded, inactive, unknown;
15017 pgd.get_recovery_stats(&misplaced, &degraded, &inactive, &unknown);
15018 if (force || (degraded == 0.0 && inactive == 0.0 && unknown == 0.0)) {
15019 // we can exit degraded stretch mode!
15020 mon.trigger_healthy_stretch_mode();
15021 }
15022 }
15023}
15024
15025void OSDMonitor::trigger_healthy_stretch_mode()
15026{
15027 ceph_assert(is_writeable());
15028 stretch_recovery_triggered.set_from_double(0);
15029 pending_inc.change_stretch_mode = true;
15030 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
15031 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
15032 pending_inc.new_degraded_stretch_mode = 0; // turn off degraded mode...
15033 pending_inc.new_recovering_stretch_mode = 0; //...and recovering mode!
15034 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
15035 for (auto pgi : osdmap.pools) {
15036 if (pgi.second.peering_crush_bucket_count) {
15037 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
15038 newp.peering_crush_bucket_count = osdmap.stretch_bucket_count;
15039 newp.peering_crush_mandatory_member = CRUSH_ITEM_NONE;
15040 newp.min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
33c7a0ef 15041 newp.set_last_force_op_resend(pending_inc.epoch);
f67539c2
TL
15042 }
15043 }
15044 propose_pending();
15045}