]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/OSDMonitor.cc
import 15.2.4
[ceph.git] / ceph / src / mon / OSDMonitor.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19#include <algorithm>
224ce89b 20#include <boost/algorithm/string.hpp>
11fdf7f2 21#include <experimental/iterator>
224ce89b 22#include <locale>
7c673cae
FG
23#include <sstream>
24
31f18b77
FG
25#include "mon/OSDMonitor.h"
26#include "mon/Monitor.h"
27#include "mon/MDSMonitor.h"
31f18b77
FG
28#include "mon/MgrStatMonitor.h"
29#include "mon/AuthMonitor.h"
30#include "mon/ConfigKeyService.h"
7c673cae 31
31f18b77
FG
32#include "mon/MonitorDBStore.h"
33#include "mon/Session.h"
7c673cae
FG
34
35#include "crush/CrushWrapper.h"
36#include "crush/CrushTester.h"
37#include "crush/CrushTreeDumper.h"
38
39#include "messages/MOSDBeacon.h"
40#include "messages/MOSDFailure.h"
41#include "messages/MOSDMarkMeDown.h"
9f95a23c 42#include "messages/MOSDMarkMeDead.h"
7c673cae
FG
43#include "messages/MOSDFull.h"
44#include "messages/MOSDMap.h"
45#include "messages/MMonGetOSDMap.h"
46#include "messages/MOSDBoot.h"
47#include "messages/MOSDAlive.h"
48#include "messages/MPoolOp.h"
49#include "messages/MPoolOpReply.h"
50#include "messages/MOSDPGCreate.h"
11fdf7f2 51#include "messages/MOSDPGCreate2.h"
7c673cae
FG
52#include "messages/MOSDPGCreated.h"
53#include "messages/MOSDPGTemp.h"
11fdf7f2 54#include "messages/MOSDPGReadyToMerge.h"
7c673cae
FG
55#include "messages/MMonCommand.h"
56#include "messages/MRemoveSnaps.h"
57#include "messages/MOSDScrub.h"
58#include "messages/MRoute.h"
9f95a23c
TL
59#include "messages/MMonGetPurgedSnaps.h"
60#include "messages/MMonGetPurgedSnapsReply.h"
7c673cae
FG
61
62#include "common/TextTable.h"
63#include "common/Timer.h"
64#include "common/ceph_argparse.h"
65#include "common/perf_counters.h"
eafe8130 66#include "common/PriorityCache.h"
7c673cae 67#include "common/strtol.h"
11fdf7f2 68#include "common/numa.h"
7c673cae
FG
69
70#include "common/config.h"
71#include "common/errno.h"
72
73#include "erasure-code/ErasureCodePlugin.h"
74#include "compressor/Compressor.h"
75#include "common/Checksummer.h"
76
77#include "include/compat.h"
11fdf7f2 78#include "include/ceph_assert.h"
7c673cae
FG
79#include "include/stringify.h"
80#include "include/util.h"
81#include "common/cmdparse.h"
82#include "include/str_list.h"
83#include "include/str_map.h"
224ce89b 84#include "include/scope_guard.h"
eafe8130 85#include "perfglue/heap_profiler.h"
7c673cae 86
28e407b8
AA
87#include "auth/cephx/CephxKeyServer.h"
88#include "osd/OSDCap.h"
89
7c673cae
FG
90#include "json_spirit/json_spirit_reader.h"
91
c07f9fc5
FG
92#include <boost/algorithm/string/predicate.hpp>
93
7c673cae 94#define dout_subsys ceph_subsys_mon
3efd9988
FG
// MonitorDBStore key prefixes owned by this service (see also
// get_store_prefixes(), which must list every prefix written here).
static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");  // in-progress PG creations
static const string OSD_METADATA_PREFIX("osd_metadata");        // per-OSD metadata blobs
static const string OSD_SNAP_PREFIX("osd_snap");                // removed/purged snap records
7c673cae 98
9f95a23c
TL
99/*
100
101 OSD snapshot metadata
102 ---------------------
103
104 -- starting with mimic, removed in octopus --
105
106 "removed_epoch_%llu_%08lx" % (pool, epoch)
107 -> interval_set<snapid_t>
108
109 "removed_snap_%llu_%016llx" % (pool, last_snap)
110 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
111
112
113 -- starting with mimic --
114
115 "purged_snap_%llu_%016llx" % (pool, last_snap)
116 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
117
118 - note that the {removed,purged}_snap put the last snap in they key so
119 that we can use forward iteration only to search for an epoch in an
120 interval. e.g., to test if epoch N is removed/purged, we'll find a key
121 >= N that either does or doesn't contain the given snap.
122
123
124 -- starting with octopus --
125
126 "purged_epoch_%08lx" % epoch
127 -> map<int64_t,interval_set<snapid_t>>
128
129 */
130using namespace TOPNSPC::common;
c07f9fc5
FG
131namespace {
132
eafe8130
TL
// Adapter exposing an OSDMonitor-owned LRU map cache to the common
// PriorityCache manager (pcm).  Subclasses report the bytes actually
// used by the underlying cache via _get_used_bytes(); the manager then
// assigns, commits and rebalances byte budgets through this interface.
struct OSDMemCache : public PriorityCache::PriCache {
  OSDMonitor *osdmon;  // back-pointer to the owning monitor (not owned)
  // Per-priority byte assignments handed to us by the manager.
  int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
  int64_t committed_bytes = 0;  // last chunk-aligned size committed by pcm
  double cache_ratio = 0;       // share of total memory assigned to this cache

  OSDMemCache(OSDMonitor *m) : osdmon(m) {};

  // Bytes currently consumed by the wrapped LRU cache.
  virtual uint64_t _get_used_bytes() const = 0;

  // Ask for additional bytes at priority 'pri': the shortfall between
  // what we use and what we were already assigned.  Only PRI1 is used.
  virtual int64_t request_cache_bytes(
      PriorityCache::Priority pri, uint64_t total_cache) const {
    int64_t assigned = get_cache_bytes(pri);

    switch (pri) {
    // All cache items are currently set to have PRI1 priority
    case PriorityCache::Priority::PRI1:
      {
        int64_t request = _get_used_bytes();
        return (request > assigned) ? request - assigned : 0;
      }
    default:
      break;
    }
    return -EOPNOTSUPP;
  }

  virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
    return cache_bytes[pri];
  }

  // Sum of assignments across all priorities.
  virtual int64_t get_cache_bytes() const {
    int64_t total = 0;

    for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
      PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
      total += get_cache_bytes(pri);
    }
    return total;
  }

  virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] = bytes;
  }
  virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] += bytes;
  }
  // Round the current assignment to the manager's chunk size and record it.
  virtual int64_t commit_cache_size(uint64_t total_cache) {
    committed_bytes = PriorityCache::get_chunk(
      get_cache_bytes(), total_cache);
    return committed_bytes;
  }
  virtual int64_t get_committed_size() const {
    return committed_bytes;
  }
  virtual double get_cache_ratio() const {
    return cache_ratio;
  }
  virtual void set_cache_ratio(double ratio) {
    cache_ratio = ratio;
  }
  virtual string get_cache_name() const = 0;
};
196
// Priority-cache adapter for the incremental-osdmap LRU (inc_osd_cache).
struct IncCache : public OSDMemCache {
  IncCache(OSDMonitor *m) : OSDMemCache(m) {};

  virtual uint64_t _get_used_bytes() const {
    return osdmon->inc_osd_cache.get_bytes();
  }

  virtual string get_cache_name() const {
    return "OSDMap Inc Cache";
  }

  // Number of cached incremental maps (used for logging/inspection).
  uint64_t _get_num_osdmaps() const {
    return osdmon->inc_osd_cache.get_size();
  }
};
212
// Priority-cache adapter for the full-osdmap LRU (full_osd_cache).
struct FullCache : public OSDMemCache {
  FullCache(OSDMonitor *m) : OSDMemCache(m) {};

  virtual uint64_t _get_used_bytes() const {
    return osdmon->full_osd_cache.get_bytes();
  }

  virtual string get_cache_name() const {
    return "OSDMap Full Cache";
  }

  // Number of cached full maps (used for logging/inspection).
  uint64_t _get_num_osdmaps() const {
    return osdmon->full_osd_cache.get_size();
  }
};
228
// File-scope handles to the two cache adapters; created in the
// OSDMonitor constructor and later registered with the priority
// cache manager (see register_cache_with_pcm()).
std::shared_ptr<IncCache> inc_cache;
std::shared_ptr<FullCache> full_cache;

// Limits enforced on per-pool "application" metadata.
const uint32_t MAX_POOL_APPLICATIONS = 4;
const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
235
28e407b8
AA
236bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
237 // Note: this doesn't include support for the application tag match
238 if ((grant.spec.allow & OSD_CAP_W) != 0) {
239 auto& match = grant.match;
240 if (match.is_match_all()) {
241 return true;
11fdf7f2 242 } else if (pool_name != nullptr &&
28e407b8
AA
243 !match.pool_namespace.pool_name.empty() &&
244 match.pool_namespace.pool_name == *pool_name) {
245 return true;
246 }
247 }
248 return false;
249}
250
// Decide whether 'entity_name' may perform unmanaged-snapshot pool ops.
// Permission is granted when either:
//   1. the entity's mon caps allow the "osd pool op unmanaged-snap"
//      command (restricted to 'pool_name' when one is given; an
//      unrestricted cap is required when the pool no longer exists), or
//   2. the entity's OSD caps grant write access to all pools or to the
//      named pool (see is_osd_writable()).
// Returns false if the OSD cap data cannot be located, decoded or parsed.
bool is_unmanaged_snap_op_permitted(CephContext* cct,
                                    const KeyServer& key_server,
                                    const EntityName& entity_name,
                                    const MonCap& mon_caps,
                                    const entity_addr_t& peer_socket_addr,
                                    const std::string* pool_name)
{
  typedef std::map<std::string, std::string> CommandArgs;

  // First try the mon-side capability check.
  if (mon_caps.is_capable(
	cct, entity_name, "osd",
	"osd pool op unmanaged-snap",
	(pool_name == nullptr ?
	 CommandArgs{} /* pool DNE, require unrestricted cap */ :
	 CommandArgs{{"poolname", *pool_name}}),
	false, true, false,
	peer_socket_addr)) {
    return true;
  }

  // Fall back to inspecting the entity's OSD caps from the auth db.
  AuthCapsInfo caps_info;
  if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
                                   caps_info)) {
    dout(10) << "unable to locate OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  string caps_str;
  if (caps_info.caps.length() > 0) {
    auto p = caps_info.caps.cbegin();
    try {
      decode(caps_str, p);
    } catch (const buffer::error &err) {
      derr << "corrupt OSD cap data for " << entity_name << " in auth db"
           << dendl;
      return false;
    }
  }

  OSDCap osd_cap;
  if (!osd_cap.parse(caps_str, nullptr)) {
    dout(10) << "unable to parse OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  // if the entity has write permissions in one or all pools, permit
  // usage of unmanaged-snapshots
  if (osd_cap.allow_all()) {
    return true;
  }

  for (auto& grant : osd_cap.grants) {
    if (grant.profile.is_valid()) {
      // profile grants expand into a list of concrete grants
      for (auto& profile_grant : grant.profile_grants) {
        if (is_osd_writable(profile_grant, pool_name)) {
          return true;
        }
      }
    } else if (is_osd_writable(grant, pool_name)) {
      return true;
    }
  }

  return false;
}
318
c07f9fc5
FG
319} // anonymous namespace
320
7c673cae
FG
// Record that PG 'ps' of this pool reported 'last_epoch_clean'.
// Maintains:
//   - epoch_by_pg:  per-PG latest clean epoch (0 == never reported)
//   - floor:        minimum clean epoch across reporting PGs
//   - next_missing: index of the first PG that has not reported yet
void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean)
{
  if (epoch_by_pg.size() <= ps) {
    // grow lazily; new slots start at 0 (= not yet reported)
    epoch_by_pg.resize(ps + 1, 0);
  }
  const auto old_lec = epoch_by_pg[ps];
  if (old_lec >= last_epoch_clean) {
    // stale lec
    return;
  }
  epoch_by_pg[ps] = last_epoch_clean;
  if (last_epoch_clean < floor) {
    floor = last_epoch_clean;
  } else if (last_epoch_clean > floor) {
    if (old_lec == floor) {
      // probably should increase floor?
      // this PG may have been the one holding the floor down; recompute
      auto new_floor = std::min_element(std::begin(epoch_by_pg),
					std::end(epoch_by_pg));
      floor = *new_floor;
    }
  }
  if (ps != next_missing) {
    return;
  }
  // advance next_missing past every PG that has now reported
  for (; next_missing < epoch_by_pg.size(); next_missing++) {
    if (epoch_by_pg[next_missing] == 0) {
      break;
    }
  }
}
351
// Drop all last-epoch-clean state for a deleted pool.
void LastEpochClean::remove_pool(uint64_t pool)
{
  report_by_pool.erase(pool);
}
356
357void LastEpochClean::report(const pg_t& pg, epoch_t last_epoch_clean)
358{
359 auto& lec = report_by_pool[pg.pool()];
360 return lec.report(pg.ps(), last_epoch_clean);
361}
362
// Compute the highest epoch that is known clean for *every* PG of every
// pool in 'latest'.  Returns 0 (no information) if any pool has no
// report at all or has PGs that have not yet reported; otherwise the
// minimum per-pool floor, capped at the current map epoch.
epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
{
  auto floor = latest.get_epoch();
  for (auto& pool : latest.get_pools()) {
    auto reported = report_by_pool.find(pool.first);
    if (reported == report_by_pool.end()) {
      return 0;
    }
    if (reported->second.next_missing < pool.second.get_pg_num()) {
      // at least one PG in this pool has never reported clean
      return 0;
    }
    if (reported->second.floor < floor) {
      floor = reported->second.floor;
    }
  }
  return floor;
}
380
1911f103
TL
381void LastEpochClean::dump(Formatter *f) const
382{
383 f->open_array_section("per_pool");
384
385 for (auto& it : report_by_pool) {
386 f->open_object_section("pool");
387 f->dump_unsigned("poolid", it.first);
388 f->dump_unsigned("floor", it.second.floor);
389 f->close_section();
390 }
391
392 f->close_section();
393}
7c673cae 394
11fdf7f2
TL
// Completion fired when a background osdmap->PG mapping job finishes:
// logs how long the mapping took, then refreshes the creating-PGs set
// and notifies PG-create subscribers.  Skipped if the job was aborted
// (r < 0).
class C_UpdateCreatingPGs : public Context {
public:
  OSDMonitor *osdmon;
  utime_t start;  // when the mapping job was started (for the timing log)
  epoch_t epoch;  // osdmap epoch the mapping was computed against
  C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
    osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
  void finish(int r) override {
    if (r >= 0) {
      utime_t end = ceph_clock_now();
      dout(10) << "osdmap epoch " << epoch << " mapping took "
	       << (end - start) << " seconds" << dendl;
      osdmon->update_creating_pgs();
      osdmon->check_pg_creates_subs();
    }
  }
};
412
413#undef dout_prefix
414#define dout_prefix _prefix(_dout, mon, osdmap)
// Log-line prefix for this file's dout/derr output: identifies the
// monitor (name, rank, state) and the current osdmap epoch.
static ostream& _prefix(std::ostream *_dout, Monitor *mon, const OSDMap& osdmap) {
  return *_dout << "mon." << mon->name << "@" << mon->rank
		<< "(" << mon->get_state_name()
		<< ").osd e" << osdmap.get_epoch() << " ";
}
420
// Construct the OSD paxos service.  Registers this object as a config
// observer, creates the file-scope priority-cache adapters, and sizes
// the inc/full OSDMap LRU caches.  Registration with the priority
// cache manager itself happens later (see update_from_paxos()).
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor *mn,
  Paxos *p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf()->mon_osd_cache_size),
   full_osd_cache(g_conf()->mon_osd_cache_size),
   has_osdmap_manifest(false),
   mapper(mn->cct, &mn->cpu_tp)
{
  // adapters that expose the two LRU caches to the priority cache manager
  inc_cache = std::make_shared<IncCache>(this);
  full_cache = std::make_shared<FullCache>(this);
  cct->_conf.add_observer(this);
  int r = _set_cache_sizes();
  if (r < 0) {
    // non-fatal: fall back to fixed-size caches without pcm management
    derr << __func__ << " using default osd cache size - mon_osd_cache_size ("
         << g_conf()->mon_osd_cache_size
         << ") without priority cache management"
         << dendl;
  }
}
444
445const char **OSDMonitor::get_tracked_conf_keys() const
446{
447 static const char* KEYS[] = {
448 "mon_memory_target",
449 "mon_memory_autotune",
450 "rocksdb_cache_size",
451 NULL
452 };
453 return KEYS;
454}
455
// Config observer callback: react to runtime changes of the keys listed
// in get_tracked_conf_keys().  Toggles cache autotuning and/or resizes
// the mon/rocksdb cache budgets.  Failures are logged but not fatal.
void OSDMonitor::handle_conf_change(const ConfigProxy& conf,
                                    const std::set<std::string> &changed)
{
  dout(10) << __func__ << " " << changed << dendl;

  if (changed.count("mon_memory_autotune")) {
    _set_cache_autotuning();
  }
  if (changed.count("mon_memory_target") ||
      changed.count("rocksdb_cache_size")) {
    int r = _update_mon_cache_settings();
    if (r < 0) {
      derr << __func__ << " mon_memory_target:"
           << g_conf()->mon_memory_target
           << " rocksdb_cache_size:"
           << g_conf()->rocksdb_cache_size
           << ". Unable to update cache size."
           << dendl;
    }
  }
}
477
// Enable or disable priority-cache autotuning to match the current
// mon_memory_autotune setting.  Disabling simply drops the manager;
// enabling (re)registers our caches with a new manager.
void OSDMonitor::_set_cache_autotuning()
{
  if (!g_conf()->mon_memory_autotune && pcm != nullptr) {
    // Disable cache autotuning
    std::lock_guard l(balancer_lock);
    pcm = nullptr;
  }

  if (g_conf()->mon_memory_autotune && pcm == nullptr) {
    int r = register_cache_with_pcm();
    if (r < 0) {
      dout(10) << __func__
               << " Error while registering osdmon caches with pcm."
               << " Cache auto tuning not enabled."
               << dendl;
      mon_memory_autotune = false;
    } else {
      mon_memory_autotune = true;
    }
  }
}
499
// Apply new mon_memory_target / rocksdb_cache_size values at runtime.
// Validates the new sizes, recomputes the kv/inc/full cache ratios and,
// when autotuning is active, pushes updated min/max/target budgets into
// the priority cache manager and rebalances immediately.
// Returns 0 on success, -EINVAL on invalid sizes or ratio failure
// (in which case the previous sizes are restored).
int OSDMonitor::_update_mon_cache_settings()
{
  if (g_conf()->mon_memory_target <= 0 ||
      g_conf()->mon_memory_target < mon_memory_min ||
      g_conf()->rocksdb_cache_size <= 0) {
    return -EINVAL;
  }

  if (pcm == nullptr && rocksdb_binned_kv_cache == nullptr) {
    derr << __func__ << " not using pcm and rocksdb" << dendl;
    return -EINVAL;
  }

  // remember old values so we can roll back if the ratios don't work out
  uint64_t old_mon_memory_target = mon_memory_target;
  uint64_t old_rocksdb_cache_size = rocksdb_cache_size;

  // Set the new pcm memory cache sizes
  mon_memory_target = g_conf()->mon_memory_target;
  rocksdb_cache_size = g_conf()->rocksdb_cache_size;

  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // leave headroom for expected fragmentation, as in register_cache_with_pcm()
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    mon_memory_target = old_mon_memory_target;
    rocksdb_cache_size = old_rocksdb_cache_size;
    return -EINVAL;
  }

  if (mon_memory_autotune && pcm != nullptr) {
    std::lock_guard l(balancer_lock);
    // set pcm cache levels
    pcm->set_target_memory(target);
    pcm->set_min_memory(min);
    pcm->set_max_memory(max);
    // tune memory based on new values
    pcm->tune_memory();
    pcm->balance();
    _set_new_cache_sizes();
    dout(1) << __func__ << " Updated mon cache setting."
            << " target: " << target
            << " min: " << min
            << " max: " << max
            << dendl;
  }
  return 0;
}
559
// Initialize cache sizing state from config when autotuning is enabled.
// Seeds the inc/full LRU caches at the minimum budget; the priority
// cache manager will grow them later.  Returns -EINVAL when the
// configured target/min sizes are unusable, 0 otherwise (including the
// no-autotune case, which leaves everything untouched).
int OSDMonitor::_set_cache_sizes()
{
  if (g_conf()->mon_memory_autotune) {
    // set the new osdmon cache targets to be managed by pcm
    mon_osd_cache_size = g_conf()->mon_osd_cache_size;
    rocksdb_cache_size = g_conf()->rocksdb_cache_size;
    mon_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
    mon_memory_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
    mon_memory_target = g_conf()->mon_memory_target;
    mon_memory_min = g_conf()->mon_osd_cache_size_min;
    if (mon_memory_target <= 0 || mon_memory_min <= 0) {
      derr << __func__ << " mon_memory_target:" << mon_memory_target
           << " mon_memory_min:" << mon_memory_min
           << ". Invalid size option(s) provided."
           << dendl;
      return -EINVAL;
    }
    // Set the initial inc and full LRU cache sizes
    inc_osd_cache.set_bytes(mon_memory_min);
    full_osd_cache.set_bytes(mon_memory_min);
    mon_memory_autotune = g_conf()->mon_memory_autotune;
  }
  return 0;
}
7c673cae
FG
584
585bool OSDMonitor::_have_pending_crush()
586{
587 return pending_inc.crush.length() > 0;
588}
589
// Return the committed (non-pending) CRUSH map.
CrushWrapper &OSDMonitor::_get_stable_crush()
{
  return *osdmap.crush;
}
594
595void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush)
596{
597 bufferlist bl;
598 if (pending_inc.crush.length())
599 bl = pending_inc.crush;
600 else
601 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
602
11fdf7f2 603 auto p = bl.cbegin();
7c673cae
FG
604 newcrush.decode(p);
605}
606
// Build the very first OSDMap (epoch 1) for a brand new cluster and
// encode it into the pending incremental as a full map.  Starts from a
// map provided at mkfs time if one exists, otherwise builds a simple
// default map.  Sets default flags, full/backfill/nearfull ratios and
// the required OSD release.
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon->monmap->fsid << dendl;

  OSDMap newmap;

  bufferlist bl;
  mon->store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    // an initial map was injected at mkfs time; adopt it (fixing the fsid)
    newmap.decode(bl);
    newmap.set_fsid(mon->monmap->fsid);
  } else {
    newmap.build_simple(cct, 0, mon->monmap->fsid, 0);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  newmap.flags |=
    CEPH_OSDMAP_RECOVERY_DELETES |
    CEPH_OSDMAP_PURGED_SNAPDIRS |
    CEPH_OSDMAP_PGLOG_HARDLIMIT;
  // ratios may be configured as percentages (>1.0); normalize to [0,1]
  newmap.full_ratio = g_conf()->mon_osd_full_ratio;
  if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
  newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
  if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
  newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
  if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;

  // new cluster should require latest by default
  if (g_conf().get_val<bool>("mon_debug_no_require_octopus")) {
    if (g_conf().get_val<bool>("mon_debug_no_require_nautilus")) {
      derr << __func__ << " mon_debug_no_require_octopus and nautilus=true" << dendl;
      newmap.require_osd_release = ceph_release_t::mimic;
    } else {
      derr << __func__ << " mon_debug_no_require_octopus=true" << dendl;
      newmap.require_osd_release = ceph_release_t::nautilus;
    }
  } else {
    newmap.require_osd_release = ceph_release_t::octopus;
    ceph_release_t r = ceph_release_from_name(
      g_conf()->mon_osd_initial_require_min_compat_client);
    if (!r) {
      ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
    }
    newmap.require_min_compat_client = r;
  }

  // encode into pending incremental
  uint64_t features = newmap.get_encoding_features();
  newmap.encode(pending_inc.fullmap,
		features | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
665
11fdf7f2 666void OSDMonitor::get_store_prefixes(std::set<string>& s) const
7c673cae
FG
667{
668 s.insert(service_name);
669 s.insert(OSD_PG_CREATING_PREFIX);
3efd9988 670 s.insert(OSD_METADATA_PREFIX);
11fdf7f2 671 s.insert(OSD_SNAP_PREFIX);
7c673cae
FG
672}
673
// Bring the in-memory osdmap up to date with the paxos-committed state.
// Steps, in order:
//   1. reload the osdmap manifest (may have changed even without a new
//      committed version)
//   2. repair the on-disk 'full_latest' pointer if a sync left it stale
//   3. jump directly to the newest stored full map, if newer than ours
//   4. load the persisted creating-PGs set
//   5. apply each remaining incremental, persisting full maps and
//      cross-checking CRCs against the primary's canonical encoding
//   6. refresh down_pending_out bookkeeping, subscriptions, features,
//      and (on peons) restart the mapping job.
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  // we really don't care if the version has been updated, because we may
  // have trimmed without having increased the last committed; yet, we may
  // need to update the in-memory manifest.
  load_osdmap_manifest();

  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  ceph_assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
	   << ", my e " << osdmap.epoch << dendl;

  if (mapping_job) {
    // a stale mapping job was computed against the old epoch; cancel it
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
	      << mapping_job.get() << " did not complete, "
	      << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  load_health();

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'. This is only done when we are building the
   * full maps from the incremental versions. But don't panic! We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
	     << " [" << fc << ", " << lc << "]" << dendl;

    latest_full = 0;
    // scan newest-to-oldest for the most recent stored full map
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon->store->exists(get_service_name(), full_key)) {
	dout(10) << __func__ << " found latest full map v " << v << dendl;
	latest_full = v;
	break;
      }
    }

    ceph_assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon->store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
             << latest_full << dendl;
  }

  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    // skip ahead: decode the newest full map instead of replaying
    // every incremental from our current epoch
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    ceph_assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap = OSDMap();
    osdmap.decode(latest_bl);
  }

  bufferlist bl;
  if (!mon->store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
    auto p = bl.cbegin();
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    creating_pgs.decode(p);
    dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
	    << creating_pgs.last_scan_epoch
	    << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
  } else {
    dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
	    << dendl;
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    ceph_assert(err == 0);
    ceph_assert(inc_bl.length());
    // set priority cache manager levels if the osdmap is
    // being populated for the first time.
    if (mon_memory_autotune && pcm == nullptr) {
      int r = register_cache_with_pcm();
      if (r < 0) {
	dout(10) << __func__
		 << " Error while registering osdmon caches with pcm."
		 << " Proceeding without cache auto tuning."
		 << dendl;
      }
    }

    dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
	    << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    ceph_assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs. Encode the full
    // map with the same features as the incremental. If we don't
    // know, use the quorum features. If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon->get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      ceph_assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
	// This will happen if the mons were running mixed versions in
	// the past or some other circumstance made the full encoded
	// maps divergent. Reloading here will bring us back into
	// sync with the primary for this and all future maps. OSDs
	// will also be brought back into sync when they discover the
	// crc mismatch and request a full map from a mon.
	derr << __func__ << " full map CRC mismatch, resetting to canonical"
	     << dendl;

	// NOTE: the dout() macro opens a scope that stays open until the
	// matching dendl, so each JSONFormatter below lives in its own
	// dout-level block and the repeated 'jf' name does not collide.
	dout(20) << __func__ << " my (bad) full osdmap:\n";
	JSONFormatter jf(true);
	jf.dump_object("osdmap", osdmap);
	jf.flush(*_dout);
	*_dout << "\nhexdump:\n";
	full_bl.hexdump(*_dout);
	*_dout << dendl;

	osdmap = OSDMap();
	osdmap.decode(orig_full_bl);

	dout(20) << __func__ << " canonical full osdmap:\n";
	JSONFormatter jf(true);
	jf.dump_object("osdmap", osdmap);
	jf.flush(*_dout);
	*_dout << "\nhexdump:\n";
	orig_full_bl.hexdump(*_dout);
	*_dout << dendl;
      }
    } else {
      ceph_assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    if (osdmap.epoch == 1) {
      // initial map has been committed; drop the mkfs-provided copy
      t->erase("mkfs", "osdmap");
    }

    // flush periodically so a long replay doesn't build one huge txn
    if (tx_size > g_conf()->mon_sync_max_payload_size*2) {
      mon->store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    for (const auto &osd_state : inc.new_state) {
      if (osd_state.second & CEPH_OSD_UP) {
	// could be marked up *or* down, but we're too lazy to check which
	last_osd_report.erase(osd_state.first);
      }
      if (osd_state.second & CEPH_OSD_EXISTS) {
	// could be created *or* destroyed, but we can safely drop it
	osd_epochs.erase(osd_state.first);
      }
    }
  }

  if (t) {
    mon->store->apply_transaction(t);
  }

  // reconcile the down -> out countdown map with the new osdmap state
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
	dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
	down_pending_out[o] = ceph_clock_now();
      }
    } else {
      if (found != down_pending_out.end()) {
	dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
	down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();
  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon->is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
}
911
eafe8130
TL
// Create the priority cache manager and register the rocksdb kv cache
// plus our inc/full osdmap caches with it.  Computes the memory budget
// (min/max/target) from the configured target, base usage and expected
// fragmentation.  Returns 0 on success, -EINVAL when sizes are invalid,
// rocksdb exposes no priority cache, or the ratios cannot be set.
int OSDMonitor::register_cache_with_pcm()
{
  if (mon_memory_target <= 0 || mon_memory_min <= 0) {
    derr << __func__ << " Invalid memory size specified for mon caches."
         << " Caches will not be auto-tuned."
         << dendl;
    return -EINVAL;
  }
  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  // For calculating total target memory, consider rocksdb cache size.
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // Apply the same logic as in bluestore to set the max amount
  // of memory to use for cache. Assume base memory for OSDMaps
  // and then add in some overhead for fragmentation.
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  rocksdb_binned_kv_cache = mon->store->get_priority_cache();
  if (!rocksdb_binned_kv_cache) {
    derr << __func__ << " not using rocksdb" << dendl;
    return -EINVAL;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    return -EINVAL;
  }

  pcm = std::make_shared<PriorityCache::Manager>(
      cct, min, max, target, true);
  pcm->insert("kv", rocksdb_binned_kv_cache, true);
  pcm->insert("inc", inc_cache, true);
  pcm->insert("full", full_cache, true);
  dout(1) << __func__ << " pcm target: " << target
          << " pcm max: " << max
          << " pcm min: " << min
          << " inc_osd_cache size: " << inc_osd_cache.get_size()
          << dendl;
  return 0;
}
961
// Split the memory budget between the rocksdb kv cache and the two
// osdmap caches: kv gets rocksdb_cache_size / mon_memory_target, and
// the remainder is divided evenly between the inc and full caches.
// Returns -EINVAL (restoring the previous kv ratio) if kv would take
// the whole budget or more.
int OSDMonitor::_set_cache_ratios()
{
  double old_cache_kv_ratio = cache_kv_ratio;

  // Set the cache ratios for kv(rocksdb), inc and full caches
  cache_kv_ratio = (double)rocksdb_cache_size / (double)mon_memory_target;
  if (cache_kv_ratio >= 1.0) {
    derr << __func__ << " Cache kv ratio (" << cache_kv_ratio
         << ") must be in range [0,<1.0]."
         << dendl;
    cache_kv_ratio = old_cache_kv_ratio;
    return -EINVAL;
  }
  rocksdb_binned_kv_cache->set_cache_ratio(cache_kv_ratio);
  cache_inc_ratio = cache_full_ratio = (1.0 - cache_kv_ratio) / 2;
  inc_cache->set_cache_ratio(cache_inc_ratio);
  full_cache->set_cache_ratio(cache_full_ratio);

  dout(1) << __func__ << " kv ratio " << cache_kv_ratio
          << " inc ratio " << cache_inc_ratio
          << " full ratio " << cache_full_ratio
          << dendl;
  return 0;
}
986
7c673cae
FG
// Kick off an asynchronous PG-mapping computation for the current
// osdmap, canceling any job still running against an older epoch.
// On completion C_UpdateCreatingPGs refreshes the creating-PGs state.
// No job is started when there are no pools to map.
void OSDMonitor::start_mapping()
{
  // initiate mapping job
  if (mapping_job) {
    dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
	     << dendl;
    mapping_job->abort();
  }
  if (!osdmap.get_pools().empty()) {
    auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
    mapping_job = mapping.start_update(osdmap, mapper,
				       g_conf()->mon_osd_mapping_pgs_per_chunk);
    dout(10) << __func__ << " started mapping job " << mapping_job.get()
	     << " at " << fin->start << dendl;
    mapping_job->set_finish_event(fin);
  } else {
    dout(10) << __func__ << " no pools, no mapping job" << dendl;
    mapping_job = nullptr;
  }
}
1007
1008void OSDMonitor::update_msgr_features()
1009{
1010 set<int> types;
1011 types.insert((int)entity_name_t::TYPE_OSD);
1012 types.insert((int)entity_name_t::TYPE_CLIENT);
1013 types.insert((int)entity_name_t::TYPE_MDS);
1014 types.insert((int)entity_name_t::TYPE_MON);
1015 for (set<int>::iterator q = types.begin(); q != types.end(); ++q) {
1016 uint64_t mask;
1017 uint64_t features = osdmap.get_features(*q, &mask);
1018 if ((mon->messenger->get_policy(*q).features_required & mask) != features) {
1019 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
11fdf7f2 1020 ceph::net::Policy p = mon->messenger->get_policy(*q);
7c673cae
FG
1021 p.features_required = (p.features_required & ~mask) | features;
1022 mon->messenger->set_policy(*q, p);
1023 }
1024 }
1025}
1026
1027void OSDMonitor::on_active()
1028{
1029 update_logger();
1030
1031 if (mon->is_leader()) {
224ce89b 1032 mon->clog->debug() << "osdmap " << osdmap;
81eedcae
TL
1033 if (!priority_convert) {
1034 // Only do this once at start-up
1035 convert_pool_priorities();
1036 priority_convert = true;
1037 }
7c673cae
FG
1038 } else {
1039 list<MonOpRequestRef> ls;
1040 take_all_failures(ls);
1041 while (!ls.empty()) {
1042 MonOpRequestRef op = ls.front();
1043 op->mark_osdmon_event(__func__);
1044 dispatch(op);
1045 ls.pop_front();
1046 }
1047 }
1048 start_mapping();
1049}
1050
void OSDMonitor::on_restart()
{
  // Discard the per-OSD report timestamps accumulated before the restart.
  last_osd_report.clear();
}
1055
void OSDMonitor::on_shutdown()
{
  dout(10) << __func__ << dendl;
  // Abort any in-flight background mapping job so it does not fire its
  // completion event after we are gone.
  if (mapping_job) {
    dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
	     << dendl;
    mapping_job->abort();
  }

  // discard failure info, waiters
  list<MonOpRequestRef> ls;
  take_all_failures(ls);
  ls.clear();
}
1070
1071void OSDMonitor::update_logger()
1072{
1073 dout(10) << "update_logger" << dendl;
1074
1075 mon->cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
1076 mon->cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
1077 mon->cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
1078 mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
1079}
1080
7c673cae
FG
1081void OSDMonitor::create_pending()
1082{
1083 pending_inc = OSDMap::Incremental(osdmap.epoch+1);
1084 pending_inc.fsid = mon->monmap->fsid;
11fdf7f2
TL
1085 pending_metadata.clear();
1086 pending_metadata_rm.clear();
9f95a23c 1087 pending_pseudo_purged_snaps.clear();
7c673cae
FG
1088
1089 dout(10) << "create_pending e " << pending_inc.epoch << dendl;
1090
11fdf7f2
TL
1091 // safety checks (this shouldn't really happen)
1092 {
1093 if (osdmap.backfillfull_ratio <= 0) {
1094 pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
1095 if (pending_inc.new_backfillfull_ratio > 1.0)
1096 pending_inc.new_backfillfull_ratio /= 100;
1097 dout(1) << __func__ << " setting backfillfull_ratio = "
1098 << pending_inc.new_backfillfull_ratio << dendl;
7c673cae 1099 }
7c673cae 1100 if (osdmap.full_ratio <= 0) {
11fdf7f2 1101 pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio;
7c673cae
FG
1102 if (pending_inc.new_full_ratio > 1.0)
1103 pending_inc.new_full_ratio /= 100;
1104 dout(1) << __func__ << " setting full_ratio = "
1105 << pending_inc.new_full_ratio << dendl;
1106 }
1107 if (osdmap.nearfull_ratio <= 0) {
11fdf7f2 1108 pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
7c673cae
FG
1109 if (pending_inc.new_nearfull_ratio > 1.0)
1110 pending_inc.new_nearfull_ratio /= 100;
1111 dout(1) << __func__ << " setting nearfull_ratio = "
1112 << pending_inc.new_nearfull_ratio << dendl;
1113 }
1114 }
3efd9988
FG
1115
1116 // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
1117 // structure.
1118 if (osdmap.crush->has_legacy_rule_ids()) {
1119 CrushWrapper newcrush;
1120 _get_pending_crush(newcrush);
1121
1122 // First, for all pools, work out which rule they really used
1123 // by resolving ruleset to rule.
1124 for (const auto &i : osdmap.get_pools()) {
1125 const auto pool_id = i.first;
1126 const auto &pool = i.second;
1127 int new_rule_id = newcrush.find_rule(pool.crush_rule,
1128 pool.type, pool.size);
1129
1130 dout(1) << __func__ << " rewriting pool "
1131 << osdmap.get_pool_name(pool_id) << " crush ruleset "
1132 << pool.crush_rule << " -> rule id " << new_rule_id << dendl;
1133 if (pending_inc.new_pools.count(pool_id) == 0) {
1134 pending_inc.new_pools[pool_id] = pool;
1135 }
1136 pending_inc.new_pools[pool_id].crush_rule = new_rule_id;
1137 }
1138
1139 // Now, go ahead and renumber all the rules so that their
1140 // rule_id field corresponds to their position in the array
1141 auto old_to_new = newcrush.renumber_rules();
1142 dout(1) << __func__ << " Rewrote " << old_to_new << " crush IDs:" << dendl;
1143 for (const auto &i : old_to_new) {
1144 dout(1) << __func__ << " " << i.first << " -> " << i.second << dendl;
1145 }
1146 pending_inc.crush.clear();
1147 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
1148 }
7c673cae
FG
1149}
1150
/**
 * Refresh the pending creating-pgs state for the next osdmap epoch.
 *
 * Takes a snapshot of creating_pgs (under creating_pgs_lock), then:
 *  - rescans existing pools and pools new in @p inc for pgs to create,
 *    and drops entries for pools deleted in @p inc,
 *  - erases pgs already reported created (pending_created_pgs),
 *  - removes pgs that do not exist in @p nextmap,
 *  - drains the per-pool creation queue into the active creating set,
 *    bounded by mon_osd_max_creating_pgs, and
 *  - (octopus+ quorums) advances each creating pg's history and
 *    past_intervals to reflect mapping changes from osdmap to @p nextmap.
 *
 * @param inc the pending incremental about to be encoded
 * @param nextmap the osdmap as it will look with @p inc applied
 * @return the updated creating_pgs_t snapshot (persisted by the caller)
 */
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
			       const OSDMap& nextmap)
{
  dout(10) << __func__ << dendl;
  // Work on a private copy; creating_pgs is shared, so hold the lock only
  // for the snapshot.
  creating_pgs_t pending_creatings;
  {
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    unsigned queued = 0;
    queued += scan_for_creating_pgs(osdmap.get_pools(),
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    queued += scan_for_creating_pgs(inc.new_pools,
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
	       << " pg removed because containing pool deleted: "
	       << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
	     << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // filter out any pgs that shouldn't exist.
  {
    auto i = pending_creatings.pgs.begin();
    while (i != pending_creatings.pgs.end()) {
      if (!nextmap.pg_exists(i->first)) {
	dout(10) << __func__ << " removing pg " << i->first
		 << " which should not exist" << dendl;
	i = pending_creatings.pgs.erase(i);
      } else {
	++i;
      }
    }
  }

  // process queue
  // Move queued pg ranges into the active creating set, never exceeding
  // mon_osd_max_creating_pgs in-flight creations (minimum 1).
  unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  while (pending_creatings.pgs.size() < max &&
	 !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
	     << " created " << p->second.created
	     << " modified " << p->second.modified
	     << " [" << p->second.start << "-" << p->second.end << ")"
	     << dendl;
    int64_t n = std::min<int64_t>(max - pending_creatings.pgs.size(),
				  p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(
	pgid,
	creating_pgs_t::pg_create_info(inc.epoch,
				       p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
	       << " now [" << p->second.start << "-" << p->second.end << ")"
	       << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
	   << " pools" << dendl;

  if (mon->monmap->min_mon_release >= ceph_release_t::octopus) {
    // walk creating pgs' history and past_intervals forward
    for (auto& i : pending_creatings.pgs) {
      // this mirrors PG::start_peering_interval()
      pg_t pgid = i.first;

      // this is a bit imprecise, but sufficient?
      struct min_size_predicate_t : public IsPGRecoverablePredicate {
	const pg_pool_t *pi;
	bool operator()(const set<pg_shard_t> &have) const {
	  return have.size() >= pi->min_size;
	}
	explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
      } min_size_predicate(nextmap.get_pg_pool(pgid.pool()));

      vector<int> up, acting;
      int up_primary, acting_primary;
      nextmap.pg_to_up_acting_osds(
	pgid, &up, &up_primary, &acting, &acting_primary);
      if (i.second.history.epoch_created == 0) {
	// new pg entry, set it up
	i.second.up = up;
	i.second.acting = acting;
	i.second.up_primary = up_primary;
	i.second.acting_primary = acting_primary;
	i.second.history = pg_history_t(i.second.create_epoch,
					i.second.create_stamp);
	dout(10) << __func__ << " pg " << pgid << " just added, "
		 << " up " << i.second.up
		 << " p " << i.second.up_primary
		 << " acting " << i.second.acting
		 << " p " << i.second.acting_primary
		 << " history " << i.second.history
		 << " past_intervals " << i.second.past_intervals
		 << dendl;
      } else {
	std::stringstream debug;
	// check_new_interval() returns true when the mapping change starts a
	// new peering interval; only then do we bump the history epochs and
	// adopt the new up/acting sets.
	if (PastIntervals::check_new_interval(
	      i.second.acting_primary, acting_primary,
	      i.second.acting, acting,
	      i.second.up_primary, up_primary,
	      i.second.up, up,
	      i.second.history.same_interval_since,
	      i.second.history.last_epoch_clean,
	      &nextmap,
	      &osdmap,
	      pgid,
	      min_size_predicate,
	      &i.second.past_intervals,
	      &debug)) {
	  epoch_t e = inc.epoch;
	  i.second.history.same_interval_since = e;
	  if (i.second.up != up) {
	    i.second.history.same_up_since = e;
	  }
	  if (i.second.acting_primary != acting_primary) {
	    i.second.history.same_primary_since = e;
	  }
	  if (pgid.is_split(
		osdmap.get_pg_num(pgid.pool()),
		nextmap.get_pg_num(pgid.pool()),
		nullptr)) {
	    i.second.history.last_epoch_split = e;
	  }
	  dout(10) << __func__ << " pg " << pgid << " new interval,"
		   << " up " << i.second.up << " -> " << up
		   << " p " << i.second.up_primary << " -> " << up_primary
		   << " acting " << i.second.acting << " -> " << acting
		   << " p " << i.second.acting_primary << " -> "
		   << acting_primary
		   << " history " << i.second.history
		   << " past_intervals " << i.second.past_intervals
		   << dendl;
	  dout(20) << " debug: " << debug.str() << dendl;
	  i.second.up = up;
	  i.second.acting = acting;
	  i.second.up_primary = up_primary;
	  i.second.acting_primary = acting_primary;
	}
      }
    }
  }
  dout(10) << __func__
	   << " " << (pending_creatings.pgs.size() - total)
	   << "/" << pending_creatings.pgs.size()
	   << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
1337
/**
 * Prime pg_temp entries (via prime_pg_temp()) for pgs whose mapping is
 * expected to change under pending_inc.
 *
 * Either all pgs are considered ("all": a new crush map, newly-up osds, or
 * any weight increase) or only the pgs mapped to "interesting" osds
 * (osds with a pending UP state flip that are currently up, and osds with
 * a weight reduction).  The targeted path upgrades itself to "all" when
 * the estimated pg count exceeds mon_osd_prime_pg_temp_max_estimate of the
 * total, and both paths are time-bounded by mon_osd_prime_pg_temp_max_time.
 */
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  if (pending_inc.crush.length()) {
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    // new_state holds XOR'd state bits: CEPH_OSD_UP on a currently-up osd
    // means it is about to go down.
    if ((p->second & CEPH_OSD_UP) &&
	osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  for (map<int32_t,uint32_t>::iterator p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
	       << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;

  if (!all) {
    // Estimate workload from the first osd's pg count; fall back to the
    // full-map path if the targeted scan would touch too many pgs.
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
	g_conf()->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds >= "
	       << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total "
	       << mapping.get_num_pgs() << " pgs, all"
	       << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds" << dendl;
    }
  }

  // Build the map as it will look once pending_inc is applied.
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    // Parallel scan of every pg, bounded by the configured max time.
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk, {});
    if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
	       << g_conf()->mon_osd_prime_pg_temp_max_time
	       << ", stopping" << dendl;
      job.abort();
    }
  } else {
    // Targeted scan: only pgs mapped to the interesting osds, checking the
    // clock every `chunk` pgs so we stay within the time budget.
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf()->mon_osd_prime_pg_temp_max_time;
    const int chunk = 1000;
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
	if (!did_pgs.insert(pgid).second) {
	  continue;
	}
	prime_pg_temp(next, pgid);
	if (--n <= 0) {
	  n = chunk;
	  if (ceph_clock_now() > stop) {
	    dout(10) << __func__ << " consumed more than "
		     << g_conf()->mon_osd_prime_pg_temp_max_time
		     << " seconds, stopping"
		     << dendl;
	    return;
	  }
	}
      }
    }
  }
}
1440
/**
 * Consider adding a pg_temp entry for @p pgid into pending_inc.
 *
 * If the acting set changes between the current map and @p next, record the
 * *current* acting set as pg_temp so it keeps serving while peering
 * completes; if the next map converges (next_up == next_acting), record an
 * empty pg_temp to clear any existing entry instead.
 *
 * @param next the osdmap as it will look with pending_inc applied
 * @param pgid the pg to examine
 */
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  // TODO: remove this creating_pgs direct access?
  if (creating_pgs.pgs.count(pgid)) {
    return;
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  // current mapping (from the background mapping job)
  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  // mapping under the pending map
  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
			    &next_acting, &next_acting_primary);
  if (acting == next_acting &&
      !(up != acting && next_up == next_acting))
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  if (next_up == next_acting) {
    // an empty pg_temp entry clears an existing mapping
    acting.clear();
    dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
	     << dendl;
  }

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
	   << " -> " << next_up << "/" << next_acting
	   << ", priming " << acting
	   << dendl;
  {
    // serialize concurrent priming threads writing into pending_inc
    std::lock_guard l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
1488
1489/**
1490 * @note receiving a transaction in this function gives a fair amount of
1491 * freedom to the service implementation if it does need it. It shouldn't.
1492 */
1493void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
1494{
1495 dout(10) << "encode_pending e " << pending_inc.epoch
1496 << dendl;
1497
11fdf7f2
TL
1498 if (do_prune(t)) {
1499 dout(1) << __func__ << " osdmap full prune encoded e"
1500 << pending_inc.epoch << dendl;
1501 }
1502
7c673cae
FG
1503 // finalize up pending_inc
1504 pending_inc.modified = ceph_clock_now();
1505
11fdf7f2
TL
1506 int r = pending_inc.propagate_snaps_to_tiers(cct, osdmap);
1507 ceph_assert(r == 0);
7c673cae
FG
1508
1509 if (mapping_job) {
1510 if (!mapping_job->is_done()) {
1511 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1512 << mapping_job.get() << " did not complete, "
1513 << mapping_job->shards << " left" << dendl;
1514 mapping_job->abort();
1515 } else if (mapping.get_epoch() < osdmap.get_epoch()) {
1516 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1517 << mapping_job.get() << " is prior epoch "
1518 << mapping.get_epoch() << dendl;
1519 } else {
11fdf7f2 1520 if (g_conf()->mon_osd_prime_pg_temp) {
7c673cae
FG
1521 maybe_prime_pg_temp();
1522 }
1523 }
11fdf7f2 1524 } else if (g_conf()->mon_osd_prime_pg_temp) {
7c673cae
FG
1525 dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
1526 << dendl;
1527 }
1528 mapping_job.reset();
1529
c07f9fc5
FG
1530 // ensure we don't have blank new_state updates. these are interrpeted as
1531 // CEPH_OSD_UP (and almost certainly not what we want!).
1532 auto p = pending_inc.new_state.begin();
1533 while (p != pending_inc.new_state.end()) {
1534 if (p->second == 0) {
1535 dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
1536 p = pending_inc.new_state.erase(p);
1537 } else {
11fdf7f2
TL
1538 if (p->second & CEPH_OSD_UP) {
1539 pending_inc.new_last_up_change = pending_inc.modified;
1540 }
c07f9fc5
FG
1541 ++p;
1542 }
1543 }
11fdf7f2
TL
1544 if (!pending_inc.new_up_client.empty()) {
1545 pending_inc.new_last_up_change = pending_inc.modified;
1546 }
1547 for (auto& i : pending_inc.new_weight) {
9f95a23c 1548 if (i.first >= osdmap.max_osd) {
11fdf7f2
TL
1549 if (i.second) {
1550 // new osd is already marked in
1551 pending_inc.new_last_in_change = pending_inc.modified;
9f95a23c 1552 break;
11fdf7f2
TL
1553 }
1554 } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
1555 // existing osd marked in or out
1556 pending_inc.new_last_in_change = pending_inc.modified;
9f95a23c 1557 break;
11fdf7f2
TL
1558 }
1559 }
7c673cae
FG
1560
1561 {
1562 OSDMap tmp;
1563 tmp.deepish_copy_from(osdmap);
1564 tmp.apply_incremental(pending_inc);
1565
11fdf7f2
TL
1566 // clean pg_temp mappings
1567 OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);
1568
1569 // clean inappropriate pg_upmap/pg_upmap_items (if any)
494da23a
TL
1570 {
1571 // check every upmapped pg for now
1572 // until we could reliably identify certain cases to ignore,
1573 // which is obviously the hard part TBD..
1574 vector<pg_t> pgs_to_check;
1575 tmp.get_upmap_pgs(&pgs_to_check);
9f95a23c
TL
1576 if (pgs_to_check.size() <
1577 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk * 2)) {
494da23a
TL
1578 // not enough pgs, do it inline
1579 tmp.clean_pg_upmaps(cct, &pending_inc);
1580 } else {
1581 CleanUpmapJob job(cct, tmp, pending_inc);
1582 mapper.queue(&job, g_conf()->mon_clean_pg_upmaps_per_chunk, pgs_to_check);
1583 job.wait();
1584 }
1585 }
11fdf7f2
TL
1586
1587 // update creating pgs first so that we can remove the created pgid and
1588 // process the pool flag removal below in the same osdmap epoch.
1589 auto pending_creatings = update_pending_pgs(pending_inc, tmp);
1590 bufferlist creatings_bl;
9f95a23c
TL
1591 uint64_t features = CEPH_FEATURES_ALL;
1592 if (mon->monmap->min_mon_release < ceph_release_t::octopus) {
1593 dout(20) << __func__ << " encoding pending pgs without octopus features"
1594 << dendl;
1595 features &= ~CEPH_FEATURE_SERVER_OCTOPUS;
1596 }
1597 encode(pending_creatings, creatings_bl, features);
11fdf7f2
TL
1598 t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
1599
1600 // remove any old (or incompat) POOL_CREATING flags
1601 for (auto& i : tmp.get_pools()) {
9f95a23c 1602 if (tmp.require_osd_release < ceph_release_t::nautilus) {
11fdf7f2
TL
1603 // pre-nautilus OSDMaps shouldn't get this flag.
1604 if (pending_inc.new_pools.count(i.first)) {
1605 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1606 }
1607 }
1608 if (i.second.has_flag(pg_pool_t::FLAG_CREATING) &&
1609 !pending_creatings.still_creating_pool(i.first)) {
1610 dout(10) << __func__ << " done creating pool " << i.first
1611 << ", clearing CREATING flag" << dendl;
1612 if (pending_inc.new_pools.count(i.first) == 0) {
1613 pending_inc.new_pools[i.first] = i.second;
1614 }
1615 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
3efd9988 1616 }
11fdf7f2
TL
1617 }
1618
11fdf7f2
TL
1619 // collect which pools are currently affected by
1620 // the near/backfill/full osd(s),
1621 // and set per-pool near/backfill/full flag instead
1622 set<int64_t> full_pool_ids;
1623 set<int64_t> backfillfull_pool_ids;
1624 set<int64_t> nearfull_pool_ids;
1625 tmp.get_full_pools(cct,
1626 &full_pool_ids,
1627 &backfillfull_pool_ids,
3efd9988 1628 &nearfull_pool_ids);
11fdf7f2
TL
1629 if (full_pool_ids.empty() ||
1630 backfillfull_pool_ids.empty() ||
1631 nearfull_pool_ids.empty()) {
1632 // normal case - no nearfull, backfillfull or full osds
3efd9988
FG
1633 // try cancel any improper nearfull/backfillfull/full pool
1634 // flags first
11fdf7f2
TL
1635 for (auto &pool: tmp.get_pools()) {
1636 auto p = pool.first;
1637 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
1638 nearfull_pool_ids.empty()) {
1639 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1640 << "'s nearfull flag" << dendl;
1641 if (pending_inc.new_pools.count(p) == 0) {
1642 // load original pool info first!
1643 pending_inc.new_pools[p] = pool.second;
1644 }
1645 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1646 }
1647 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
1648 backfillfull_pool_ids.empty()) {
1649 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1650 << "'s backfillfull flag" << dendl;
1651 if (pending_inc.new_pools.count(p) == 0) {
1652 pending_inc.new_pools[p] = pool.second;
1653 }
1654 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1655 }
1656 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
1657 full_pool_ids.empty()) {
1658 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1659 // set by EQUOTA, skipping
1660 continue;
1661 }
1662 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1663 << "'s full flag" << dendl;
1664 if (pending_inc.new_pools.count(p) == 0) {
1665 pending_inc.new_pools[p] = pool.second;
1666 }
1667 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1668 }
3efd9988 1669 }
11fdf7f2
TL
1670 }
1671 if (!full_pool_ids.empty()) {
1672 dout(10) << __func__ << " marking pool(s) " << full_pool_ids
1673 << " as full" << dendl;
1674 for (auto &p: full_pool_ids) {
1675 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
1676 continue;
1677 }
1678 if (pending_inc.new_pools.count(p) == 0) {
1679 pending_inc.new_pools[p] = tmp.pools[p];
1680 }
1681 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
1682 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1683 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1684 }
1685 // cancel FLAG_FULL for pools which are no longer full too
1686 for (auto &pool: tmp.get_pools()) {
1687 auto p = pool.first;
1688 if (full_pool_ids.count(p)) {
1689 // skip pools we have just marked as full above
1690 continue;
1691 }
1692 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
1693 tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1694 // don't touch if currently is not full
1695 // or is running out of quota (and hence considered as full)
1696 continue;
1697 }
1698 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1699 << "'s full flag" << dendl;
1700 if (pending_inc.new_pools.count(p) == 0) {
1701 pending_inc.new_pools[p] = pool.second;
1702 }
1703 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
3efd9988 1704 }
11fdf7f2
TL
1705 }
1706 if (!backfillfull_pool_ids.empty()) {
1707 for (auto &p: backfillfull_pool_ids) {
1708 if (full_pool_ids.count(p)) {
1709 // skip pools we have already considered as full above
1710 continue;
1711 }
1712 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1713 // make sure FLAG_FULL is truly set, so we are safe not
1714 // to set a extra (redundant) FLAG_BACKFILLFULL flag
1715 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1716 continue;
1717 }
1718 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1719 // don't bother if pool is already marked as backfillfull
1720 continue;
1721 }
1722 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1723 << "'s as backfillfull" << dendl;
1724 if (pending_inc.new_pools.count(p) == 0) {
1725 pending_inc.new_pools[p] = tmp.pools[p];
1726 }
1727 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
1728 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1729 }
1730 // cancel FLAG_BACKFILLFULL for pools
1731 // which are no longer backfillfull too
1732 for (auto &pool: tmp.get_pools()) {
1733 auto p = pool.first;
1734 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1735 // skip pools we have just marked as backfillfull/full above
1736 continue;
1737 }
1738 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1739 // and don't touch if currently is not backfillfull
1740 continue;
1741 }
1742 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1743 << "'s backfillfull flag" << dendl;
1744 if (pending_inc.new_pools.count(p) == 0) {
1745 pending_inc.new_pools[p] = pool.second;
1746 }
1747 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
3efd9988 1748 }
11fdf7f2
TL
1749 }
1750 if (!nearfull_pool_ids.empty()) {
1751 for (auto &p: nearfull_pool_ids) {
1752 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1753 continue;
1754 }
1755 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1756 // make sure FLAG_FULL is truly set, so we are safe not
1757 // to set a extra (redundant) FLAG_NEARFULL flag
1758 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1759 continue;
1760 }
1761 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1762 // don't bother if pool is already marked as nearfull
1763 continue;
1764 }
1765 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1766 << "'s as nearfull" << dendl;
1767 if (pending_inc.new_pools.count(p) == 0) {
1768 pending_inc.new_pools[p] = tmp.pools[p];
1769 }
1770 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
1771 }
1772 // cancel FLAG_NEARFULL for pools
1773 // which are no longer nearfull too
1774 for (auto &pool: tmp.get_pools()) {
1775 auto p = pool.first;
1776 if (full_pool_ids.count(p) ||
1777 backfillfull_pool_ids.count(p) ||
1778 nearfull_pool_ids.count(p)) {
1779 // skip pools we have just marked as
1780 // nearfull/backfillfull/full above
1781 continue;
1782 }
1783 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1784 // and don't touch if currently is not nearfull
1785 continue;
1786 }
1787 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1788 << "'s nearfull flag" << dendl;
1789 if (pending_inc.new_pools.count(p) == 0) {
1790 pending_inc.new_pools[p] = pool.second;
1791 }
1792 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
7c673cae 1793 }
11fdf7f2 1794 }
7c673cae 1795
11fdf7f2 1796 // min_compat_client?
9f95a23c 1797 if (!tmp.require_min_compat_client) {
11fdf7f2
TL
1798 auto mv = tmp.get_min_compat_client();
1799 dout(1) << __func__ << " setting require_min_compat_client to currently "
9f95a23c 1800 << "required " << mv << dendl;
11fdf7f2 1801 mon->clog->info() << "setting require_min_compat_client to currently "
9f95a23c 1802 << "required " << mv;
11fdf7f2
TL
1803 pending_inc.new_require_min_compat_client = mv;
1804 }
1805
9f95a23c
TL
1806 if (osdmap.require_osd_release < ceph_release_t::nautilus &&
1807 tmp.require_osd_release >= ceph_release_t::nautilus) {
11fdf7f2
TL
1808 dout(10) << __func__ << " first nautilus+ epoch" << dendl;
1809 // add creating flags?
1810 for (auto& i : tmp.get_pools()) {
1811 if (pending_creatings.still_creating_pool(i.first)) {
1812 dout(10) << __func__ << " adding CREATING flag to pool " << i.first
1813 << dendl;
1814 if (pending_inc.new_pools.count(i.first) == 0) {
1815 pending_inc.new_pools[i.first] = i.second;
224ce89b 1816 }
11fdf7f2 1817 pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING;
224ce89b 1818 }
11fdf7f2
TL
1819 }
1820 // adjust blacklist items to all be TYPE_ANY
1821 for (auto& i : tmp.blacklist) {
1822 auto a = i.first;
1823 a.set_type(entity_addr_t::TYPE_ANY);
1824 pending_inc.new_blacklist[a] = i.second;
1825 pending_inc.old_blacklist.push_back(i.first);
224ce89b 1826 }
7c673cae 1827 }
9f95a23c
TL
1828
1829 if (osdmap.require_osd_release < ceph_release_t::octopus &&
1830 tmp.require_osd_release >= ceph_release_t::octopus) {
1831 dout(10) << __func__ << " first octopus+ epoch" << dendl;
1832
1833 // adjust obsoleted cache modes
1834 for (auto& [poolid, pi] : tmp.pools) {
1835 if (pi.cache_mode == pg_pool_t::CACHEMODE_FORWARD) {
1836 if (pending_inc.new_pools.count(poolid) == 0) {
1837 pending_inc.new_pools[poolid] = pi;
1838 }
1839 dout(10) << __func__ << " switching pool " << poolid
1840 << " cachemode from forward -> proxy" << dendl;
1841 pending_inc.new_pools[poolid].cache_mode = pg_pool_t::CACHEMODE_PROXY;
1842 }
1843 if (pi.cache_mode == pg_pool_t::CACHEMODE_READFORWARD) {
1844 if (pending_inc.new_pools.count(poolid) == 0) {
1845 pending_inc.new_pools[poolid] = pi;
1846 }
1847 dout(10) << __func__ << " switching pool " << poolid
1848 << " cachemode from readforward -> readproxy" << dendl;
1849 pending_inc.new_pools[poolid].cache_mode =
1850 pg_pool_t::CACHEMODE_READPROXY;
1851 }
1852 }
1853
1854 // clear removed_snaps for every pool
1855 for (auto& [poolid, pi] : tmp.pools) {
1856 if (pi.removed_snaps.empty()) {
1857 continue;
1858 }
1859 if (pending_inc.new_pools.count(poolid) == 0) {
1860 pending_inc.new_pools[poolid] = pi;
1861 }
1862 dout(10) << __func__ << " clearing pool " << poolid << " removed_snaps"
1863 << dendl;
1864 pending_inc.new_pools[poolid].removed_snaps.clear();
1865 }
1866
1867 // create a combined purged snap epoch key for all purged snaps
1868 // prior to this epoch, and store it in the current epoch (i.e.,
1869 // the last pre-octopus epoch, just prior to the one we're
1870 // encoding now).
1871 auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
1872 it->lower_bound("purged_snap_");
1873 map<int64_t,snap_interval_set_t> combined;
1874 while (it->valid()) {
1875 if (it->key().find("purged_snap_") != 0) {
1876 break;
1877 }
1878 string k = it->key();
1879 long long unsigned pool;
1880 int n = sscanf(k.c_str(), "purged_snap_%llu_", &pool);
1881 if (n != 1) {
1882 derr << __func__ << " invalid purged_snaps key '" << k << "'" << dendl;
1883 } else {
1884 bufferlist v = it->value();
1885 auto p = v.cbegin();
1886 snapid_t begin, end;
1887 ceph::decode(begin, p);
1888 ceph::decode(end, p);
1889 combined[pool].insert(begin, end - begin);
1890 }
1891 it->next();
1892 }
1893 if (!combined.empty()) {
1894 string k = make_purged_snap_epoch_key(pending_inc.epoch - 1);
1895 bufferlist v;
1896 ceph::encode(combined, v);
1897 t->put(OSD_SNAP_PREFIX, k, v);
1898 dout(10) << __func__ << " recording pre-octopus purged_snaps in epoch "
1899 << (pending_inc.epoch - 1) << ", " << v.length() << " bytes"
1900 << dendl;
1901 } else {
1902 dout(10) << __func__ << " there were no pre-octopus purged snaps"
1903 << dendl;
1904 }
1905
1906 // clean out the old removed_snap_ and removed_epoch keys
1907 // ('`' is ASCII '_' + 1)
1908 t->erase_range(OSD_SNAP_PREFIX, "removed_snap_", "removed_snap`");
1909 t->erase_range(OSD_SNAP_PREFIX, "removed_epoch_", "removed_epoch`");
1910 }
7c673cae
FG
1911 }
1912
1913 // tell me about it
31f18b77 1914 for (auto i = pending_inc.new_state.begin();
7c673cae
FG
1915 i != pending_inc.new_state.end();
1916 ++i) {
1917 int s = i->second ? i->second : CEPH_OSD_UP;
1918 if (s & CEPH_OSD_UP)
1919 dout(2) << " osd." << i->first << " DOWN" << dendl;
1920 if (s & CEPH_OSD_EXISTS)
1921 dout(2) << " osd." << i->first << " DNE" << dendl;
1922 }
11fdf7f2 1923 for (auto i = pending_inc.new_up_client.begin();
7c673cae
FG
1924 i != pending_inc.new_up_client.end();
1925 ++i) {
1926 //FIXME: insert cluster addresses too
1927 dout(2) << " osd." << i->first << " UP " << i->second << dendl;
1928 }
1929 for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
1930 i != pending_inc.new_weight.end();
1931 ++i) {
1932 if (i->second == CEPH_OSD_OUT) {
1933 dout(2) << " osd." << i->first << " OUT" << dendl;
1934 } else if (i->second == CEPH_OSD_IN) {
1935 dout(2) << " osd." << i->first << " IN" << dendl;
1936 } else {
1937 dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
1938 }
1939 }
1940
1941 // features for osdmap and its incremental
28e407b8 1942 uint64_t features;
7c673cae
FG
1943
1944 // encode full map and determine its crc
1945 OSDMap tmp;
1946 {
1947 tmp.deepish_copy_from(osdmap);
1948 tmp.apply_incremental(pending_inc);
1949
1950 // determine appropriate features
28e407b8
AA
1951 features = tmp.get_encoding_features();
1952 dout(10) << __func__ << " encoding full map with "
9f95a23c 1953 << tmp.require_osd_release
28e407b8
AA
1954 << " features " << features << dendl;
1955
1956 // the features should be a subset of the mon quorum's features!
11fdf7f2 1957 ceph_assert((features & ~mon->get_quorum_con_features()) == 0);
7c673cae
FG
1958
1959 bufferlist fullbl;
11fdf7f2 1960 encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
7c673cae
FG
1961 pending_inc.full_crc = tmp.get_crc();
1962
1963 // include full map in the txn. note that old monitors will
1964 // overwrite this. new ones will now skip the local full map
1965 // encode and reload from this.
1966 put_version_full(t, pending_inc.epoch, fullbl);
1967 }
1968
1969 // encode
11fdf7f2
TL
1970 ceph_assert(get_last_committed() + 1 == pending_inc.epoch);
1971 bufferlist bl;
1972 encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);
7c673cae
FG
1973
1974 dout(20) << " full_crc " << tmp.get_crc()
1975 << " inc_crc " << pending_inc.inc_crc << dendl;
1976
1977 /* put everything in the transaction */
1978 put_version(t, pending_inc.epoch, bl);
1979 put_last_committed(t, pending_inc.epoch);
1980
1981 // metadata, too!
1982 for (map<int,bufferlist>::iterator p = pending_metadata.begin();
1983 p != pending_metadata.end();
1984 ++p)
1985 t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
1986 for (set<int>::iterator p = pending_metadata_rm.begin();
1987 p != pending_metadata_rm.end();
1988 ++p)
1989 t->erase(OSD_METADATA_PREFIX, stringify(*p));
1990 pending_metadata.clear();
1991 pending_metadata_rm.clear();
1992
9f95a23c
TL
1993 // purged_snaps
1994 if (tmp.require_osd_release >= ceph_release_t::octopus &&
1995 !pending_inc.new_purged_snaps.empty()) {
1996 // all snaps purged this epoch (across all pools)
1997 string k = make_purged_snap_epoch_key(pending_inc.epoch);
1998 bufferlist v;
1999 encode(pending_inc.new_purged_snaps, v);
2000 t->put(OSD_SNAP_PREFIX, k, v);
2001 }
2002 for (auto& i : pending_inc.new_purged_snaps) {
2003 for (auto q = i.second.begin();
2004 q != i.second.end();
2005 ++q) {
2006 insert_purged_snap_update(i.first, q.get_start(), q.get_end(),
2007 pending_inc.epoch,
2008 t);
11fdf7f2 2009 }
9f95a23c
TL
2010 }
2011 for (auto& [pool, snaps] : pending_pseudo_purged_snaps) {
2012 for (auto snap : snaps) {
2013 insert_purged_snap_update(pool, snap, snap + 1,
2014 pending_inc.epoch,
2015 t);
7c673cae 2016 }
7c673cae 2017 }
224ce89b
WB
2018
2019 // health
2020 health_check_map_t next;
92f5a8d4 2021 tmp.check_health(cct, &next);
224ce89b 2022 encode_health(next, t);
7c673cae
FG
2023}
2024
7c673cae
FG
2025int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
2026{
2027 bufferlist bl;
2028 int r = mon->store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
2029 if (r < 0)
2030 return r;
2031 try {
11fdf7f2
TL
2032 auto p = bl.cbegin();
2033 decode(m, p);
7c673cae
FG
2034 }
2035 catch (buffer::error& e) {
2036 if (err)
2037 *err << "osd." << osd << " metadata is corrupt";
2038 return -EIO;
2039 }
2040 return 0;
2041}
2042
c07f9fc5 2043void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
31f18b77 2044{
31f18b77
FG
2045 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2046 if (osdmap.is_up(osd)) {
2047 map<string,string> meta;
2048 load_metadata(osd, meta, nullptr);
2049 auto p = meta.find(field);
2050 if (p == meta.end()) {
c07f9fc5 2051 (*out)["unknown"]++;
31f18b77 2052 } else {
c07f9fc5 2053 (*out)[p->second]++;
31f18b77
FG
2054 }
2055 }
2056 }
c07f9fc5
FG
2057}
2058
2059void OSDMonitor::count_metadata(const string& field, Formatter *f)
2060{
2061 map<string,int> by_val;
2062 count_metadata(field, &by_val);
31f18b77
FG
2063 f->open_object_section(field.c_str());
2064 for (auto& p : by_val) {
2065 f->dump_int(p.first.c_str(), p.second);
2066 }
2067 f->close_section();
2068}
2069
7c673cae
FG
2070int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
2071{
2072 map<string, string> metadata;
2073 int r = load_metadata(osd, metadata, nullptr);
2074 if (r < 0)
2075 return r;
2076
2077 auto it = metadata.find("osd_objectstore");
2078 if (it == metadata.end())
2079 return -ENOENT;
2080 *type = it->second;
2081 return 0;
2082}
2083
2084bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
2085 const pg_pool_t &pool,
2086 ostream *err)
2087{
2088 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2089 // since filestore osds could always join the pool later
2090 set<int> checked_osds;
11fdf7f2 2091 for (unsigned ps = 0; ps < std::min(8u, pool.get_pg_num()); ++ps) {
7c673cae 2092 vector<int> up, acting;
11fdf7f2 2093 pg_t pgid(ps, pool_id);
7c673cae
FG
2094 osdmap.pg_to_up_acting_osds(pgid, up, acting);
2095 for (int osd : up) {
2096 if (checked_osds.find(osd) != checked_osds.end())
2097 continue;
2098 string objectstore_type;
2099 int r = get_osd_objectstore_type(osd, &objectstore_type);
2100 // allow with missing metadata, e.g. due to an osd never booting yet
2101 if (r < 0 || objectstore_type == "bluestore") {
2102 checked_osds.insert(osd);
2103 continue;
2104 }
2105 *err << "osd." << osd << " uses " << objectstore_type;
2106 return false;
2107 }
2108 }
2109 return true;
2110}
2111
2112int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
2113{
2114 map<string,string> m;
2115 if (int r = load_metadata(osd, m, err))
2116 return r;
2117 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
2118 f->dump_string(p->first.c_str(), p->second);
2119 return 0;
2120}
2121
2122void OSDMonitor::print_nodes(Formatter *f)
2123{
2124 // group OSDs by their hosts
2125 map<string, list<int> > osds; // hostname => osd
2126 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
2127 map<string, string> m;
2128 if (load_metadata(osd, m, NULL)) {
2129 continue;
2130 }
2131 map<string, string>::iterator hostname = m.find("hostname");
2132 if (hostname == m.end()) {
2133 // not likely though
2134 continue;
2135 }
2136 osds[hostname->second].push_back(osd);
2137 }
2138
2139 dump_services(f, osds, "osd");
2140}
2141
2142void OSDMonitor::share_map_with_random_osd()
2143{
2144 if (osdmap.get_num_up_osds() == 0) {
2145 dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
2146 return;
2147 }
2148
2149 MonSession *s = mon->session_map.get_random_osd_session(&osdmap);
2150 if (!s) {
2151 dout(10) << __func__ << " no up osd on our session map" << dendl;
2152 return;
2153 }
2154
11fdf7f2
TL
2155 dout(10) << "committed, telling random " << s->name
2156 << " all about it" << dendl;
28e407b8
AA
2157
2158 // get feature of the peer
2159 // use quorum_con_features, if it's an anonymous connection.
2160 uint64_t features = s->con_features ? s->con_features :
2161 mon->get_quorum_con_features();
7c673cae 2162 // whatev, they'll request more if they need it
28e407b8 2163 MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
7c673cae
FG
2164 s->con->send_message(m);
2165 // NOTE: do *not* record osd has up to this epoch (as we do
2166 // elsewhere) as they may still need to request older values.
2167}
2168
11fdf7f2 2169version_t OSDMonitor::get_trim_to() const
7c673cae 2170{
31f18b77
FG
2171 if (mon->get_quorum().empty()) {
2172 dout(10) << __func__ << ": quorum not formed" << dendl;
2173 return 0;
2174 }
7c673cae 2175
11fdf7f2
TL
2176 {
2177 std::lock_guard<std::mutex> l(creating_pgs_lock);
2178 if (!creating_pgs.pgs.empty()) {
7c673cae
FG
2179 return 0;
2180 }
7c673cae 2181 }
11fdf7f2
TL
2182
2183 if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) {
2184 dout(0) << __func__
2185 << " blocking osdmap trim"
2186 " ('mon_debug_block_osdmap_trim' set to 'true')"
2187 << dendl;
2188 return 0;
2189 }
2190
7c673cae 2191 {
11fdf7f2 2192 epoch_t floor = get_min_last_epoch_clean();
7c673cae 2193 dout(10) << " min_last_epoch_clean " << floor << dendl;
11fdf7f2
TL
2194 if (g_conf()->mon_osd_force_trim_to > 0 &&
2195 g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) {
2196 floor = g_conf()->mon_osd_force_trim_to;
7c673cae
FG
2197 dout(10) << " explicit mon_osd_force_trim_to = " << floor << dendl;
2198 }
11fdf7f2 2199 unsigned min = g_conf()->mon_min_osdmap_epochs;
7c673cae
FG
2200 if (floor + min > get_last_committed()) {
2201 if (min < get_last_committed())
2202 floor = get_last_committed() - min;
2203 else
2204 floor = 0;
2205 }
2206 if (floor > get_first_committed())
2207 return floor;
2208 }
2209 return 0;
2210}
2211
2212epoch_t OSDMonitor::get_min_last_epoch_clean() const
2213{
2214 auto floor = last_epoch_clean.get_lower_bound(osdmap);
2215 // also scan osd epochs
2216 // don't trim past the oldest reported osd epoch
2217 for (auto& osd_epoch : osd_epochs) {
1911f103
TL
2218 if (osd_epoch.second < floor &&
2219 osdmap.is_out(osd_epoch.first)) {
7c673cae
FG
2220 floor = osd_epoch.second;
2221 }
2222 }
2223 return floor;
2224}
2225
2226void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
2227 version_t first)
2228{
2229 dout(10) << __func__ << " including full map for e " << first << dendl;
2230 bufferlist bl;
2231 get_version_full(first, bl);
2232 put_version_full(tx, first, bl);
11fdf7f2
TL
2233
2234 if (has_osdmap_manifest &&
2235 first > osdmap_manifest.get_first_pinned()) {
2236 _prune_update_trimmed(tx, first);
2237 }
7c673cae
FG
2238}
2239
11fdf7f2
TL
2240
2241/* full osdmap prune
2242 *
2243 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2244 */
2245
2246void OSDMonitor::load_osdmap_manifest()
2247{
2248 bool store_has_manifest =
2249 mon->store->exists(get_service_name(), "osdmap_manifest");
2250
2251 if (!store_has_manifest) {
2252 if (!has_osdmap_manifest) {
2253 return;
2254 }
2255
2256 dout(20) << __func__
2257 << " dropping osdmap manifest from memory." << dendl;
2258 osdmap_manifest = osdmap_manifest_t();
2259 has_osdmap_manifest = false;
2260 return;
2261 }
2262
2263 dout(20) << __func__
2264 << " osdmap manifest detected in store; reload." << dendl;
2265
2266 bufferlist manifest_bl;
2267 int r = get_value("osdmap_manifest", manifest_bl);
2268 if (r < 0) {
2269 derr << __func__ << " unable to read osdmap version manifest" << dendl;
2270 ceph_abort_msg("error reading manifest");
2271 }
2272 osdmap_manifest.decode(manifest_bl);
2273 has_osdmap_manifest = true;
2274
2275 dout(10) << __func__ << " store osdmap manifest pinned ("
2276 << osdmap_manifest.get_first_pinned()
2277 << " .. "
2278 << osdmap_manifest.get_last_pinned()
2279 << ")"
2280 << dendl;
2281}
2282
2283bool OSDMonitor::should_prune() const
2284{
2285 version_t first = get_first_committed();
2286 version_t last = get_last_committed();
2287 version_t min_osdmap_epochs =
2288 g_conf().get_val<int64_t>("mon_min_osdmap_epochs");
2289 version_t prune_min =
2290 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
2291 version_t prune_interval =
2292 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2293 version_t last_pinned = osdmap_manifest.get_last_pinned();
2294 version_t last_to_pin = last - min_osdmap_epochs;
2295
2296 // Make it or break it constraints.
2297 //
2298 // If any of these conditions fails, we will not prune, regardless of
2299 // whether we have an on-disk manifest with an on-going pruning state.
2300 //
2301 if ((last - first) <= min_osdmap_epochs) {
2302 // between the first and last committed epochs, we don't have
2303 // enough epochs to trim, much less to prune.
2304 dout(10) << __func__
2305 << " currently holding only " << (last - first)
2306 << " epochs (min osdmap epochs: " << min_osdmap_epochs
2307 << "); do not prune."
2308 << dendl;
2309 return false;
2310
2311 } else if ((last_to_pin - first) < prune_min) {
2312 // between the first committed epoch and the last epoch we would prune,
2313 // we simply don't have enough versions over the minimum to prune maps.
2314 dout(10) << __func__
2315 << " could only prune " << (last_to_pin - first)
2316 << " epochs (" << first << ".." << last_to_pin << "), which"
2317 " is less than the required minimum (" << prune_min << ")"
2318 << dendl;
2319 return false;
2320
2321 } else if (has_osdmap_manifest && last_pinned >= last_to_pin) {
2322 dout(10) << __func__
2323 << " we have pruned as far as we can; do not prune."
2324 << dendl;
2325 return false;
2326
2327 } else if (last_pinned + prune_interval > last_to_pin) {
2328 dout(10) << __func__
2329 << " not enough epochs to form an interval (last pinned: "
2330 << last_pinned << ", last to pin: "
2331 << last_to_pin << ", interval: " << prune_interval << ")"
2332 << dendl;
2333 return false;
2334 }
2335
2336 dout(15) << __func__
2337 << " should prune (" << last_pinned << ".." << last_to_pin << ")"
2338 << " lc (" << first << ".." << last << ")"
2339 << dendl;
2340 return true;
2341}
2342
2343void OSDMonitor::_prune_update_trimmed(
2344 MonitorDBStore::TransactionRef tx,
2345 version_t first)
2346{
2347 dout(10) << __func__
2348 << " first " << first
2349 << " last_pinned " << osdmap_manifest.get_last_pinned()
2350 << " last_pinned " << osdmap_manifest.get_last_pinned()
2351 << dendl;
2352
2353 osdmap_manifest_t manifest = osdmap_manifest;
2354
2355 if (!manifest.is_pinned(first)) {
2356 manifest.pin(first);
2357 }
2358
2359 set<version_t>::iterator p_end = manifest.pinned.find(first);
2360 set<version_t>::iterator p = manifest.pinned.begin();
2361 manifest.pinned.erase(p, p_end);
2362 ceph_assert(manifest.get_first_pinned() == first);
2363
2364 if (manifest.get_last_pinned() == first+1 ||
2365 manifest.pinned.size() == 1) {
2366 // we reached the end of the line, as pinned maps go; clean up our
2367 // manifest, and let `should_prune()` decide whether we should prune
2368 // again.
2369 tx->erase(get_service_name(), "osdmap_manifest");
2370 return;
2371 }
2372
2373 bufferlist bl;
2374 manifest.encode(bl);
2375 tx->put(get_service_name(), "osdmap_manifest", bl);
2376}
2377
2378void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
2379{
2380 dout(1) << __func__ << dendl;
2381
2382 version_t pin_first;
2383
2384 // verify constrainsts on stable in-memory state
2385 if (!has_osdmap_manifest) {
2386 // we must have never pruned, OR if we pruned the state must no longer
2387 // be relevant (i.e., the state must have been removed alongside with
2388 // the trim that *must* have removed past the last pinned map in a
2389 // previous prune).
2390 ceph_assert(osdmap_manifest.pinned.empty());
2391 ceph_assert(!mon->store->exists(get_service_name(), "osdmap_manifest"));
2392 pin_first = get_first_committed();
2393
2394 } else {
2395 // we must have pruned in the past AND its state is still relevant
2396 // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
2397 // and thus we still hold a manifest in the store).
2398 ceph_assert(!osdmap_manifest.pinned.empty());
2399 ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
2400 ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());
2401
2402 dout(10) << __func__
2403 << " first_pinned " << osdmap_manifest.get_first_pinned()
2404 << " last_pinned " << osdmap_manifest.get_last_pinned()
2405 << dendl;
2406
2407 pin_first = osdmap_manifest.get_last_pinned();
2408 }
2409
2410 manifest.pin(pin_first);
2411}
2412
2413bool OSDMonitor::_prune_sanitize_options() const
2414{
2415 uint64_t prune_interval =
2416 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2417 uint64_t prune_min =
2418 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
2419 uint64_t txsize =
2420 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2421
2422 bool r = true;
2423
2424 if (prune_interval == 0) {
2425 derr << __func__
2426 << " prune is enabled BUT prune interval is zero; abort."
2427 << dendl;
2428 r = false;
2429 } else if (prune_interval == 1) {
2430 derr << __func__
2431 << " prune interval is equal to one, which essentially means"
2432 " no pruning; abort."
2433 << dendl;
2434 r = false;
2435 }
2436 if (prune_min == 0) {
2437 derr << __func__
2438 << " prune is enabled BUT prune min is zero; abort."
2439 << dendl;
2440 r = false;
2441 }
2442 if (prune_interval > prune_min) {
2443 derr << __func__
2444 << " impossible to ascertain proper prune interval because"
2445 << " it is greater than the minimum prune epochs"
2446 << " (min: " << prune_min << ", interval: " << prune_interval << ")"
2447 << dendl;
2448 r = false;
2449 }
2450
2451 if (txsize < prune_interval - 1) {
2452 derr << __func__
2453 << "'mon_osdmap_full_prune_txsize' (" << txsize
2454 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
2455 << "); abort." << dendl;
2456 r = false;
2457 }
2458 return r;
2459}
2460
2461bool OSDMonitor::is_prune_enabled() const {
2462 return g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
2463}
2464
2465bool OSDMonitor::is_prune_supported() const {
2466 return mon->get_required_mon_features().contains_any(
2467 ceph::features::mon::FEATURE_OSDMAP_PRUNE);
2468}
2469
2470/** do_prune
2471 *
2472 * @returns true if has side-effects; false otherwise.
2473 */
2474bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
2475{
2476 bool enabled = is_prune_enabled();
2477
2478 dout(1) << __func__ << " osdmap full prune "
2479 << ( enabled ? "enabled" : "disabled")
2480 << dendl;
2481
2482 if (!enabled || !_prune_sanitize_options() || !should_prune()) {
2483 return false;
2484 }
2485
2486 // we are beyond the minimum prune versions, we need to remove maps because
2487 // otherwise the store will grow unbounded and we may end up having issues
2488 // with available disk space or store hangs.
2489
2490 // we will not pin all versions. We will leave a buffer number of versions.
2491 // this allows us the monitor to trim maps without caring too much about
2492 // pinned maps, and then allow us to use another ceph-mon without these
2493 // capabilities, without having to repair the store.
2494
2495 osdmap_manifest_t manifest = osdmap_manifest;
2496
2497 version_t first = get_first_committed();
2498 version_t last = get_last_committed();
2499
2500 version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
2501 version_t last_pinned = manifest.get_last_pinned();
2502 uint64_t prune_interval =
2503 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2504 uint64_t txsize =
2505 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2506
2507 prune_init(manifest);
2508
2509 // we need to get rid of some osdmaps
2510
2511 dout(5) << __func__
2512 << " lc (" << first << " .. " << last << ")"
2513 << " last_pinned " << last_pinned
2514 << " interval " << prune_interval
2515 << " last_to_pin " << last_to_pin
2516 << dendl;
2517
2518 // We will be erasing maps as we go.
2519 //
2520 // We will erase all maps between `last_pinned` and the `next_to_pin`.
2521 //
2522 // If `next_to_pin` happens to be greater than `last_to_pin`, then
2523 // we stop pruning. We could prune the maps between `next_to_pin` and
2524 // `last_to_pin`, but by not doing it we end up with neater pruned
2525 // intervals, aligned with `prune_interval`. Besides, this should not be a
2526 // problem as long as `prune_interval` is set to a sane value, instead of
2527 // hundreds or thousands of maps.
2528
2529 auto map_exists = [this](version_t v) {
2530 string k = mon->store->combine_strings("full", v);
2531 return mon->store->exists(get_service_name(), k);
2532 };
2533
2534 // 'interval' represents the number of maps from the last pinned
2535 // i.e., if we pinned version 1 and have an interval of 10, we're pinning
2536 // version 11 next; all intermediate versions will be removed.
2537 //
2538 // 'txsize' represents the maximum number of versions we'll be removing in
2539 // this iteration. If 'txsize' is large enough to perform multiple passes
2540 // pinning and removing maps, we will do so; if not, we'll do at least one
2541 // pass. We are quite relaxed about honouring 'txsize', but we'll always
2542 // ensure that we never go *over* the maximum.
2543
2544 // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
2545 uint64_t removal_interval = prune_interval - 1;
2546
2547 if (txsize < removal_interval) {
2548 dout(5) << __func__
2549 << " setting txsize to removal interval size ("
2550 << removal_interval << " versions"
2551 << dendl;
2552 txsize = removal_interval;
2553 }
2554 ceph_assert(removal_interval > 0);
2555
2556 uint64_t num_pruned = 0;
2557 while (num_pruned + removal_interval <= txsize) {
2558 last_pinned = manifest.get_last_pinned();
2559
2560 if (last_pinned + prune_interval > last_to_pin) {
2561 break;
2562 }
2563 ceph_assert(last_pinned < last_to_pin);
2564
2565 version_t next_pinned = last_pinned + prune_interval;
2566 ceph_assert(next_pinned <= last_to_pin);
2567 manifest.pin(next_pinned);
2568
2569 dout(20) << __func__
2570 << " last_pinned " << last_pinned
2571 << " next_pinned " << next_pinned
2572 << " num_pruned " << num_pruned
2573 << " removal interval (" << (last_pinned+1)
2574 << ".." << (next_pinned-1) << ")"
2575 << " txsize " << txsize << dendl;
2576
2577 ceph_assert(map_exists(last_pinned));
2578 ceph_assert(map_exists(next_pinned));
2579
2580 for (version_t v = last_pinned+1; v < next_pinned; ++v) {
2581 ceph_assert(!manifest.is_pinned(v));
2582
2583 dout(20) << __func__ << " pruning full osdmap e" << v << dendl;
2584 string full_key = mon->store->combine_strings("full", v);
2585 tx->erase(get_service_name(), full_key);
2586 ++num_pruned;
2587 }
2588 }
2589
2590 ceph_assert(num_pruned > 0);
2591
2592 bufferlist bl;
2593 manifest.encode(bl);
2594 tx->put(get_service_name(), "osdmap_manifest", bl);
2595
2596 return true;
2597}
2598
2599
7c673cae
FG
2600// -------------
2601
2602bool OSDMonitor::preprocess_query(MonOpRequestRef op)
2603{
2604 op->mark_osdmon_event(__func__);
2605 Message *m = op->get_req();
2606 dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
2607
2608 switch (m->get_type()) {
2609 // READs
2610 case MSG_MON_COMMAND:
f64942e4
AA
2611 try {
2612 return preprocess_command(op);
11fdf7f2 2613 } catch (const bad_cmd_get& e) {
f64942e4
AA
2614 bufferlist bl;
2615 mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
2616 return true;
2617 }
7c673cae
FG
2618 case CEPH_MSG_MON_GET_OSDMAP:
2619 return preprocess_get_osdmap(op);
2620
2621 // damp updates
2622 case MSG_OSD_MARK_ME_DOWN:
2623 return preprocess_mark_me_down(op);
9f95a23c
TL
2624 case MSG_OSD_MARK_ME_DEAD:
2625 return preprocess_mark_me_dead(op);
7c673cae
FG
2626 case MSG_OSD_FULL:
2627 return preprocess_full(op);
2628 case MSG_OSD_FAILURE:
2629 return preprocess_failure(op);
2630 case MSG_OSD_BOOT:
2631 return preprocess_boot(op);
2632 case MSG_OSD_ALIVE:
2633 return preprocess_alive(op);
2634 case MSG_OSD_PG_CREATED:
2635 return preprocess_pg_created(op);
11fdf7f2
TL
2636 case MSG_OSD_PG_READY_TO_MERGE:
2637 return preprocess_pg_ready_to_merge(op);
7c673cae
FG
2638 case MSG_OSD_PGTEMP:
2639 return preprocess_pgtemp(op);
2640 case MSG_OSD_BEACON:
2641 return preprocess_beacon(op);
2642
2643 case CEPH_MSG_POOLOP:
2644 return preprocess_pool_op(op);
2645
2646 case MSG_REMOVE_SNAPS:
2647 return preprocess_remove_snaps(op);
2648
9f95a23c
TL
2649 case MSG_MON_GET_PURGED_SNAPS:
2650 return preprocess_get_purged_snaps(op);
2651
7c673cae
FG
2652 default:
2653 ceph_abort();
2654 return true;
2655 }
2656}
2657
2658bool OSDMonitor::prepare_update(MonOpRequestRef op)
2659{
2660 op->mark_osdmon_event(__func__);
2661 Message *m = op->get_req();
2662 dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;
2663
2664 switch (m->get_type()) {
2665 // damp updates
2666 case MSG_OSD_MARK_ME_DOWN:
2667 return prepare_mark_me_down(op);
9f95a23c
TL
2668 case MSG_OSD_MARK_ME_DEAD:
2669 return prepare_mark_me_dead(op);
7c673cae
FG
2670 case MSG_OSD_FULL:
2671 return prepare_full(op);
2672 case MSG_OSD_FAILURE:
2673 return prepare_failure(op);
2674 case MSG_OSD_BOOT:
2675 return prepare_boot(op);
2676 case MSG_OSD_ALIVE:
2677 return prepare_alive(op);
2678 case MSG_OSD_PG_CREATED:
2679 return prepare_pg_created(op);
2680 case MSG_OSD_PGTEMP:
2681 return prepare_pgtemp(op);
11fdf7f2
TL
2682 case MSG_OSD_PG_READY_TO_MERGE:
2683 return prepare_pg_ready_to_merge(op);
7c673cae
FG
2684 case MSG_OSD_BEACON:
2685 return prepare_beacon(op);
2686
2687 case MSG_MON_COMMAND:
f64942e4
AA
2688 try {
2689 return prepare_command(op);
11fdf7f2 2690 } catch (const bad_cmd_get& e) {
f64942e4
AA
2691 bufferlist bl;
2692 mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
2693 return true;
2694 }
7c673cae
FG
2695
2696 case CEPH_MSG_POOLOP:
2697 return prepare_pool_op(op);
2698
2699 case MSG_REMOVE_SNAPS:
2700 return prepare_remove_snaps(op);
2701
2702
2703 default:
2704 ceph_abort();
2705 }
2706
2707 return false;
2708}
2709
2710bool OSDMonitor::should_propose(double& delay)
2711{
2712 dout(10) << "should_propose" << dendl;
2713
2714 // if full map, propose immediately! any subsequent changes will be clobbered.
2715 if (pending_inc.fullmap.length())
2716 return true;
2717
2718 // adjust osd weights?
2719 if (!osd_weight.empty() &&
2720 osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
2721 dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
2722 osdmap.adjust_osd_weights(osd_weight, pending_inc);
2723 delay = 0.0;
2724 osd_weight.clear();
2725 return true;
2726 }
2727
7c673cae
FG
2728 return PaxosService::should_propose(delay);
2729}
2730
2731
2732
2733// ---------------------------
2734// READs
2735
2736bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
2737{
2738 op->mark_osdmon_event(__func__);
9f95a23c 2739 auto m = op->get_req<MMonGetOSDMap>();
28e407b8
AA
2740
2741 uint64_t features = mon->get_quorum_con_features();
11fdf7f2
TL
2742 if (op->get_session() && op->get_session()->con_features)
2743 features = op->get_session()->con_features;
28e407b8 2744
7c673cae 2745 dout(10) << __func__ << " " << *m << dendl;
28e407b8 2746 MOSDMap *reply = new MOSDMap(mon->monmap->fsid, features);
7c673cae
FG
2747 epoch_t first = get_first_committed();
2748 epoch_t last = osdmap.get_epoch();
11fdf7f2
TL
2749 int max = g_conf()->osd_map_message_max;
2750 ssize_t max_bytes = g_conf()->osd_map_message_max_bytes;
2751 for (epoch_t e = std::max(first, m->get_full_first());
2752 e <= std::min(last, m->get_full_last()) && max > 0 && max_bytes > 0;
7c673cae 2753 ++e, --max) {
11fdf7f2
TL
2754 bufferlist& bl = reply->maps[e];
2755 int r = get_version_full(e, features, bl);
2756 ceph_assert(r >= 0);
2757 max_bytes -= bl.length();
7c673cae 2758 }
11fdf7f2
TL
2759 for (epoch_t e = std::max(first, m->get_inc_first());
2760 e <= std::min(last, m->get_inc_last()) && max > 0 && max_bytes > 0;
7c673cae 2761 ++e, --max) {
11fdf7f2
TL
2762 bufferlist& bl = reply->incremental_maps[e];
2763 int r = get_version(e, features, bl);
2764 ceph_assert(r >= 0);
2765 max_bytes -= bl.length();
7c673cae
FG
2766 }
2767 reply->oldest_map = first;
2768 reply->newest_map = last;
2769 mon->send_reply(op, reply);
2770 return true;
2771}
2772
2773
2774// ---------------------------
2775// UPDATEs
2776
2777// failure --
2778
11fdf7f2 2779bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
7c673cae 2780 // check permissions
11fdf7f2 2781 MonSession *session = op->get_session();
7c673cae
FG
2782 if (!session)
2783 return true;
2784 if (!session->is_capable("osd", MON_CAP_X)) {
2785 dout(0) << "got MOSDFailure from entity with insufficient caps "
2786 << session->caps << dendl;
2787 return true;
2788 }
2789 if (fsid != mon->monmap->fsid) {
2790 dout(0) << "check_source: on fsid " << fsid
2791 << " != " << mon->monmap->fsid << dendl;
2792 return true;
2793 }
2794 return false;
2795}
2796
2797
// Fast-path screening of an MOSDFailure report.
//
// Returns true when the message was fully handled here (dropped, or answered
// with incremental maps) and needs no paxos update; false when the report is
// new and must proceed to prepare_failure().  Every rejected path sends the
// reporter newer maps when its epoch is stale, so it can catch up and stop
// re-reporting.
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  // who is target_osd
  int badboy = m->get_target_osd();

  // check permissions
  if (check_source(op, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
        !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
        (osdmap.is_down(from) && m->if_osd_failed())) {
      // reporter is unknown, at a stale address, or itself down: ignore,
      // but ship it newer maps so it learns its own state.
      dout(5) << "preprocess_failure from dead osd." << from
              << ", ignoring" << dendl;
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?
  if (osdmap.is_down(badboy)) {
    dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
            << " " << m->get_target_addrs()
            << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  // report is about an old instance of this osd id at a different address
  if (osdmap.get_addrs(badboy) != m->get_target_addrs()) {
    dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
            << " " << m->get_target_addrs()
            << " != map's " << osdmap.get_addrs(badboy)
            << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    // target already went down, or came back up after the reported epoch
    dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
            << " " << m->get_target_addrs()
            << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // NODOWN flag / min up ratio may veto the mark-down entirely
  if (!can_mark_down(badboy)) {
    dout(5) << "preprocess_failure ignoring report of osd."
            << m->get_target_osd() << " " << m->get_target_addrs()
            << " from " << m->get_orig_source() << dendl;
    goto didit;
  }

  dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
           << " " << m->get_target_addrs()
           << ", from " << m->get_orig_source() << dendl;
  return false;

 didit:
  mon->no_reply(op);
  return true;
}
2869
// Completion context used to acknowledge an MOSDMarkMeDown request once the
// mark-down proposal has finished (or to retry it on -EAGAIN).
class C_AckMarkedDown : public C_MonOp {
  OSDMonitor *osdmon;  // owning monitor service; used to reply or re-dispatch
public:
  C_AckMarkedDown(
    OSDMonitor *osdmon,
    MonOpRequestRef op)
    : C_MonOp(op), osdmon(osdmon) {}

  void _finish(int r) override {
    if (r == 0) {
      // success: echo the request back to the osd as the acknowledgement
      auto m = op->get_req<MOSDMarkMeDown>();
      osdmon->mon->send_reply(
        op,
        new MOSDMarkMeDown(
          m->fsid,
          m->target_osd,
          m->target_addrs,
          m->get_epoch(),
          false));   // ACK itself does not request an ack
    } else if (r == -EAGAIN) {
      // proposal raced with a map change; run the op through dispatch again
      osdmon->dispatch(op);
    } else {
      ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r);
    }
  }
  ~C_AckMarkedDown() override {
  }
};
2898
// Screen an MOSDMarkMeDown request (an osd asking to be marked down).
//
// Returns true when handled here (invalid source, stale instance, or NODOWN
// veto — acked immediately if the sender asked for one); false when the
// request should proceed to prepare_mark_me_down().
bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDown>();
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid))
    goto reply;

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd())
    goto reply;

  if (!osdmap.exists(from) ||
      osdmap.is_down(from) ||
      osdmap.get_addrs(from) != m->target_addrs) {
    // request refers to an osd instance that is gone or already down;
    // send newer maps so the sender can observe that itself
    dout(5) << "preprocess_mark_me_down from dead osd."
            << from << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    goto reply;
  }

  // no down might be set
  if (!can_mark_down(from))
    goto reply;

  dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
           << " " << m->target_addrs << dendl;
  return false;

 reply:
  if (m->request_ack) {
    // ack immediately; completing with r=0 sends the MOSDMarkMeDown echo
    Context *c(new C_AckMarkedDown(this, op));
    c->complete(0);
  }
  return true;
}
2937
// Queue the self-requested mark-down in the pending incremental map.
//
// Preconditions were established by preprocess_mark_me_down(); the asserts
// re-check them because pending state may have changed between the two calls.
// Always returns true: an osdmap update is required.
bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDown>();
  int target_osd = m->target_osd;

  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);

  mon->clog->info() << "osd." << target_osd << " marked itself down";
  // new_state bits are XORed into the map, so setting CEPH_OSD_UP here
  // flips the osd to down
  pending_inc.new_state[target_osd] = CEPH_OSD_UP;
  if (m->request_ack)
    wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
  return true;
}
2953
// Screen an MOSDMarkMeDead request (a down osd declaring itself dead).
//
// Returns true when handled/dropped here; false when it should proceed to
// prepare_mark_me_dead().  The osd must already be marked down in the map.
bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDead>();
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid)) {
    mon->no_reply(op);
    return true;
  }

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd()) {
    mon->no_reply(op);
    return true;
  }

  if (!osdmap.exists(from) ||
      !osdmap.is_down(from)) {
    // only a down osd can be declared dead; send maps so the sender
    // can see its current state
    dout(5) << __func__ << " from nonexistent or up osd." << from
            << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    mon->no_reply(op);
    return true;
  }

  return false;
}
2983
// Record the osd's self-declared dead_epoch in its xinfo via the pending
// incremental.  Always returns true: an osdmap update is required.
bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDead>();
  int target_osd = m->target_osd;

  ceph_assert(osdmap.is_down(target_osd));

  mon->clog->info() << "osd." << target_osd << " marked itself dead as of e"
                    << m->get_epoch();
  // copy the current xinfo into the pending map before mutating it
  if (!pending_inc.new_xinfo.count(target_osd)) {
    pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
  }
  pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
  wait_for_finished_proposal(
    op,
    new LambdaContext(
      [op, this] (int r) {
        if (r >= 0) {
          mon->no_reply(op); // ignore on success
        }
      }
      ));
  return true;
}
3009
7c673cae
FG
3010bool OSDMonitor::can_mark_down(int i)
3011{
31f18b77
FG
3012 if (osdmap.is_nodown(i)) {
3013 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
3014 << "will not mark it down" << dendl;
7c673cae
FG
3015 return false;
3016 }
31f18b77 3017
7c673cae
FG
3018 int num_osds = osdmap.get_num_osds();
3019 if (num_osds == 0) {
31f18b77 3020 dout(5) << __func__ << " no osds" << dendl;
7c673cae
FG
3021 return false;
3022 }
3023 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
3024 float up_ratio = (float)up / (float)num_osds;
11fdf7f2 3025 if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
31f18b77 3026 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
11fdf7f2 3027 << g_conf()->mon_osd_min_up_ratio
7c673cae
FG
3028 << ", will not mark osd." << i << " down" << dendl;
3029 return false;
3030 }
3031 return true;
3032}
3033
3034bool OSDMonitor::can_mark_up(int i)
3035{
31f18b77
FG
3036 if (osdmap.is_noup(i)) {
3037 dout(5) << __func__ << " osd." << i << " is marked as noup, "
3038 << "will not mark it up" << dendl;
7c673cae
FG
3039 return false;
3040 }
31f18b77 3041
7c673cae
FG
3042 return true;
3043}
3044
3045/**
3046 * @note the parameter @p i apparently only exists here so we can output the
3047 * osd's id on messages.
3048 */
3049bool OSDMonitor::can_mark_out(int i)
3050{
31f18b77
FG
3051 if (osdmap.is_noout(i)) {
3052 dout(5) << __func__ << " osd." << i << " is marked as noout, "
3053 << "will not mark it out" << dendl;
3054 return false;
3055 }
3056
7c673cae
FG
3057 int num_osds = osdmap.get_num_osds();
3058 if (num_osds == 0) {
3059 dout(5) << __func__ << " no osds" << dendl;
3060 return false;
3061 }
3062 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
3063 float in_ratio = (float)in / (float)num_osds;
11fdf7f2 3064 if (in_ratio < g_conf()->mon_osd_min_in_ratio) {
7c673cae
FG
3065 if (i >= 0)
3066 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
11fdf7f2 3067 << g_conf()->mon_osd_min_in_ratio
7c673cae
FG
3068 << ", will not mark osd." << i << " out" << dendl;
3069 else
3070 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
11fdf7f2 3071 << g_conf()->mon_osd_min_in_ratio
7c673cae
FG
3072 << ", will not mark osds out" << dendl;
3073 return false;
3074 }
3075
3076 return true;
3077}
3078
3079bool OSDMonitor::can_mark_in(int i)
3080{
31f18b77
FG
3081 if (osdmap.is_noin(i)) {
3082 dout(5) << __func__ << " osd." << i << " is marked as noin, "
3083 << "will not mark it in" << dendl;
7c673cae
FG
3084 return false;
3085 }
31f18b77 3086
7c673cae
FG
3087 return true;
3088}
3089
3090bool OSDMonitor::check_failures(utime_t now)
3091{
3092 bool found_failure = false;
3093 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3094 p != failure_info.end();
3095 ++p) {
3096 if (can_mark_down(p->first)) {
3097 found_failure |= check_failure(now, p->first, p->second);
3098 }
3099 }
3100 return found_failure;
3101}
3102
3103bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
3104{
3105 // already pending failure?
3106 if (pending_inc.new_state.count(target_osd) &&
3107 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3108 dout(10) << " already pending failure" << dendl;
3109 return true;
3110 }
3111
3112 set<string> reporters_by_subtree;
11fdf7f2
TL
3113 auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
3114 utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
7c673cae
FG
3115 utime_t max_failed_since = fi.get_failed_since();
3116 utime_t failed_for = now - max_failed_since;
3117
3118 utime_t grace = orig_grace;
3119 double my_grace = 0, peer_grace = 0;
3120 double decay_k = 0;
11fdf7f2
TL
3121 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
3122 double halflife = (double)g_conf()->mon_osd_laggy_halflife;
7c673cae
FG
3123 decay_k = ::log(.5) / halflife;
3124
3125 // scale grace period based on historical probability of 'lagginess'
3126 // (false positive failures due to slowness).
3127 const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
3128 double decay = exp((double)failed_for * decay_k);
3129 dout(20) << " halflife " << halflife << " decay_k " << decay_k
3130 << " failed_for " << failed_for << " decay " << decay << dendl;
3131 my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
3132 grace += my_grace;
3133 }
3134
3135 // consider the peers reporting a failure a proxy for a potential
3136 // 'subcluster' over the overall cluster that is similarly
3137 // laggy. this is clearly not true in all cases, but will sometimes
3138 // help us localize the grace correction to a subset of the system
3139 // (say, a rack with a bad switch) that is unhappy.
11fdf7f2 3140 ceph_assert(fi.reporters.size());
eafe8130 3141 for (auto p = fi.reporters.begin(); p != fi.reporters.end();) {
7c673cae
FG
3142 // get the parent bucket whose type matches with "reporter_subtree_level".
3143 // fall back to OSD if the level doesn't exist.
eafe8130
TL
3144 if (osdmap.exists(p->first)) {
3145 auto reporter_loc = osdmap.crush->get_full_location(p->first);
3146 if (auto iter = reporter_loc.find(reporter_subtree_level);
3147 iter == reporter_loc.end()) {
3148 reporters_by_subtree.insert("osd." + to_string(p->first));
3149 } else {
3150 reporters_by_subtree.insert(iter->second);
3151 }
3152 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
3153 const osd_xinfo_t& xi = osdmap.get_xinfo(p->first);
3154 utime_t elapsed = now - xi.down_stamp;
3155 double decay = exp((double)elapsed * decay_k);
3156 peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
3157 }
3158 ++p;
7c673cae 3159 } else {
eafe8130
TL
3160 fi.cancel_report(p->first);;
3161 p = fi.reporters.erase(p);
7c673cae
FG
3162 }
3163 }
3164
11fdf7f2 3165 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
7c673cae
FG
3166 peer_grace /= (double)fi.reporters.size();
3167 grace += peer_grace;
3168 }
3169
3170 dout(10) << " osd." << target_osd << " has "
3171 << fi.reporters.size() << " reporters, "
3172 << grace << " grace (" << orig_grace << " + " << my_grace
3173 << " + " << peer_grace << "), max_failed_since " << max_failed_since
3174 << dendl;
3175
3176 if (failed_for >= grace &&
11fdf7f2 3177 reporters_by_subtree.size() >= g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
7c673cae
FG
3178 dout(1) << " we have enough reporters to mark osd." << target_osd
3179 << " down" << dendl;
3180 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3181
31f18b77
FG
3182 mon->clog->info() << "osd." << target_osd << " failed ("
3183 << osdmap.crush->get_full_location_ordered_string(
3184 target_osd)
3185 << ") ("
3186 << (int)reporters_by_subtree.size()
3187 << " reporters from different "
7c673cae
FG
3188 << reporter_subtree_level << " after "
3189 << failed_for << " >= grace " << grace << ")";
3190 return true;
3191 }
3192 return false;
3193}
3194
// Immediately queue a mark-down (and dead_epoch update) for target_osd,
// bypassing the grace/reporter-count machinery.  Used when a peer reports
// an immediate failure (e.g. connection refused).  @p by is the reporting
// osd id, used only for the cluster log message.
void OSDMonitor::force_failure(int target_osd, int by)
{
  // already pending failure?
  if (pending_inc.new_state.count(target_osd) &&
      pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
    dout(10) << " already pending failure" << dendl;
    return;
  }

  dout(1) << " we're forcing failure of osd." << target_osd << dendl;
  // XOR semantics: setting CEPH_OSD_UP in new_state marks the osd down
  pending_inc.new_state[target_osd] = CEPH_OSD_UP;
  // record that it is dead as of the pending epoch, not merely down
  if (!pending_inc.new_xinfo.count(target_osd)) {
    pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
  }
  pending_inc.new_xinfo[target_osd].dead_epoch = pending_inc.epoch;

  mon->clog->info() << "osd." << target_osd << " failed ("
                    << osdmap.crush->get_full_location_ordered_string(target_osd)
                    << ") (connection refused reported by osd." << by << ")";
  return;
}
3216
// Apply an MOSDFailure report to the failure-tracking state.
//
// A "failed" report either forces an immediate mark-down (is_immediate) or is
// added to failure_info for osd.target, after which check_failure() decides
// whether enough evidence has accumulated.  A non-failed report cancels the
// reporter's earlier report.  Returns true when an osdmap update should be
// proposed, false otherwise.
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  dout(1) << "prepare_failure osd." << m->get_target_osd()
          << " " << m->get_target_addrs()
          << " from " << m->get_orig_source()
          << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target_osd();
  int reporter = m->get_orig_source().num();
  // preprocess_failure() verified these; re-assert since state may move
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());

  mon->no_reply(op);

  if (m->if_osd_failed()) {
    // calculate failure time
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      mon->clog->debug() << "osd." << m->get_target_osd()
                         << " reported immediately failed by "
                         << m->get_orig_source();
      force_failure(target_osd, reporter);
      return true;
    }
    mon->clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
                       << m->get_orig_source();

    failure_info_t& fi = failure_info[target_osd];
    // a previous report from the same reporter is superseded; release its op
    MonOpRequestRef old_op = fi.add_report(reporter, failed_since, op);
    if (old_op) {
      mon->no_reply(old_op);
    }

    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon->clog->debug() << "osd." << m->get_target_osd()
                       << " failure report canceled by "
                       << m->get_orig_source();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      MonOpRequestRef report_op = fi.cancel_report(reporter);
      if (report_op) {
        mon->no_reply(report_op);
      }
      if (fi.reporters.empty()) {
        dout(10) << " removing last failure_info for osd." << target_osd
                 << dendl;
        failure_info.erase(target_osd);
      } else {
        dout(10) << " failure_info for osd." << target_osd << " now "
                 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
  }

  return false;
}
3283
// After a map change, retire failure_info entries for osds that are now
// down: send each pending reporter the latest maps (so it sees the result)
// and drop its op.  Entries for still-up osds are kept.
void OSDMonitor::process_failures()
{
  map<int,failure_info_t>::iterator p = failure_info.begin();
  while (p != failure_info.end()) {
    if (osdmap.is_up(p->first)) {
      ++p;
    } else {
      dout(10) << "process_failures osd." << p->first << dendl;
      list<MonOpRequestRef> ls;
      p->second.take_report_messages(ls);
      // erase(p++): advance before the element is invalidated
      failure_info.erase(p++);

      while (!ls.empty()) {
        MonOpRequestRef o = ls.front();
        if (o) {
          o->mark_event(__func__);
          MOSDFailure *m = o->get_req<MOSDFailure>();
          send_latest(o, m->get_epoch());
          mon->no_reply(o);
        }
        ls.pop_front();
      }
    }
  }
}
3309
3310void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
3311{
3312 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
3313
3314 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3315 p != failure_info.end();
3316 ++p) {
3317 p->second.take_report_messages(ls);
3318 }
3319 failure_info.clear();
3320}
3321
3322
// boot --

// Screen an MOSDBoot request before it reaches prepare_boot().
//
// Returns true when the boot was fully handled here: rejected (bad caps,
// wrong fsid, blank address, missing required features, fsid clash) or
// answered directly (duplicate boot, stale message, NOUP veto).  Returns
// false when the boot is new and valid and must proceed to prepare_boot().
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
            << session->caps << dendl;
    goto ignore;
  }

  if (m->sb.cluster_fsid != mon->monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
            << " != " << mon->monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  ceph_assert(m->get_orig_source_inst().name.is_osd());

  // force all osds to have gone through luminous prior to upgrade to nautilus
  {
    vector<string> missing;
    if (!HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) {
      missing.push_back("CEPH_FEATURE_SERVER_LUMINOUS");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_JEWEL)) {
      missing.push_back("CEPH_FEATURE_SERVER_JEWEL");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) {
      missing.push_back("CEPH_FEATURE_SERVER_KRAKEN");
    }
    if (!HAVE_FEATURE(m->osd_features, OSD_RECOVERY_DELETES)) {
      missing.push_back("CEPH_FEATURE_OSD_RECOVERY_DELETES");
    }

    if (!missing.empty()) {
      using std::experimental::make_ostream_joiner;

      stringstream ss;
      copy(begin(missing), end(missing), make_ostream_joiner(ss, ";"));

      mon->clog->info() << "disallowing boot of OSD "
                        << m->get_orig_source_inst()
                        << " because the osd lacks " << ss.str();
      goto ignore;
    }
  }

  // make sure osd versions do not span more than 3 releases
  if (HAVE_FEATURE(m->osd_features, SERVER_OCTOPUS) &&
      osdmap.require_osd_release < ceph_release_t::mimic) {
    mon->clog->info() << "disallowing boot of octopus+ OSD "
                      << m->get_orig_source_inst()
                      << " because require_osd_release < mimic";
    goto ignore;
  }

  // The release check here is required because for OSD_PGLOG_HARDLIMIT,
  // we are reusing a jewel feature bit that was retired in luminous.
  if (osdmap.require_osd_release >= ceph_release_t::luminous &&
      osdmap.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT) &&
      !(m->osd_features & CEPH_FEATURE_OSD_PGLOG_HARDLIMIT)) {
    mon->clog->info() << "disallowing boot of OSD "
                      << m->get_orig_source_inst()
                      << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
    goto ignore;
  }

  // already booted?
  if (osdmap.is_up(from) &&
      osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
      osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
    // yup.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source()
            << " " << m->get_orig_source_addrs()
            << " =~ " << osdmap.get_addrs(from) << dendl;
    _booted(op, false);
    return true;
  }

  // same id but different fsid: a different physical osd is reusing the id
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
            << " clashes with existing osd: different fsid"
            << " (ours: " << osdmap.get_uuid(from)
            << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // stale boot message from before this osd's most recent up interval
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_addrs(from).legacy_equals(
        m->get_orig_source_addrs())) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3447
// Apply an MOSDBoot to the pending incremental map.
//
// Three cases: (1) the osd is still marked up under its old instance —
// queue a mark-down first and retry the boot after the proposal commits;
// (2) an up-marking for this osd is already pending — just wait; (3) fresh
// boot — record addresses, uuid, metadata, last-clean interval, laggy
// statistics and (optionally) an automatic mark-in, then ack via C_Booted
// once the proposal commits.  Returns true unless the osd id exceeds
// max_osd.
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  dout(7) << __func__ << " from " << m->get_source()
          << " sb " << m->sb
          << " client_addrs" << m->get_connection()->get_peer_addrs()
          << " cluster_addrs " << m->cluster_addrs
          << " hb_back_addrs " << m->hb_back_addrs
          << " hb_front_addrs " << m->hb_front_addrs
          << dendl;

  ceph_assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
            << osdmap.get_max_osd() << dendl;
    return false;
  }

  // effective current state, with pending XOR deltas applied
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up? mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down osd." << from << " "
            << osdmap.get_addrs(from) << dendl;
    // preprocess should have caught these; if not, assert.
    ceph_assert(!osdmap.get_addrs(from).legacy_equals(
                  m->get_orig_source_addrs()) ||
                !osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
    ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
        (pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
            << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addrs();
    pending_inc.new_up_cluster[from] = m->cluster_addrs;
    pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
    pending_inc.new_hb_front_up[from] = m->hb_front_addrs;

    down_pending_out.erase(from); // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
             << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this; if not, assert.
      ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd?
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
        dout(10) << " fresh osd; marking lost_at too" << dendl;
        pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata
    bufferlist osd_metadata;
    encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
        (m->sb.mounted == info.last_clean_begin &&
         m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
               << "[" << info.last_clean_begin << "," << info.last_clean_end
               << ") -> [" << begin << "-" << end << ")"
               << dendl;
      pending_inc.new_last_clean_interval[from] =
        pair<epoch_t,epoch_t>(begin, end);
    }

    // update laggy statistics in xinfo (exponential decay / weighting)
    if (pending_inc.new_xinfo.count(from) == 0)
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    osd_xinfo_t& xi = pending_inc.new_xinfo[from];
    if (m->boot_epoch == 0) {
      // first boot of this daemon instance: decay the laggy stats
      xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      // rebooting after having been marked down: treat as a laggy event
      if (xi.down_stamp.sec()) {
        int interval = ceph_clock_now().sec() -
          xi.down_stamp.sec();
        if (g_conf()->mon_osd_laggy_max_interval &&
            (interval > g_conf()->mon_osd_laggy_max_interval)) {
          interval = g_conf()->mon_osd_laggy_max_interval;
        }
        xi.laggy_interval =
          interval * g_conf()->mon_osd_laggy_weight +
          xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
        g_conf()->mon_osd_laggy_weight +
        xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?
    if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
         (oldstate & CEPH_OSD_AUTOOUT)) ||
        (g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
        (g_conf()->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
        // restore the weight it had before being auto-marked out, if any
        if (xi.old_weight > 0) {
          pending_inc.new_weight[from] = xi.old_weight;
          xi.old_weight = 0;
        } else {
          pending_inc.new_weight[from] = CEPH_OSD_IN;
        }
      } else {
        dout(7) << __func__ << " NOIN set, will not mark in "
                << m->get_orig_source_addr() << dendl;
      }
    }

    // wait
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
3602
// Acknowledge a boot (possibly a duplicate) by sending the osd all maps
// newer than the epoch it already has.  @p logit controls whether the boot
// is recorded in the cluster log.
void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  dout(7) << "_booted " << m->get_orig_source_inst()
          << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;

  if (logit) {
    mon->clog->info() << m->get_source() << " " << m->get_orig_source_addrs()
                      << " boot";
  }

  send_latest(op, m->sb.current_epoch+1);
}
3617
3618
3619// -------------
3620// full
3621
3622bool OSDMonitor::preprocess_full(MonOpRequestRef op)
3623{
3624 op->mark_osdmon_event(__func__);
9f95a23c 3625 auto m = op->get_req<MOSDFull>();
7c673cae
FG
3626 int from = m->get_orig_source().num();
3627 set<string> state;
3628 unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
3629
3630 // check permissions, ignore if failed
11fdf7f2 3631 MonSession *session = op->get_session();
7c673cae
FG
3632 if (!session)
3633 goto ignore;
3634 if (!session->is_capable("osd", MON_CAP_X)) {
3635 dout(0) << "MOSDFull from entity with insufficient privileges:"
3636 << session->caps << dendl;
3637 goto ignore;
3638 }
3639
3640 // ignore a full message from the osd instance that already went down
3641 if (!osdmap.exists(from)) {
3642 dout(7) << __func__ << " ignoring full message from nonexistent "
3643 << m->get_orig_source_inst() << dendl;
3644 goto ignore;
3645 }
3646 if ((!osdmap.is_up(from) &&
11fdf7f2
TL
3647 osdmap.get_most_recent_addrs(from).legacy_equals(
3648 m->get_orig_source_addrs())) ||
7c673cae 3649 (osdmap.is_up(from) &&
11fdf7f2 3650 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
7c673cae
FG
3651 dout(7) << __func__ << " ignoring full message from down "
3652 << m->get_orig_source_inst() << dendl;
3653 goto ignore;
3654 }
3655
3656 OSDMap::calc_state_set(osdmap.get_state(from), state);
3657
3658 if ((osdmap.get_state(from) & mask) == m->state) {
3659 dout(7) << __func__ << " state already " << state << " for osd." << from
3660 << " " << m->get_orig_source_inst() << dendl;
3661 _reply_map(op, m->version);
3662 goto ignore;
3663 }
3664
3665 dout(10) << __func__ << " want state " << state << " for osd." << from
3666 << " " << m->get_orig_source_inst() << dendl;
3667 return false;
3668
3669 ignore:
3670 return true;
3671}
3672
// Record the osd's requested fullness bits in the pending incremental.
//
// new_state entries are XOR deltas against the committed map, so the code
// computes the effective current fullness (committed XOR pending), and if it
// differs from the requested state, replaces the fullness portion of the
// pending delta with (committed ^ wanted).  Replies with maps once the
// proposal commits.  Always returns true.
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  const int from = m->get_orig_source().num();

  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask; // safety first

  // effective current fullness = committed state XOR pending delta
  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    if (p != pending_inc.new_state.end()) {
      // drop any previously-pending fullness bits; other bits are kept
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
            << " -> " << want_state_set << dendl;
  } else {
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
            << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
3710
// -------------
// alive

// Screen an MOSDAlive (up_thru advance request).  Returns true when handled
// here (bad caps, stale/down sender, or up_thru already satisfied — answered
// with maps); false when prepare_alive() must advance up_thru.
bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDAlive>();
  int from = m->get_orig_source().num();

  // check permissions, ignore if failed
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
            << session->caps << dendl;
    goto ignore;
  }

  if (!osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
    dout(7) << "preprocess_alive ignoring alive message from down "
            << m->get_orig_source() << " " << m->get_orig_source_addrs()
            << dendl;
    goto ignore;
  }

  if (osdmap.get_up_thru(from) >= m->want) {
    // yup.
    dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    return true;
  }

  dout(10) << "preprocess_alive want up_thru " << m->want
           << " from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3752
3753bool OSDMonitor::prepare_alive(MonOpRequestRef op)
3754{
3755 op->mark_osdmon_event(__func__);
9f95a23c 3756 auto m = op->get_req<MOSDAlive>();
7c673cae
FG
3757 int from = m->get_orig_source().num();
3758
3759 if (0) { // we probably don't care much about these
3760 mon->clog->debug() << m->get_orig_source_inst() << " alive";
3761 }
3762
3763 dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
3764 << " from " << m->get_orig_source_inst() << dendl;
3765
3766 update_up_thru(from, m->version); // set to the latest map the OSD has
3767 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
3768 return true;
3769}
3770
// Reply to the op's requester with the latest map(s), starting at epoch e
// (e == 0 means send the full map; see send_latest()).
void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
{
  op->mark_osdmon_event(__func__);
  dout(7) << "_reply_map " << e
          << " from " << op->get_req()->get_orig_source_inst()
          << dendl;
  send_latest(op, e);
}
3779
3780// pg_created
// pg_created
// Screen an MOSDPGCreated ack: drop it (return true) when there is no
// session or insufficient caps; otherwise return false so it is forwarded
// to the leader, which records it in prepare_pg_created().
bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGCreated>();
  dout(10) << __func__ << " " << *m << dendl;
  auto session = op->get_session();
  // no reply is ever sent for this message type
  mon->no_reply(op);
  if (!session) {
    dout(10) << __func__ << ": no monitor session!" << dendl;
    return true;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
         << "with insufficient privileges " << session->caps << dendl;
    return true;
  }
  // always forward the "created!" to the leader
  return false;
}
3800
3801bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
3802{
3803 op->mark_osdmon_event(__func__);
9f95a23c 3804 auto m = op->get_req<MOSDPGCreated>();
7c673cae
FG
3805 dout(10) << __func__ << " " << *m << dendl;
3806 auto src = m->get_orig_source();
3807 auto from = src.num();
3808 if (!src.is_osd() ||
3809 !mon->osdmon()->osdmap.is_up(from) ||
11fdf7f2
TL
3810 !mon->osdmon()->osdmap.get_addrs(from).legacy_equals(
3811 m->get_orig_source_addrs())) {
7c673cae
FG
3812 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
3813 return false;
3814 }
3815 pending_created_pgs.push_back(m->pgid);
3816 return true;
3817}
3818
11fdf7f2
TL
// Screen an MOSDPGReadyToMerge: validate caps, that the pool still exists,
// and that the named pg is exactly the current last pg of the pool (the
// merge source).  Return false to forward valid messages to the leader,
// true (after no_reply) to drop everything else.
bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGReadyToMerge>();
  dout(10) << __func__ << " " << *m << dendl;
  const pg_pool_t *pi;  // declared before the gotos so jumps stay legal
  auto session = op->get_session();
  if (!session) {
    dout(10) << __func__ << ": no monitor session!" << dendl;
    goto ignore;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
         << "with insufficient privileges " << session->caps << dendl;
    goto ignore;
  }
  pi = osdmap.get_pg_pool(m->pgid.pool());
  if (!pi) {
    derr << __func__ << " pool for " << m->pgid << " dne" << dendl;
    goto ignore;
  }
  // the merge already happened (pg_num shrank past this pg)
  if (pi->get_pg_num() <= m->pgid.ps()) {
    dout(20) << " pg_num " << pi->get_pg_num() << " already < " << m->pgid << dendl;
    goto ignore;
  }
  // only the current last pg in the pool can be the merge source
  if (pi->get_pg_num() != m->pgid.ps() + 1) {
    derr << " OSD trying to merge wrong pgid " << m->pgid << dendl;
    goto ignore;
  }
  // no merge is pending for this pg
  if (pi->get_pg_num_pending() > m->pgid.ps()) {
    dout(20) << " pg_num_pending " << pi->get_pg_num_pending() << " > " << m->pgid << dendl;
    goto ignore;
  }
  return false;

 ignore:
  mon->no_reply(op);
  return true;
}
3858
// Leader-side handling of MOSDPGReadyToMerge: either commit the pg_num
// decrement (merge) or back the pending merge off, working on a copy of
// the pool that already includes any pending changes.  Optionally injects
// an artificial merge bounce for testing.
bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGReadyToMerge>();
  dout(10) << __func__ << " " << *m << dendl;
  // start from the pending pool state if one exists, else the committed one
  pg_pool_t p;
  if (pending_inc.new_pools.count(m->pgid.pool()))
    p = pending_inc.new_pools[m->pgid.pool()];
  else
    p = *osdmap.get_pg_pool(m->pgid.pool());
  // the preprocess checks ran against the committed map; re-verify against
  // the pending state and retry after the next proposal if it moved
  if (p.get_pg_num() != m->pgid.ps() + 1 ||
      p.get_pg_num_pending() > m->pgid.ps()) {
    dout(10) << __func__
             << " race with concurrent pg_num[_pending] update, will retry"
             << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
    return true;
  }

  if (m->ready) {
    p.dec_pg_num(m->pgid,
                 pending_inc.epoch,
                 m->source_version,
                 m->target_version,
                 m->last_epoch_started,
                 m->last_epoch_clean);
    p.last_change = pending_inc.epoch;
  } else {
    // back off the merge attempt!
    p.set_pg_num_pending(p.get_pg_num());
  }

  // force pre-nautilus clients to resend their ops, since they
  // don't understand that pg_num_pending changes form a new interval
  p.last_force_op_resend_prenautilus = pending_inc.epoch;

  pending_inc.new_pools[m->pgid.pool()] = p;

  // test hook: with some probability, immediately bounce pg_num back up by
  // queueing an "osd pool set pg_num_actual" command on the same connection
  auto prob = g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability");
  if (m->ready &&
      prob > 0 &&
      prob > (double)(rand() % 1000)/1000.0) {
    derr << __func__ << " injecting pg merge pg_num bounce" << dendl;
    auto n = new MMonCommand(mon->monmap->get_fsid());
    n->set_connection(m->get_connection());
    n->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
               osdmap.get_pool_name(m->pgid.pool()) +
               "\", \"var\": \"pg_num_actual\", \"val\": \"" +
               stringify(m->pgid.ps() + 1) + "\"}" };
    MonOpRequestRef nop = mon->op_tracker.create_request<MonOpRequest>(n);
    nop->set_type_service();
    wait_for_finished_proposal(op, new C_RetryMessage(this, nop));
  } else {
    wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  }
  return true;
}
3916
3917
7c673cae
FG
3918// -------------
3919// pg_temp changes
3920
// Screen an MOSDPGTemp message: verify caps and that the sender is the
// live instance of an up osd, then decide per-pg whether the requested
// pg_temp mapping would actually change anything.  Returns false (forward
// to leader) as soon as one pg needs a change; returns true when the whole
// message can be answered or dropped here.
bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
{
  auto m = op->get_req<MOSDPGTemp>();
  dout(10) << "preprocess_pgtemp " << *m << dendl;
  mempool::osdmap::vector<int> empty;
  int from = m->get_orig_source().num();
  size_t ignore_cnt = 0;

  // check caps
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
            << session->caps << dendl;
    goto ignore;
  }

  // sender must be the up osd whose addrs match the map
  if (!osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
    dout(7) << "ignoring pgtemp message from down "
            << m->get_orig_source() << " " << m->get_orig_source_addrs()
            << dendl;
    goto ignore;
  }

  // a forced pg_temp always goes to the leader, bypassing the checks below
  if (m->forced) {
    return false;
  }

  for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    dout(20) << " " << p->first
             << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
             << " -> " << p->second << dendl;

    // does the pool exist?
    if (!osdmap.have_pg_pool(p->first.pool())) {
      /*
       * 1. If the osdmap does not have the pool, it means the pool has been
       *    removed in-between the osd sending this message and us handling it.
       * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
       *    not exist in the pending either, as the osds would not send a
       *    message about a pool they know nothing about (yet).
       * 3. However, if the pool does exist in the pending, then it must be a
       *    new pool, and not relevant to this message (see 1).
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
               << ": pool has been removed" << dendl;
      ignore_cnt++;
      continue;
    }

    int acting_primary = -1;
    osdmap.pg_to_up_acting_osds(
      p->first, nullptr, nullptr, nullptr, &acting_primary);
    if (acting_primary != from) {
      /* If the source isn't the primary based on the current osdmap, we know
       * that the interval changed and that we can discard this message.
       * Indeed, we must do so to avoid 16127 since we can't otherwise determine
       * which of two pg temp mappings on the same pg is more recent.
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
               << ": primary has changed" << dendl;
      ignore_cnt++;
      continue;
    }

    // removal?
    if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
                              osdmap.primary_temp->count(p->first)))
      return false;
    // change?
    // NOTE: we assume that this will clear pg_primary, so consider
    //       an existing pg_primary field to imply a change
    if (p->second.size() &&
        (osdmap.pg_temp->count(p->first) == 0 ||
         osdmap.pg_temp->get(p->first) != p->second ||
         osdmap.primary_temp->count(p->first)))
      return false;
  }

  // should we ignore all the pgs?
  if (ignore_cnt == m->pg_temp.size())
    goto ignore;

  // nothing would change; just ack with the current map epoch
  dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
  _reply_map(op, m->map_epoch);
  return true;

 ignore:
  return true;
}
4013
4014void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
4015{
4016 epoch_t old_up_thru = osdmap.get_up_thru(from);
4017 auto ut = pending_inc.new_up_thru.find(from);
4018 if (ut != pending_inc.new_up_thru.end()) {
4019 old_up_thru = ut->second;
4020 }
4021 if (up_thru > old_up_thru) {
4022 // set up_thru too, so the osd doesn't have to ask again
4023 pending_inc.new_up_thru[from] = up_thru;
4024 }
4025}
4026
// Leader-side handling of MOSDPGTemp: record the requested pg_temp
// mappings in the pending incremental (skipping pools that are gone or
// being removed), clear any primary_temp for those pgs, and bump the
// sender's up_thru.  Always returns true (proposal needed).
bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGTemp>();
  int from = m->get_orig_source().num();
  dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
  for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    uint64_t pool = p->first.pool();
    if (pending_inc.old_pools.count(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
               << ": pool pending removal" << dendl;
      continue;
    }
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
               << ": pool has been removed" << dendl;
      continue;
    }
    pending_inc.new_pg_temp[p->first] =
      mempool::osdmap::vector<int>(p->second.begin(), p->second.end());

    // unconditionally clear pg_primary (until this message can encode
    // a change for that, too.. at which point we need to also fix
    // preprocess_pg_temp)
    if (osdmap.primary_temp->count(p->first) ||
        pending_inc.new_primary_temp.count(p->first))
      pending_inc.new_primary_temp[p->first] = -1;
  }

  // set up_thru too, so the osd doesn't have to ask again
  update_up_thru(from, m->map_epoch);

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
  return true;
}
4062
4063
4064// ---
4065
// Screen an MRemoveSnaps message: check caps, then scan the requested
// snaps.  If any snap still needs removal, return false so the leader
// handles it in prepare_remove_snaps(); otherwise everything is already
// removed, so ack octopus+ senders directly and drop the message.
bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MRemoveSnaps>();
  dout(7) << "preprocess_remove_snaps " << *m << dendl;

  // check privilege, ignore if failed
  MonSession *session = op->get_session();
  mon->no_reply(op);
  if (!session)
    goto ignore;
  if (!session->caps.is_capable(
        cct,
        session->entity_name,
        "osd", "osd pool rmsnap", {}, true, true, false,
        session->get_peer_socket_addr())) {
    dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
            << session->caps << dendl;
    goto ignore;
  }

  for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
       q != m->snaps.end();
       ++q) {
    if (!osdmap.have_pg_pool(q->first)) {
      dout(10) << " ignoring removed_snaps " << q->second
               << " on non-existent pool " << q->first << dendl;
      continue;
    }
    const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
    for (vector<snapid_t>::iterator p = q->second.begin();
         p != q->second.end();
         ++p) {
      // a snap newer than the pool's snap_seq, or one not yet marked
      // removed, means there is real work to do -> forward to leader
      if (*p > pi->get_snap_seq() ||
          !_is_removed_snap(q->first, *p)) {
        return false;
      }
    }
  }

  // all requested snaps are already removed; octopus+ peers expect an
  // explicit MRemoveSnaps echo as the ack
  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
    auto reply = make_message<MRemoveSnaps>();
    reply->snaps = m->snaps;
    mon->send_reply(op, reply.detach());
  }

 ignore:
  return true;
}
4115
// Leader-side handling of MRemoveSnaps: for each snap not already removed
// (in the committed map or in any pending change), queue it for removal in
// the pending incremental, update the pool's snap bookkeeping, and ack
// octopus+ senders once the proposal commits.
bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MRemoveSnaps>();
  dout(7) << "prepare_remove_snaps " << *m << dendl;

  for (auto& [pool, snaps] : m->snaps) {
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << " ignoring removed_snaps " << snaps
               << " on non-existent pool " << pool << dendl;
      continue;
    }

    pg_pool_t& pi = osdmap.pools[pool];
    for (auto s : snaps) {
      // skip snaps already removed in the committed map, in a pending pool
      // update, or in the pending new_removed_snaps queue
      if (!_is_removed_snap(pool, s) &&
          (!pending_inc.new_pools.count(pool) ||
           !pending_inc.new_pools[pool].removed_snaps.contains(s)) &&
          (!pending_inc.new_removed_snaps.count(pool) ||
           !pending_inc.new_removed_snaps[pool].contains(s))) {
        pg_pool_t *newpi = pending_inc.get_new_pool(pool, &pi);
        // pre-octopus clusters still track removed snaps in the pool itself
        if (osdmap.require_osd_release < ceph_release_t::octopus) {
          newpi->removed_snaps.insert(s);
          dout(10) << " pool " << pool << " removed_snaps added " << s
                   << " (now " << newpi->removed_snaps << ")" << dendl;
        }
        newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
        // keep snap_seq monotonic with the highest removed snap
        if (s > newpi->get_snap_seq()) {
          dout(10) << " pool " << pool << " snap_seq "
                   << newpi->get_snap_seq() << " -> " << s << dendl;
          newpi->set_snap_seq(s);
        }
        newpi->set_snap_epoch(pending_inc.epoch);
        dout(10) << " added pool " << pool << " snap " << s
                 << " to removed_snaps queue" << dendl;
        pending_inc.new_removed_snaps[pool].insert(s);
      }
    }
  }

  // octopus+ peers expect an MRemoveSnaps echo as the ack, sent after the
  // proposal has been committed
  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
    auto reply = make_message<MRemoveSnaps>();
    reply->snaps = m->snaps;
    wait_for_finished_proposal(op, new C_ReplyOp(this, op, reply));
  }

  return true;
}
4164
4165bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op)
4166{
4167 op->mark_osdmon_event(__func__);
4168 auto m = op->get_req<MMonGetPurgedSnaps>();
4169 dout(7) << __func__ << " " << *m << dendl;
4170
4171 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> r;
4172
4173 string k = make_purged_snap_epoch_key(m->start);
4174 auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
4175 it->upper_bound(k);
4176 unsigned long epoch = m->last;
4177 while (it->valid()) {
4178 if (it->key().find("purged_epoch_") != 0) {
4179 break;
4180 }
4181 string k = it->key();
4182 int n = sscanf(k.c_str(), "purged_epoch_%lx", &epoch);
4183 if (n != 1) {
4184 derr << __func__ << " unable to parse key '" << it->key() << "'" << dendl;
4185 } else if (epoch > m->last) {
4186 break;
4187 } else {
4188 bufferlist bl = it->value();
4189 auto p = bl.cbegin();
4190 auto &v = r[epoch];
4191 try {
4192 ceph::decode(v, p);
4193 } catch (buffer::error& e) {
4194 derr << __func__ << " unable to parse value for key '" << it->key()
4195 << "': \n";
4196 bl.hexdump(*_dout);
4197 *_dout << dendl;
4198 }
4199 n += 4 + v.size() * 16;
4200 }
4201 if (n > 1048576) {
4202 // impose a semi-arbitrary limit to message size
4203 break;
4204 }
4205 it->next();
4206 }
4207
4208 auto reply = make_message<MMonGetPurgedSnapsReply>(m->start, epoch);
4209 reply->purged_snaps.swap(r);
4210 mon->send_reply(op, reply.detach());
4211
7c673cae
FG
4212 return true;
4213}
4214
4215// osd beacon
// osd beacon
// Screen an MOSDBeacon: drop it (return true) when there is no session or
// insufficient caps; otherwise forward to the leader (return false), which
// tracks beacon times to mark silent osds down.
bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  // check caps
  auto session = op->get_session();
  // beacons are never replied to
  mon->no_reply(op);
  if (!session) {
    dout(10) << __func__ << " no monitor session!" << dendl;
    return true;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
         << "with insufficient privileges " << session->caps << dendl;
    return true;
  }
  // Always forward the beacon to the leader, even if they are the same as
  // the old one. The leader will mark as down osds that haven't sent
  // beacon for a few minutes.
  return false;
}
4236
// Leader-side handling of MOSDBeacon: refresh the sender's last-report
// time, its known map epoch, and per-pg last_epoch_clean.  Returns true
// (proposal needed) only when the beacon advances last_purged_snaps_scrub
// in osd_xinfo; all other bookkeeping is in-memory only.
bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const auto beacon = op->get_req<MOSDBeacon>();
  const auto src = beacon->get_orig_source();
  dout(10) << __func__ << " " << *beacon
           << " from " << src << dendl;
  int from = src.num();

  // only accept from the live instance of an up osd
  if (!src.is_osd() ||
      !osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs())) {
    if (src.is_osd() && !osdmap.is_up(from)) {
      // share some new maps with this guy in case it may not be
      // aware of its own deadness...
      send_latest(op, beacon->version+1);
    }
    dout(1) << " ignoring beacon from non-active osd." << from << dendl;
    return false;
  }

  last_osd_report[from] = ceph_clock_now();
  osd_epochs[from] = beacon->version;

  for (const auto& pg : beacon->pgs) {
    last_epoch_clean.report(pg, beacon->min_last_epoch_clean);
  }

  // persist a newer last_purged_snaps_scrub via osd_xinfo, copying the
  // current xinfo into the pending incremental first if needed
  if (osdmap.osd_xinfo[from].last_purged_snaps_scrub <
      beacon->last_purged_snaps_scrub) {
    if (pending_inc.new_xinfo.count(from) == 0) {
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    }
    pending_inc.new_xinfo[from].last_purged_snaps_scrub =
      beacon->last_purged_snaps_scrub;
    return true;
  } else {
    return false;
  }
}
4277
4278// ---------------
4279// map helpers
4280
4281void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
4282{
4283 op->mark_osdmon_event(__func__);
4284 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
4285 << " start " << start << dendl;
4286 if (start == 0)
4287 send_full(op);
4288 else
4289 send_incremental(op, start);
4290}
4291
4292
28e407b8 4293MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
7c673cae 4294{
28e407b8
AA
4295 MOSDMap *r = new MOSDMap(mon->monmap->fsid, features);
4296 get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
7c673cae
FG
4297 r->oldest_map = get_first_committed();
4298 r->newest_map = osdmap.get_epoch();
4299 return r;
4300}
4301
// Build an MOSDMap carrying incremental maps for epochs [from..to],
// encoded for the given features.  If an incremental is missing for some
// epoch (-ENOENT), fall back to embedding the full map for that epoch;
// aborts if neither exists.  The caller owns the returned message.
MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
{
  dout(10) << "build_incremental [" << from << ".." << to << "] with features "
           << std::hex << features << std::dec << dendl;
  MOSDMap *m = new MOSDMap(mon->monmap->fsid, features);
  m->oldest_map = get_first_committed();
  m->newest_map = osdmap.get_epoch();

  // walk backwards; 'e > 0' guards against epoch_t underflow when from==0
  for (epoch_t e = to; e >= from && e > 0; e--) {
    bufferlist bl;
    int err = get_version(e, features, bl);
    if (err == 0) {
      ceph_assert(bl.length());
      // if (get_version(e, bl) > 0) {
      dout(20) << "build_incremental    inc " << e << " "
               << bl.length() << " bytes" << dendl;
      m->incremental_maps[e] = bl;
    } else {
      ceph_assert(err == -ENOENT);
      ceph_assert(!bl.length());
      get_version_full(e, features, bl);
      if (bl.length() > 0) {
        //else if (get_version("full", e, bl) > 0) {
        dout(20) << "build_incremental   full " << e << " "
                 << bl.length() << " bytes" << dendl;
        m->maps[e] = bl;
      } else {
        ceph_abort();  // we should have all maps.
      }
    }
  }
  return m;
}
4335
4336void OSDMonitor::send_full(MonOpRequestRef op)
4337{
4338 op->mark_osdmon_event(__func__);
4339 dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
28e407b8 4340 mon->send_reply(op, build_latest_full(op->get_session()->con_features));
7c673cae
FG
4341}
4342
// Send incremental maps starting at 'first' in reply to the op.  If the
// op arrived via a proxying monitor, route an MRoute back so the proxy
// does the send itself; otherwise send directly on this session.
void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
{
  op->mark_osdmon_event(__func__);

  MonSession *s = op->get_session();
  ceph_assert(s);

  if (s->proxy_con) {
    // oh, we can tell the other mon to do it
    dout(10) << __func__ << " asking proxying mon to send_incremental from "
             << first << dendl;
    MRoute *r = new MRoute(s->proxy_tid, NULL);
    r->send_osdmap_first = first;
    s->proxy_con->send_message(r);
    op->mark_event("reply: send routed send_osdmap_first reply");
  } else {
    // do it ourselves
    send_incremental(first, s, false, op);
  }
}
4363
// Send maps [first..current] to a session, tracking what the peer already
// has in session->osd_epoch.  If 'first' predates our oldest committed
// map, start with a full map at the oldest committed epoch.  With 'req'
// set, exactly one message is sent as a reply; with 'onetime', only one
// batch is sent; otherwise batches of osd_map_message_max are streamed
// until the peer is caught up.
void OSDMonitor::send_incremental(epoch_t first,
                                  MonSession *session,
                                  bool onetime,
                                  MonOpRequestRef req)
{
  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
          << " to " << session->name << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = session->con_features ? session->con_features :
    mon->get_quorum_con_features();

  // skip epochs the session is already known to have
  if (first <= session->osd_epoch) {
    dout(10) << __func__ << " " << session->name << " should already have epoch "
             << session->osd_epoch << dendl;
    first = session->osd_epoch + 1;
  }

  if (first < get_first_committed()) {
    // requested range starts before our trimmed history; begin with a full
    // map at the oldest epoch we still have
    MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
    m->oldest_map = get_first_committed();
    m->newest_map = osdmap.get_epoch();

    first = get_first_committed();
    bufferlist bl;
    int err = get_version_full(first, features, bl);
    ceph_assert(err == 0);
    ceph_assert(bl.length());
    dout(20) << "send_incremental starting with base full "
             << first << " " << bl.length() << " bytes" << dendl;
    m->maps[first] = bl;

    if (req) {
      // a request gets exactly one reply message
      mon->send_reply(req, m);
      session->osd_epoch = first;
      return;
    } else {
      session->con->send_message(m);
      session->osd_epoch = first;
    }
    first++;
  }

  while (first <= osdmap.get_epoch()) {
    // batch at most osd_map_message_max epochs per message
    epoch_t last = std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1,
                                     osdmap.get_epoch());
    MOSDMap *m = build_incremental(first, last, features);

    if (req) {
      // send some maps.  it may not be all of them, but it will get them
      // started.
      mon->send_reply(req, m);
    } else {
      session->con->send_message(m);
      first = last + 1;
    }
    session->osd_epoch = last;
    // reply path and onetime path send only one batch
    if (onetime || req)
      break;
  }
}
4426
// Fetch the encoded incremental for 'ver' using the quorum's connection
// features as the encoding feature set.
int OSDMonitor::get_version(version_t ver, bufferlist& bl)
{
  return get_version(ver, mon->get_quorum_con_features(), bl);
}
4431
// Re-encode an incremental map blob in place for the given feature set
// (intersected with the incremental's own canonical encode_features).
// Any embedded full map or crush map is decoded and re-encoded likewise.
void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
{
  OSDMap::Incremental inc;
  auto q = bl.cbegin();
  inc.decode(q);
  // always encode with subset of osdmap's canonical features
  uint64_t f = features & inc.encode_features;
  dout(20) << __func__ << " " << inc.epoch << " with features " << f
           << dendl;
  bl.clear();
  if (inc.fullmap.length()) {
    // embedded full map?
    OSDMap m;
    m.decode(inc.fullmap);
    inc.fullmap.clear();
    m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
  }
  if (inc.crush.length()) {
    // embedded crush map
    CrushWrapper c;
    auto p = inc.crush.cbegin();
    c.decode(p);
    inc.crush.clear();
    c.encode(inc.crush, f);
  }
  inc.encode(bl, f | CEPH_FEATURE_RESERVED);
}
4459
// Re-encode a full map blob in place for the given feature set
// (intersected with the map's own canonical encoding features).
void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
{
  OSDMap m;
  auto q = bl.cbegin();
  m.decode(q);
  // always encode with subset of osdmap's canonical features
  uint64_t f = features & m.get_encoding_features();
  dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
           << dendl;
  bl.clear();
  m.encode(bl, f | CEPH_FEATURE_RESERVED);
}
4472
// Fetch the encoded incremental for 'ver', re-encoded for 'features' when
// they differ (in significant bits) from the quorum features, with a
// per-(version, significant-features) cache in front of the store.
int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
{
  uint64_t significant_features = OSDMap::get_significant_features(features);
  if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
    return 0;
  }
  int ret = PaxosService::get_version(ver, bl);
  if (ret < 0) {
    return ret;
  }
  // NOTE: this check is imprecise; the OSDMap encoding features may
  // be a subset of the latest mon quorum features, but worst case we
  // reencode once and then cache the (identical) result under both
  // feature masks.
  if (significant_features !=
      OSDMap::get_significant_features(mon->get_quorum_con_features())) {
    reencode_incremental_map(bl, features);
  }
  inc_osd_cache.add_bytes({ver, significant_features}, bl);
  return 0;
}
4494
11fdf7f2
TL
// Load and decode the incremental for 'ver' into 'inc'.  Asserts the
// version exists (callers only ask for committed epochs).  Returns 0.
int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc)
{
  bufferlist inc_bl;
  int err = get_version(ver, inc_bl);
  ceph_assert(err == 0);
  ceph_assert(inc_bl.length());

  auto p = inc_bl.cbegin();
  inc.decode(p);
  dout(10) << __func__ << " "
           << " epoch " << inc.epoch
           << " inc_crc " << inc.inc_crc
           << " full_crc " << inc.full_crc
           << " encode_features " << inc.encode_features << dendl;
  return 0;
}
4511
// Reconstruct the full map for 'ver' from the osdmap manifest: start from
// the closest pinned (or cached) full map at or below 'ver' and replay
// incrementals up to 'ver', encoding the result into 'bl'.  Returns
// -ENOENT if no pinned map is available, else 0.
int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl)
{
  dout(10) << __func__ << " ver " << ver << dendl;

  version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver);
  if (closest_pinned == 0) {
    return -ENOENT;
  }
  if (closest_pinned > ver) {
    dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl;
  }
  ceph_assert(closest_pinned <= ver);

  dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl;

  // get osdmap incremental maps and apply on top of this one.
  bufferlist osdm_bl;
  bool has_cached_osdmap = false;
  // prefer a cached full map newer than the pinned one, to replay fewer
  // incrementals
  for (version_t v = ver-1; v >= closest_pinned; --v) {
    if (full_osd_cache.lookup({v, mon->get_quorum_con_features()},
                              &osdm_bl)) {
      dout(10) << __func__ << " found map in cache ver " << v << dendl;
      closest_pinned = v;
      has_cached_osdmap = true;
      break;
    }
  }

  if (!has_cached_osdmap) {
    int err = PaxosService::get_version_full(closest_pinned, osdm_bl);
    if (err != 0) {
      derr << __func__ << " closest pinned map ver " << closest_pinned
           << " not available! error: " << cpp_strerror(err) << dendl;
    }
    ceph_assert(err == 0);
  }

  ceph_assert(osdm_bl.length());

  OSDMap osdm;
  osdm.decode(osdm_bl);

  dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned
           << " e" << osdm.epoch
           << " crc " << osdm.get_crc()
           << " -- applying incremental maps." << dendl;

  uint64_t encode_features = 0;
  for (version_t v = closest_pinned + 1; v <= ver; ++v) {
    dout(20) << __func__ << " applying inc epoch " << v << dendl;

    OSDMap::Incremental inc;
    int err = get_inc(v, inc);
    ceph_assert(err == 0);

    encode_features = inc.encode_features;

    err = osdm.apply_incremental(inc);
    ceph_assert(err == 0);

    // this block performs paranoid checks on map retrieval
    if (g_conf().get_val<bool>("mon_debug_extra_checks") &&
        inc.full_crc != 0) {

      uint64_t f = encode_features;
      if (!f) {
        f = (mon->quorum_con_features ? mon->quorum_con_features : -1);
      }

      // encode osdmap to force calculating crcs
      bufferlist tbl;
      osdm.encode(tbl, f | CEPH_FEATURE_RESERVED);
      // decode osdmap to compare crcs with what's expected by incremental
      OSDMap tosdm;
      tosdm.decode(tbl);

      if (tosdm.get_crc() != inc.full_crc) {
        derr << __func__
             << " osdmap crc mismatch! (osdmap crc " << tosdm.get_crc()
             << ", expected " << inc.full_crc << ")" << dendl;
        ceph_abort_msg("osdmap crc mismatch");
      }
    }

    // note: we cannot add the recently computed map to the cache, as is,
    // because we have not encoded the map into a bl.
  }

  if (!encode_features) {
    dout(10) << __func__
             << " last incremental map didn't have features;"
             << " defaulting to quorum's or all" << dendl;
    encode_features =
      (mon->quorum_con_features ? mon->quorum_con_features : -1);
  }
  osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED);

  return 0;
}
4611
7c673cae
FG
// Fetch the encoded full map for 'ver' using the quorum's connection
// features as the encoding feature set.
int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
{
  return get_version_full(ver, mon->get_quorum_con_features(), bl);
}
4616
// Fetch the encoded full map for 'ver': consult the cache, then the store,
// then fall back to rebuilding from a pinned map (manifest) if the full
// map was trimmed.  Re-encode for 'features' when they differ (in
// significant bits) from the quorum features, and cache the result.
int OSDMonitor::get_version_full(version_t ver, uint64_t features,
                                 bufferlist& bl)
{
  uint64_t significant_features = OSDMap::get_significant_features(features);
  if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
    return 0;
  }
  int ret = PaxosService::get_version_full(ver, bl);
  if (ret == -ENOENT) {
    // build map?
    ret = get_full_from_pinned_map(ver, bl);
  }
  if (ret < 0) {
    return ret;
  }
  // NOTE: this check is imprecise; the OSDMap encoding features may
  // be a subset of the latest mon quorum features, but worst case we
  // reencode once and then cache the (identical) result under both
  // feature masks.
  if (significant_features !=
      OSDMap::get_significant_features(mon->get_quorum_con_features())) {
    reencode_full_map(bl, features);
  }
  full_osd_cache.add_bytes({ver, significant_features}, bl);
  return 0;
}
4643
11fdf7f2
TL
// Blacklist every address in the vector until the given time.  Returns the
// epoch of the pending incremental in which the entries will take effect.
epoch_t OSDMonitor::blacklist(const entity_addrvec_t& av, utime_t until)
{
  dout(10) << "blacklist " << av << " until " << until << dendl;
  for (auto a : av.v) {
    // nautilus+ maps store ANY-typed addrs; older maps use legacy type
    if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
      a.set_type(entity_addr_t::TYPE_ANY);
    } else {
      a.set_type(entity_addr_t::TYPE_LEGACY);
    }
    pending_inc.new_blacklist[a] = until;
  }
  return pending_inc.epoch;
}
4657
4658epoch_t OSDMonitor::blacklist(entity_addr_t a, utime_t until)
7c673cae 4659{
9f95a23c 4660 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
11fdf7f2
TL
4661 a.set_type(entity_addr_t::TYPE_ANY);
4662 } else {
4663 a.set_type(entity_addr_t::TYPE_LEGACY);
4664 }
7c673cae
FG
4665 dout(10) << "blacklist " << a << " until " << until << dendl;
4666 pending_inc.new_blacklist[a] = until;
4667 return pending_inc.epoch;
4668}
4669
4670
4671void OSDMonitor::check_osdmap_subs()
4672{
4673 dout(10) << __func__ << dendl;
4674 if (!osdmap.get_epoch()) {
4675 return;
4676 }
4677 auto osdmap_subs = mon->session_map.subs.find("osdmap");
4678 if (osdmap_subs == mon->session_map.subs.end()) {
4679 return;
4680 }
4681 auto p = osdmap_subs->second->begin();
4682 while (!p.end()) {
4683 auto sub = *p;
4684 ++p;
4685 check_osdmap_sub(sub);
4686 }
4687}
4688
4689void OSDMonitor::check_osdmap_sub(Subscription *sub)
4690{
4691 dout(10) << __func__ << " " << sub << " next " << sub->next
4692 << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
4693 if (sub->next <= osdmap.get_epoch()) {
4694 if (sub->next >= 1)
4695 send_incremental(sub->next, sub->session, sub->incremental_onetime);
4696 else
28e407b8 4697 sub->session->con->send_message(build_latest_full(sub->session->con_features));
7c673cae
FG
4698 if (sub->onetime)
4699 mon->session_map.remove_sub(sub);
4700 else
4701 sub->next = osdmap.get_epoch() + 1;
4702 }
4703}
4704
4705void OSDMonitor::check_pg_creates_subs()
4706{
7c673cae
FG
4707 if (!osdmap.get_num_up_osds()) {
4708 return;
4709 }
11fdf7f2 4710 ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
7c673cae
FG
4711 mon->with_session_map([this](const MonSessionMap& session_map) {
4712 auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
4713 if (pg_creates_subs == session_map.subs.end()) {
4714 return;
4715 }
4716 for (auto sub : *pg_creates_subs->second) {
4717 check_pg_creates_sub(sub);
4718 }
4719 });
4720}
4721
4722void OSDMonitor::check_pg_creates_sub(Subscription *sub)
4723{
11fdf7f2
TL
4724 dout(20) << __func__ << " .. " << sub->session->name << dendl;
4725 ceph_assert(sub->type == "osd_pg_creates");
7c673cae
FG
4726 // only send these if the OSD is up. we will check_subs() when they do
4727 // come up so they will get the creates then.
11fdf7f2
TL
4728 if (sub->session->name.is_osd() &&
4729 mon->osdmon()->osdmap.is_up(sub->session->name.num())) {
4730 sub->next = send_pg_creates(sub->session->name.num(),
7c673cae
FG
4731 sub->session->con.get(),
4732 sub->next);
4733 }
4734}
4735
c07f9fc5 4736void OSDMonitor::do_application_enable(int64_t pool_id,
11fdf7f2
TL
4737 const std::string &app_name,
4738 const std::string &app_key,
1911f103
TL
4739 const std::string &app_value,
4740 bool force)
c07f9fc5 4741{
11fdf7f2 4742 ceph_assert(paxos->is_plugged() && is_writeable());
c07f9fc5
FG
4743
4744 dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
4745 << dendl;
4746
9f95a23c 4747 ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);
35e4c445 4748
c07f9fc5 4749 auto pp = osdmap.get_pg_pool(pool_id);
11fdf7f2 4750 ceph_assert(pp != nullptr);
c07f9fc5
FG
4751
4752 pg_pool_t p = *pp;
4753 if (pending_inc.new_pools.count(pool_id)) {
4754 p = pending_inc.new_pools[pool_id];
4755 }
4756
11fdf7f2
TL
4757 if (app_key.empty()) {
4758 p.application_metadata.insert({app_name, {}});
4759 } else {
1911f103
TL
4760 if (force) {
4761 p.application_metadata[app_name][app_key] = app_value;
4762 } else {
4763 p.application_metadata.insert({app_name, {{app_key, app_value}}});
4764 }
11fdf7f2 4765 }
c07f9fc5
FG
4766 p.last_change = pending_inc.epoch;
4767 pending_inc.new_pools[pool_id] = p;
4768}
4769
494da23a
TL
4770void OSDMonitor::do_set_pool_opt(int64_t pool_id,
4771 pool_opts_t::key_t opt,
4772 pool_opts_t::value_t val)
4773{
4774 auto p = pending_inc.new_pools.try_emplace(
4775 pool_id, *osdmap.get_pg_pool(pool_id));
4776 p.first->second.opts.set(opt, val);
4777}
4778
31f18b77 4779unsigned OSDMonitor::scan_for_creating_pgs(
7c673cae
FG
4780 const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
4781 const mempool::osdmap::set<int64_t>& removed_pools,
4782 utime_t modified,
4783 creating_pgs_t* creating_pgs) const
4784{
31f18b77 4785 unsigned queued = 0;
7c673cae
FG
4786 for (auto& p : pools) {
4787 int64_t poolid = p.first;
11fdf7f2
TL
4788 if (creating_pgs->created_pools.count(poolid)) {
4789 dout(10) << __func__ << " already created " << poolid << dendl;
4790 continue;
4791 }
7c673cae 4792 const pg_pool_t& pool = p.second;
31f18b77 4793 int ruleno = osdmap.crush->find_rule(pool.get_crush_rule(),
7c673cae
FG
4794 pool.get_type(), pool.get_size());
4795 if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
4796 continue;
4797
4798 const auto last_scan_epoch = creating_pgs->last_scan_epoch;
4799 const auto created = pool.get_last_change();
4800 if (last_scan_epoch && created <= last_scan_epoch) {
4801 dout(10) << __func__ << " no change in pool " << poolid
4802 << " " << pool << dendl;
4803 continue;
4804 }
4805 if (removed_pools.count(poolid)) {
4806 dout(10) << __func__ << " pool is being removed: " << poolid
4807 << " " << pool << dendl;
4808 continue;
4809 }
31f18b77 4810 dout(10) << __func__ << " queueing pool create for " << poolid
7c673cae 4811 << " " << pool << dendl;
11fdf7f2
TL
4812 creating_pgs->create_pool(poolid, pool.get_pg_num(),
4813 created, modified);
4814 queued++;
7c673cae 4815 }
31f18b77 4816 return queued;
7c673cae
FG
4817}
4818
// Rebuild creating_pgs_by_osd_epoch (osd -> epoch -> set of pgs to
// create) from creating_pgs.pgs using the freshly computed osdmap
// mapping.  For each pg we record the epoch at which its current acting
// primary took over, so send_pg_creates() can resume a subscriber from
// its `next` epoch.  Takes creating_pgs_lock.
4819void OSDMonitor::update_creating_pgs()
4820{
31f18b77
FG
4821 dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
4822 << creating_pgs.queue.size() << " pools in queue" << dendl;
7c673cae
FG
4823 decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
4824 std::lock_guard<std::mutex> l(creating_pgs_lock);
c07f9fc5 4825 for (const auto& pg : creating_pgs.pgs) {
7c673cae
FG
4826 int acting_primary = -1;
4827 auto pgid = pg.first;
94b18763
FG
4828 if (!osdmap.pg_exists(pgid)) {
4829 dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
4830 << dendl;
4831 continue;
4832 }
// default to the epoch recorded when the pg's creation was queued
9f95a23c 4833 auto mapped = pg.second.create_epoch;
c07f9fc5 4834 dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
11fdf7f2
TL
4835 spg_t spgid(pgid);
4836 mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
7c673cae
FG
4837 // check the previous creating_pgs, look for the target to whom the pg was
4838 // previously mapped
4839 for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
4840 const auto last_acting_primary = pgs_by_epoch.first;
4841 for (auto& pgs: pgs_by_epoch.second) {
4842 if (pgs.second.count(spgid)) {
7c673cae
FG
4843 if (last_acting_primary == acting_primary) {
// same target as before: keep the epoch it was previously sent at
4844 mapped = pgs.first;
4845 } else {
4846 dout(20) << __func__ << " " << pgid << " "
4847 << " acting_primary:" << last_acting_primary
4848 << " -> " << acting_primary << dendl;
4849 // note epoch if the target of the create message changed.
4850 mapped = mapping.get_epoch();
4851 }
// NOTE(review): this break only leaves the inner (per-epoch) loop; the
// outer loop over other osds keeps running and its else-branch below can
// overwrite `mapped` again.  Presumably each spgid appears under at most
// one osd so this is harmless — verify.
4852 break;
31f18b77
FG
4853 } else {
4854 // newly creating
4855 mapped = mapping.get_epoch();
4856 }
7c673cae
FG
4857 }
4858 }
4859 dout(10) << __func__ << " will instruct osd." << acting_primary
c07f9fc5 4860 << " to create " << pgid << "@" << mapped << dendl;
4861 new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid);
7c673cae
FG
4862 }
4863 creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
4864 creating_pgs_epoch = mapping.get_epoch();
4865}
4866
// Send pg-create messages to `osd` over `con` for every creation epoch
// >= `next` that we have queued for it.  Uses the legacy MOSDPGCreate
// format when the cluster's minimum osd release predates nautilus,
// MOSDPGCreate2 otherwise.  Returns the epoch the subscription is now
// current through (last sent epoch + 1), or `next` unchanged when
// nothing was sent.  Takes creating_pgs_lock.
c07f9fc5 4867epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
7c673cae
FG
4868{
4869 dout(30) << __func__ << " osd." << osd << " next=" << next
4870 << " " << creating_pgs_by_osd_epoch << dendl;
4871 std::lock_guard<std::mutex> l(creating_pgs_lock);
b5b8bbf5
FG
4872 if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
4873 dout(20) << __func__
4874 << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
4875 // the subscribers will be updated when the mapping is completed anyway
4876 return next;
4877 }
7c673cae
FG
4878 auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
4879 if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
4880 return next;
11fdf7f2
TL
4881 ceph_assert(!creating_pgs_by_epoch->second.empty());
4882
4883 MOSDPGCreate *oldm = nullptr; // for pre-mimic OSD compat
4884 MOSDPGCreate2 *m = nullptr;
4885
// pick the message format from the cluster's minimum required release
9f95a23c 4886 bool old = osdmap.require_osd_release < ceph_release_t::nautilus;
7c673cae 4887
7c673cae
FG
4888 epoch_t last = 0;
4889 for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
4890 epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
4891 auto epoch = epoch_pgs->first;
4892 auto& pgs = epoch_pgs->second;
4893 dout(20) << __func__ << " osd." << osd << " from " << next
4894 << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
4895 last = epoch;
4896 for (auto& pg : pgs) {
7c673cae
FG
4897 // Need the create time from the monitor using its clock to set
4898 // last_scrub_stamp upon pg creation.
11fdf7f2
TL
4899 auto create = creating_pgs.pgs.find(pg.pgid);
4900 ceph_assert(create != creating_pgs.pgs.end());
4901 if (old) {
// lazily allocate the message the first time we have something to send
4902 if (!oldm) {
4903 oldm = new MOSDPGCreate(creating_pgs_epoch);
4904 }
4905 oldm->mkpg.emplace(pg.pgid,
9f95a23c
TL
4906 pg_create_t{create->second.create_epoch, pg.pgid, 0});
4907 oldm->ctimes.emplace(pg.pgid, create->second.create_stamp);
11fdf7f2
TL
4908 } else {
4909 if (!m) {
4910 m = new MOSDPGCreate2(creating_pgs_epoch);
4911 }
9f95a23c
TL
4912 m->pgs.emplace(pg, make_pair(create->second.create_epoch,
4913 create->second.create_stamp));
// also ship history/past_intervals when the pg has a recorded history
4914 if (create->second.history.epoch_created) {
4915 dout(20) << __func__ << " " << pg << " " << create->second.history
4916 << " " << create->second.past_intervals << dendl;
4917 m->pg_extra.emplace(pg, make_pair(create->second.history,
4918 create->second.past_intervals));
4919 }
11fdf7f2 4920 }
7c673cae 4921 dout(20) << __func__ << " will create " << pg
9f95a23c 4922 << " at " << create->second.create_epoch << dendl;
7c673cae
FG
4923 }
4924 }
11fdf7f2
TL
4925 if (m) {
4926 con->send_message(m);
4927 } else if (oldm) {
4928 con->send_message(oldm);
4929 } else {
7c673cae
FG
4930 dout(20) << __func__ << " osd." << osd << " from " << next
4931 << " has nothing to send" << dendl;
4932 return next;
4933 }
11fdf7f2 4934
7c673cae
FG
4935 // sub is current through last + 1
4936 return last + 1;
4937}
4938
4939// TICK
4940
4941
// Periodic maintenance.  On every monitor: refresh the osdmap manifest
// and retune the priority cache manager.  On the leader only: mark
// beacon-silent osds down, auto-out osds that have been down past their
// grace period, expire blacklist entries, prune purged snaps, refresh
// pool status, and propose the pending map if anything changed.
4942void OSDMonitor::tick()
4943{
4944 if (!is_active()) return;
4945
4946 dout(10) << osdmap << dendl;
4947
11fdf7f2
TL
4948 // always update osdmap manifest, regardless of being the leader.
4949 load_osdmap_manifest();
4950
1911f103
TL
4951 // always tune priority cache manager memory on leader and peons
4952 if (ceph_using_tcmalloc() && mon_memory_autotune) {
4953 std::lock_guard l(balancer_lock);
4954 if (pcm != nullptr) {
4955 pcm->tune_memory();
4956 pcm->balance();
4957 _set_new_cache_sizes();
4958 dout(10) << "tick balancer "
4959 << " inc cache_bytes: " << inc_cache->get_cache_bytes()
4960 << " inc comtd_bytes: " << inc_cache->get_committed_size()
4961 << " inc used_bytes: " << inc_cache->_get_used_bytes()
4962 << " inc num_osdmaps: " << inc_cache->_get_num_osdmaps()
4963 << dendl;
4964 dout(10) << "tick balancer "
4965 << " full cache_bytes: " << full_cache->get_cache_bytes()
4966 << " full comtd_bytes: " << full_cache->get_committed_size()
4967 << " full used_bytes: " << full_cache->_get_used_bytes()
4968 << " full num_osdmaps: " << full_cache->_get_num_osdmaps()
4969 << dendl;
4970 }
4971 }
4972
7c673cae
FG
// everything below is leader-only work
4973 if (!mon->is_leader()) return;
4974
4975 bool do_propose = false;
4976 utime_t now = ceph_clock_now();
4977
11fdf7f2 4978 if (handle_osd_timeouts(now, last_osd_report)) {
181888fb
FG
4979 do_propose = true;
4980 }
7c673cae
FG
4981
4982 // mark osds down?
11fdf7f2 4983 if (check_failures(now)) {
7c673cae 4984 do_propose = true;
11fdf7f2
TL
4985 }
4986
4987 // Force a proposal if we need to prune; pruning is performed on
4988 // ``encode_pending()``, hence why we need to regularly trigger a proposal
4989 // even if there's nothing going on.
4990 if (is_prune_enabled() && should_prune()) {
4991 do_propose = true;
4992 }
7c673cae
FG
4993
4994 // mark down osds out?
4995
4996 /* can_mark_out() checks if we can mark osds as being out. The -1 has no
4997 * influence at all. The decision is made based on the ratio of "in" osds,
4998 * and the function returns false if this ratio is lower that the minimum
11fdf7f2 4999 * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
7c673cae
FG
5000 */
5001 if (can_mark_out(-1)) {
11fdf7f2
TL
5002 string down_out_subtree_limit = g_conf().get_val<string>(
5003 "mon_osd_down_out_subtree_limit");
7c673cae
FG
5004 set<int> down_cache; // quick cache of down subtrees
5005
5006 map<int,utime_t>::iterator i = down_pending_out.begin();
5007 while (i != down_pending_out.end()) {
5008 int o = i->first;
5009 utime_t down = now;
5010 down -= i->second;
// advance now: this entry may be erased (down_pending_out.erase) below
5011 ++i;
5012
5013 if (osdmap.is_down(o) &&
5014 osdmap.is_in(o) &&
5015 can_mark_out(o)) {
11fdf7f2 5016 utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
7c673cae
FG
5017 utime_t grace = orig_grace;
5018 double my_grace = 0.0;
5019
11fdf7f2 5020 if (g_conf()->mon_osd_adjust_down_out_interval) {
7c673cae
FG
5021 // scale grace period the same way we do the heartbeat grace.
5022 const osd_xinfo_t& xi = osdmap.get_xinfo(o);
11fdf7f2 5023 double halflife = (double)g_conf()->mon_osd_laggy_halflife;
7c673cae
FG
5024 double decay_k = ::log(.5) / halflife;
5025 double decay = exp((double)down * decay_k);
5026 dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
5027 << " down for " << down << " decay " << decay << dendl;
5028 my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
5029 grace += my_grace;
5030 }
5031
5032 // is this an entire large subtree down?
11fdf7f2
TL
5033 if (down_out_subtree_limit.length()) {
5034 int type = osdmap.crush->get_type_id(down_out_subtree_limit);
7c673cae 5035 if (type > 0) {
11fdf7f2
TL
5036 if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
5037 dout(10) << "tick entire containing " << down_out_subtree_limit
5038 << " subtree for osd." << o
5039 << " is down; resetting timer" << dendl;
7c673cae
FG
5040 // reset timer, too.
5041 down_pending_out[o] = now;
5042 continue;
5043 }
5044 }
5045 }
5046
c07f9fc5 5047 bool down_out = !osdmap.is_destroyed(o) &&
11fdf7f2 5048 g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace;
c07f9fc5 5049 bool destroyed_out = osdmap.is_destroyed(o) &&
11fdf7f2 5050 g_conf()->mon_osd_destroyed_out_interval > 0 &&
c07f9fc5
FG
5051 // this is not precise enough as we did not make a note when this osd
5052 // was marked as destroyed, but let's not bother with that
5053 // complexity for now.
11fdf7f2 5054 down.sec() >= g_conf()->mon_osd_destroyed_out_interval;
c07f9fc5 5055 if (down_out || destroyed_out) {
7c673cae
FG
5056 dout(10) << "tick marking osd." << o << " OUT after " << down
5057 << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
5058 pending_inc.new_weight[o] = CEPH_OSD_OUT;
5059
5060 // set the AUTOOUT bit.
5061 if (pending_inc.new_state.count(o) == 0)
5062 pending_inc.new_state[o] = 0;
5063 pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;
5064
5065 // remember previous weight
5066 if (pending_inc.new_xinfo.count(o) == 0)
5067 pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
5068 pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];
5069
5070 do_propose = true;
5071
224ce89b
WB
5072 mon->clog->info() << "Marking osd." << o << " out (has been down for "
5073 << int(down.sec()) << " seconds)";
7c673cae
FG
5074 } else
5075 continue;
5076 }
5077
// either the osd came back / went out: stop tracking it here
5078 down_pending_out.erase(o);
5079 }
5080 } else {
5081 dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
5082 }
5083
5084 // expire blacklisted items?
5085 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
5086 p != osdmap.blacklist.end();
5087 ++p) {
5088 if (p->second < now) {
5089 dout(10) << "expiring blacklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
5090 pending_inc.old_blacklist.push_back(p->first);
5091 do_propose = true;
5092 }
5093 }
5094
11fdf7f2
TL
5095 if (try_prune_purged_snaps()) {
5096 do_propose = true;
7c673cae
FG
5097 }
5098
5099 if (update_pools_status())
5100 do_propose = true;
5101
5102 if (do_propose ||
5103 !pending_inc.new_pg_temp.empty()) // also propose if we adjusted pg_temp
5104 propose_pending();
eafe8130
TL
5105}
5106
5107void OSDMonitor::_set_new_cache_sizes()
5108{
5109 uint64_t cache_size = 0;
5110 int64_t inc_alloc = 0;
5111 int64_t full_alloc = 0;
5112 int64_t kv_alloc = 0;
5113
5114 if (pcm != nullptr && rocksdb_binned_kv_cache != nullptr) {
5115 cache_size = pcm->get_tuned_mem();
5116 inc_alloc = inc_cache->get_committed_size();
5117 full_alloc = full_cache->get_committed_size();
5118 kv_alloc = rocksdb_binned_kv_cache->get_committed_size();
5119 }
5120
5121 inc_osd_cache.set_bytes(inc_alloc);
5122 full_osd_cache.set_bytes(full_alloc);
5123
92f5a8d4 5124 dout(1) << __func__ << " cache_size:" << cache_size
eafe8130
TL
5125 << " inc_alloc: " << inc_alloc
5126 << " full_alloc: " << full_alloc
5127 << " kv_alloc: " << kv_alloc
5128 << dendl;
7c673cae
FG
5129}
5130
5131bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
5132 std::map<int,utime_t> &last_osd_report)
5133{
11fdf7f2 5134 utime_t timeo(g_conf()->mon_osd_report_timeout, 0);
7c673cae
FG
5135 if (now - mon->get_leader_since() < timeo) {
5136 // We haven't been the leader for long enough to consider OSD timeouts
5137 return false;
5138 }
5139
5140 int max_osd = osdmap.get_max_osd();
5141 bool new_down = false;
5142
5143 for (int i=0; i < max_osd; ++i) {
5144 dout(30) << __func__ << ": checking up on osd " << i << dendl;
c07f9fc5
FG
5145 if (!osdmap.exists(i)) {
5146 last_osd_report.erase(i); // if any
5147 continue;
5148 }
7c673cae
FG
5149 if (!osdmap.is_up(i))
5150 continue;
5151 const std::map<int,utime_t>::const_iterator t = last_osd_report.find(i);
5152 if (t == last_osd_report.end()) {
5153 // it wasn't in the map; start the timer.
5154 last_osd_report[i] = now;
5155 } else if (can_mark_down(i)) {
5156 utime_t diff = now - t->second;
5157 if (diff > timeo) {
31f18b77
FG
5158 mon->clog->info() << "osd." << i << " marked down after no beacon for "
5159 << diff << " seconds";
5160 derr << "no beacon from osd." << i << " since " << t->second
5161 << ", " << diff << " seconds ago. marking down" << dendl;
7c673cae
FG
5162 pending_inc.new_state[i] = CEPH_OSD_UP;
5163 new_down = true;
5164 }
5165 }
5166 }
5167 return new_down;
5168}
5169
11fdf7f2
TL
5170static void dump_cpu_list(Formatter *f, const char *name,
5171 const string& strlist)
7c673cae 5172{
11fdf7f2
TL
5173 cpu_set_t cpu_set;
5174 size_t cpu_set_size;
5175 if (parse_cpu_set_list(strlist.c_str(), &cpu_set_size, &cpu_set) < 0) {
5176 return;
5177 }
5178 set<int> cpus = cpu_set_to_set(cpu_set_size, &cpu_set);
5179 f->open_array_section(name);
5180 for (auto cpu : cpus) {
5181 f->dump_int("cpu", cpu);
7c673cae 5182 }
11fdf7f2 5183 f->close_section();
7c673cae
FG
5184}
5185
// Dump this monitor's osd-related state into formatter f: the osdmap
// itself, per-osd metadata, clean-epoch bookkeeping, committed-version
// range, the crush map, and (when loaded) the osdmap manifest.
5186void OSDMonitor::dump_info(Formatter *f)
5187{
5188 f->open_object_section("osdmap");
5189 osdmap.dump(f);
5190 f->close_section();
5191
5192 f->open_array_section("osd_metadata");
5193 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5194 if (osdmap.exists(i)) {
5195 f->open_object_section("osd");
5196 f->dump_unsigned("id", i);
// nullptr: per-osd metadata errors are ignored here
5197 dump_osd_metadata(i, f, NULL);
5198 f->close_section();
5199 }
5200 }
5201 f->close_section();
5202
1911f103
TL
5203 f->open_object_section("osdmap_clean_epochs");
5204 f->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());
5205
5206 f->open_object_section("last_epoch_clean");
5207 last_epoch_clean.dump(f);
5208 f->close_section();
5209
5210 f->open_array_section("osd_epochs");
5211 for (auto& osd_epoch : osd_epochs) {
5212 f->open_object_section("osd");
5213 f->dump_unsigned("id", osd_epoch.first);
5214 f->dump_unsigned("epoch", osd_epoch.second);
5215 f->close_section();
5216 }
5217 f->close_section(); // osd_epochs
5218
5219 f->close_section(); // osd_clean_epochs
5220
7c673cae
FG
5221 f->dump_unsigned("osdmap_first_committed", get_first_committed());
5222 f->dump_unsigned("osdmap_last_committed", get_last_committed());
5223
5224 f->open_object_section("crushmap");
5225 osdmap.crush->dump(f);
5226 f->close_section();
11fdf7f2
TL
5227
5228 if (has_osdmap_manifest) {
5229 f->open_object_section("osdmap_manifest");
5230 osdmap_manifest.dump(f);
5231 f->close_section();
5232 }
7c673cae
FG
5233}
5234
namespace {
  // Option keys recognized by the "osd pool get" command handling below.
  // The enumerator list must stay in sync with the command's help text.
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
    PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
    PG_AUTOSCALE_BIAS };

  // Return the set difference first \ second: every choice present in
  // `first` that does not appear in `second`.
  std::set<osd_pool_get_choices>
    subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			       const std::set<osd_pool_get_choices>& second)
    {
      std::set<osd_pool_get_choices> result;
      for (auto choice : first) {
	if (second.count(choice) == 0) {
	  result.insert(choice);
	}
      }
      return result;
    }
}
5268
5269
5270bool OSDMonitor::preprocess_command(MonOpRequestRef op)
5271{
5272 op->mark_osdmon_event(__func__);
9f95a23c 5273 auto m = op->get_req<MMonCommand>();
7c673cae
FG
5274 int r = 0;
5275 bufferlist rdata;
5276 stringstream ss, ds;
5277
11fdf7f2 5278 cmdmap_t cmdmap;
7c673cae
FG
5279 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
5280 string rs = ss.str();
5281 mon->reply_command(op, -EINVAL, rs, get_last_committed());
5282 return true;
5283 }
5284
11fdf7f2 5285 MonSession *session = op->get_session();
7c673cae 5286 if (!session) {
11fdf7f2 5287 derr << __func__ << " no session" << dendl;
7c673cae
FG
5288 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
5289 return true;
5290 }
5291
5292 string prefix;
9f95a23c 5293 cmd_getval(cmdmap, "prefix", prefix);
7c673cae
FG
5294
5295 string format;
9f95a23c 5296 cmd_getval(cmdmap, "format", format, string("plain"));
7c673cae
FG
5297 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5298
5299 if (prefix == "osd stat") {
92f5a8d4
TL
5300 if (f) {
5301 f->open_object_section("osdmap");
5302 osdmap.print_summary(f.get(), ds, "", true);
5303 f->close_section();
7c673cae 5304 f->flush(rdata);
92f5a8d4
TL
5305 } else {
5306 osdmap.print_summary(nullptr, ds, "", true);
7c673cae 5307 rdata.append(ds);
92f5a8d4 5308 }
7c673cae 5309 }
7c673cae
FG
5310 else if (prefix == "osd dump" ||
5311 prefix == "osd tree" ||
11fdf7f2 5312 prefix == "osd tree-from" ||
7c673cae
FG
5313 prefix == "osd ls" ||
5314 prefix == "osd getmap" ||
31f18b77 5315 prefix == "osd getcrushmap" ||
9f95a23c
TL
5316 prefix == "osd ls-tree" ||
5317 prefix == "osd info") {
7c673cae
FG
5318 string val;
5319
5320 epoch_t epoch = 0;
5321 int64_t epochnum;
9f95a23c 5322 cmd_getval(cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
7c673cae
FG
5323 epoch = epochnum;
5324
5325 bufferlist osdmap_bl;
5326 int err = get_version_full(epoch, osdmap_bl);
5327 if (err == -ENOENT) {
5328 r = -ENOENT;
5329 ss << "there is no map for epoch " << epoch;
5330 goto reply;
5331 }
11fdf7f2
TL
5332 ceph_assert(err == 0);
5333 ceph_assert(osdmap_bl.length());
7c673cae
FG
5334
5335 OSDMap *p;
5336 if (epoch == osdmap.get_epoch()) {
5337 p = &osdmap;
5338 } else {
5339 p = new OSDMap;
5340 p->decode(osdmap_bl);
5341 }
5342
224ce89b
WB
5343 auto sg = make_scope_guard([&] {
5344 if (p != &osdmap) {
5345 delete p;
5346 }
5347 });
5348
7c673cae
FG
5349 if (prefix == "osd dump") {
5350 stringstream ds;
5351 if (f) {
5352 f->open_object_section("osdmap");
5353 p->dump(f.get());
5354 f->close_section();
5355 f->flush(ds);
5356 } else {
5357 p->print(ds);
5358 }
5359 rdata.append(ds);
5360 if (!f)
5361 ds << " ";
5362 } else if (prefix == "osd ls") {
5363 if (f) {
5364 f->open_array_section("osds");
5365 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5366 if (osdmap.exists(i)) {
5367 f->dump_int("osd", i);
5368 }
5369 }
5370 f->close_section();
5371 f->flush(ds);
5372 } else {
5373 bool first = true;
5374 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5375 if (osdmap.exists(i)) {
5376 if (!first)
5377 ds << "\n";
5378 first = false;
5379 ds << i;
5380 }
5381 }
5382 }
5383 rdata.append(ds);
9f95a23c
TL
5384 } else if (prefix == "osd info") {
5385 int64_t osd_id;
5386 bool do_single_osd = true;
5387 if (!cmd_getval(cmdmap, "id", osd_id)) {
5388 do_single_osd = false;
5389 }
5390
5391 if (do_single_osd && !osdmap.exists(osd_id)) {
5392 ss << "osd." << osd_id << " does not exist";
5393 r = -EINVAL;
5394 goto reply;
5395 }
5396
5397 if (f) {
5398 if (do_single_osd) {
5399 osdmap.dump_osd(osd_id, f.get());
5400 } else {
5401 osdmap.dump_osds(f.get());
5402 }
5403 f->flush(ds);
5404 } else {
5405 if (do_single_osd) {
5406 osdmap.print_osd(osd_id, ds);
5407 } else {
5408 osdmap.print_osds(ds);
5409 }
5410 }
5411 rdata.append(ds);
11fdf7f2
TL
5412 } else if (prefix == "osd tree" || prefix == "osd tree-from") {
5413 string bucket;
5414 if (prefix == "osd tree-from") {
9f95a23c 5415 cmd_getval(cmdmap, "bucket", bucket);
11fdf7f2
TL
5416 if (!osdmap.crush->name_exists(bucket)) {
5417 ss << "bucket '" << bucket << "' does not exist";
5418 r = -ENOENT;
5419 goto reply;
5420 }
5421 int id = osdmap.crush->get_item_id(bucket);
5422 if (id >= 0) {
5423 ss << "\"" << bucket << "\" is not a bucket";
5424 r = -EINVAL;
5425 goto reply;
5426 }
5427 }
5428
31f18b77 5429 vector<string> states;
9f95a23c 5430 cmd_getval(cmdmap, "states", states);
31f18b77
FG
5431 unsigned filter = 0;
5432 for (auto& s : states) {
5433 if (s == "up") {
5434 filter |= OSDMap::DUMP_UP;
5435 } else if (s == "down") {
5436 filter |= OSDMap::DUMP_DOWN;
5437 } else if (s == "in") {
5438 filter |= OSDMap::DUMP_IN;
5439 } else if (s == "out") {
5440 filter |= OSDMap::DUMP_OUT;
c07f9fc5
FG
5441 } else if (s == "destroyed") {
5442 filter |= OSDMap::DUMP_DESTROYED;
31f18b77
FG
5443 } else {
5444 ss << "unrecognized state '" << s << "'";
5445 r = -EINVAL;
5446 goto reply;
5447 }
5448 }
5449 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
c07f9fc5
FG
5450 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
5451 ss << "cannot specify both 'in' and 'out'";
5452 r = -EINVAL;
5453 goto reply;
5454 }
5455 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
5456 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
5457 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
5458 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
5459 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
5460 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
5461 ss << "can specify only one of 'up', 'down' and 'destroyed'";
31f18b77
FG
5462 r = -EINVAL;
5463 goto reply;
5464 }
7c673cae
FG
5465 if (f) {
5466 f->open_object_section("tree");
11fdf7f2 5467 p->print_tree(f.get(), NULL, filter, bucket);
7c673cae
FG
5468 f->close_section();
5469 f->flush(ds);
5470 } else {
11fdf7f2 5471 p->print_tree(NULL, &ds, filter, bucket);
7c673cae
FG
5472 }
5473 rdata.append(ds);
5474 } else if (prefix == "osd getmap") {
5475 rdata.append(osdmap_bl);
5476 ss << "got osdmap epoch " << p->get_epoch();
5477 } else if (prefix == "osd getcrushmap") {
5478 p->crush->encode(rdata, mon->get_quorum_con_features());
31f18b77
FG
5479 ss << p->get_crush_version();
5480 } else if (prefix == "osd ls-tree") {
5481 string bucket_name;
9f95a23c 5482 cmd_getval(cmdmap, "name", bucket_name);
31f18b77
FG
5483 set<int> osds;
5484 r = p->get_osds_by_bucket_name(bucket_name, &osds);
5485 if (r == -ENOENT) {
5486 ss << "\"" << bucket_name << "\" does not exist";
5487 goto reply;
5488 } else if (r < 0) {
5489 ss << "can not parse bucket name:\"" << bucket_name << "\"";
5490 goto reply;
5491 }
5492
5493 if (f) {
5494 f->open_array_section("osds");
5495 for (auto &i : osds) {
5496 if (osdmap.exists(i)) {
5497 f->dump_int("osd", i);
5498 }
5499 }
5500 f->close_section();
5501 f->flush(ds);
5502 } else {
5503 bool first = true;
5504 for (auto &i : osds) {
5505 if (osdmap.exists(i)) {
5506 if (!first)
5507 ds << "\n";
5508 first = false;
5509 ds << i;
5510 }
5511 }
5512 }
5513
5514 rdata.append(ds);
7c673cae 5515 }
7c673cae
FG
5516 } else if (prefix == "osd getmaxosd") {
5517 if (f) {
5518 f->open_object_section("getmaxosd");
5519 f->dump_unsigned("epoch", osdmap.get_epoch());
5520 f->dump_int("max_osd", osdmap.get_max_osd());
5521 f->close_section();
5522 f->flush(rdata);
5523 } else {
5524 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
5525 rdata.append(ds);
5526 }
5527 } else if (prefix == "osd utilization") {
5528 string out;
5529 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
5530 if (f)
5531 f->flush(rdata);
5532 else
5533 rdata.append(out);
5534 r = 0;
5535 goto reply;
5536 } else if (prefix == "osd find") {
5537 int64_t osd;
9f95a23c 5538 if (!cmd_getval(cmdmap, "id", osd)) {
7c673cae
FG
5539 ss << "unable to parse osd id value '"
5540 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5541 r = -EINVAL;
5542 goto reply;
5543 }
5544 if (!osdmap.exists(osd)) {
5545 ss << "osd." << osd << " does not exist";
5546 r = -ENOENT;
5547 goto reply;
5548 }
5549 string format;
9f95a23c 5550 cmd_getval(cmdmap, "format", format);
7c673cae
FG
5551 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5552 f->open_object_section("osd_location");
5553 f->dump_int("osd", osd);
11fdf7f2 5554 f->dump_object("addrs", osdmap.get_addrs(osd));
f64942e4 5555 f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
11fdf7f2
TL
5556
5557 // try to identify host, pod/container name, etc.
5558 map<string,string> m;
5559 load_metadata(osd, m, nullptr);
5560 if (auto p = m.find("hostname"); p != m.end()) {
5561 f->dump_string("host", p->second);
5562 }
5563 for (auto& k : {
5564 "pod_name", "pod_namespace", // set by rook
9f95a23c 5565 "container_name" // set by cephadm, ceph-ansible
11fdf7f2
TL
5566 }) {
5567 if (auto p = m.find(k); p != m.end()) {
5568 f->dump_string(k, p->second);
5569 }
5570 }
5571
5572 // crush is helpful too
7c673cae
FG
5573 f->open_object_section("crush_location");
5574 map<string,string> loc = osdmap.crush->get_full_location(osd);
5575 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
5576 f->dump_string(p->first.c_str(), p->second);
5577 f->close_section();
5578 f->close_section();
5579 f->flush(rdata);
5580 } else if (prefix == "osd metadata") {
5581 int64_t osd = -1;
5582 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
9f95a23c 5583 !cmd_getval(cmdmap, "id", osd)) {
7c673cae
FG
5584 ss << "unable to parse osd id value '"
5585 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5586 r = -EINVAL;
5587 goto reply;
5588 }
5589 if (osd >= 0 && !osdmap.exists(osd)) {
5590 ss << "osd." << osd << " does not exist";
5591 r = -ENOENT;
5592 goto reply;
5593 }
5594 string format;
9f95a23c 5595 cmd_getval(cmdmap, "format", format);
7c673cae
FG
5596 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5597 if (osd >= 0) {
5598 f->open_object_section("osd_metadata");
5599 f->dump_unsigned("id", osd);
5600 r = dump_osd_metadata(osd, f.get(), &ss);
5601 if (r < 0)
5602 goto reply;
5603 f->close_section();
5604 } else {
5605 r = 0;
5606 f->open_array_section("osd_metadata");
5607 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5608 if (osdmap.exists(i)) {
5609 f->open_object_section("osd");
5610 f->dump_unsigned("id", i);
5611 r = dump_osd_metadata(i, f.get(), NULL);
5612 if (r == -EINVAL || r == -ENOENT) {
5613 // Drop error, continue to get other daemons' metadata
5614 dout(4) << "No metadata for osd." << i << dendl;
5615 r = 0;
5616 } else if (r < 0) {
5617 // Unexpected error
5618 goto reply;
5619 }
5620 f->close_section();
5621 }
5622 }
5623 f->close_section();
5624 }
5625 f->flush(rdata);
31f18b77
FG
5626 } else if (prefix == "osd versions") {
5627 if (!f)
5628 f.reset(Formatter::create("json-pretty"));
5629 count_metadata("ceph_version", f.get());
5630 f->flush(rdata);
5631 r = 0;
5632 } else if (prefix == "osd count-metadata") {
5633 if (!f)
5634 f.reset(Formatter::create("json-pretty"));
5635 string field;
9f95a23c 5636 cmd_getval(cmdmap, "property", field);
31f18b77
FG
5637 count_metadata(field, f.get());
5638 f->flush(rdata);
5639 r = 0;
11fdf7f2
TL
5640 } else if (prefix == "osd numa-status") {
5641 TextTable tbl;
5642 if (f) {
5643 f->open_array_section("osds");
5644 } else {
5645 tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT);
5646 tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT);
5647 tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT);
5648 tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT);
5649 tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT);
5650 tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT);
5651 }
5652 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5653 if (osdmap.exists(i)) {
5654 map<string,string> m;
5655 ostringstream err;
5656 if (load_metadata(i, m, &err) < 0) {
5657 continue;
5658 }
5659 string host;
5660 auto p = m.find("hostname");
5661 if (p != m.end()) {
5662 host = p->second;
5663 }
5664 if (f) {
5665 f->open_object_section("osd");
5666 f->dump_int("osd", i);
5667 f->dump_string("host", host);
5668 for (auto n : { "network_numa_node", "objectstore_numa_node",
5669 "numa_node" }) {
5670 p = m.find(n);
5671 if (p != m.end()) {
5672 f->dump_int(n, atoi(p->second.c_str()));
5673 }
5674 }
5675 for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) {
5676 p = m.find(n);
5677 if (p != m.end()) {
5678 list<string> ls = get_str_list(p->second, ",");
5679 f->open_array_section(n);
5680 for (auto node : ls) {
5681 f->dump_int("node", atoi(node.c_str()));
5682 }
5683 f->close_section();
5684 }
5685 }
5686 for (auto n : { "numa_node_cpus" }) {
5687 p = m.find(n);
5688 if (p != m.end()) {
5689 dump_cpu_list(f.get(), n, p->second);
5690 }
5691 }
5692 f->close_section();
5693 } else {
5694 tbl << i;
5695 tbl << host;
5696 p = m.find("network_numa_nodes");
5697 if (p != m.end()) {
5698 tbl << p->second;
5699 } else {
5700 tbl << "-";
5701 }
5702 p = m.find("objectstore_numa_nodes");
5703 if (p != m.end()) {
5704 tbl << p->second;
5705 } else {
5706 tbl << "-";
5707 }
5708 p = m.find("numa_node");
5709 auto q = m.find("numa_node_cpus");
5710 if (p != m.end() && q != m.end()) {
5711 tbl << p->second;
5712 tbl << q->second;
5713 } else {
5714 tbl << "-";
5715 tbl << "-";
5716 }
5717 tbl << TextTable::endrow;
5718 }
5719 }
5720 }
5721 if (f) {
5722 f->close_section();
5723 f->flush(rdata);
5724 } else {
5725 rdata.append(stringify(tbl));
5726 }
7c673cae
FG
5727 } else if (prefix == "osd map") {
5728 string poolstr, objstr, namespacestr;
9f95a23c
TL
5729 cmd_getval(cmdmap, "pool", poolstr);
5730 cmd_getval(cmdmap, "object", objstr);
5731 cmd_getval(cmdmap, "nspace", namespacestr);
7c673cae
FG
5732
5733 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5734 if (pool < 0) {
5735 ss << "pool " << poolstr << " does not exist";
5736 r = -ENOENT;
5737 goto reply;
5738 }
5739 object_locator_t oloc(pool, namespacestr);
5740 object_t oid(objstr);
5741 pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
5742 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5743 vector<int> up, acting;
5744 int up_p, acting_p;
5745 osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
5746
5747 string fullobjname;
5748 if (!namespacestr.empty())
5749 fullobjname = namespacestr + string("/") + oid.name;
5750 else
5751 fullobjname = oid.name;
5752 if (f) {
5753 f->open_object_section("osd_map");
5754 f->dump_unsigned("epoch", osdmap.get_epoch());
5755 f->dump_string("pool", poolstr);
5756 f->dump_int("pool_id", pool);
5757 f->dump_stream("objname") << fullobjname;
5758 f->dump_stream("raw_pgid") << pgid;
5759 f->dump_stream("pgid") << mpgid;
5760 f->open_array_section("up");
5761 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
5762 f->dump_int("osd", *p);
5763 f->close_section();
5764 f->dump_int("up_primary", up_p);
5765 f->open_array_section("acting");
5766 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
5767 f->dump_int("osd", *p);
5768 f->close_section();
5769 f->dump_int("acting_primary", acting_p);
5770 f->close_section(); // osd_map
5771 f->flush(rdata);
5772 } else {
5773 ds << "osdmap e" << osdmap.get_epoch()
5774 << " pool '" << poolstr << "' (" << pool << ")"
5775 << " object '" << fullobjname << "' ->"
5776 << " pg " << pgid << " (" << mpgid << ")"
5777 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
5778 << pg_vector_string(acting) << ", p" << acting_p << ")";
5779 rdata.append(ds);
5780 }
5781
5782 } else if (prefix == "pg map") {
5783 pg_t pgid;
5784 string pgidstr;
9f95a23c 5785 cmd_getval(cmdmap, "pgid", pgidstr);
7c673cae
FG
5786 if (!pgid.parse(pgidstr.c_str())) {
5787 ss << "invalid pgid '" << pgidstr << "'";
5788 r = -EINVAL;
5789 goto reply;
5790 }
5791 vector<int> up, acting;
5792 if (!osdmap.have_pg_pool(pgid.pool())) {
5793 ss << "pg '" << pgidstr << "' does not exist";
5794 r = -ENOENT;
5795 goto reply;
5796 }
5797 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5798 osdmap.pg_to_up_acting_osds(pgid, up, acting);
5799 if (f) {
5800 f->open_object_section("pg_map");
5801 f->dump_unsigned("epoch", osdmap.get_epoch());
5802 f->dump_stream("raw_pgid") << pgid;
5803 f->dump_stream("pgid") << mpgid;
5804 f->open_array_section("up");
5805 for (auto osd : up) {
5806 f->dump_int("up_osd", osd);
5807 }
5808 f->close_section();
5809 f->open_array_section("acting");
5810 for (auto osd : acting) {
5811 f->dump_int("acting_osd", osd);
5812 }
5813 f->close_section();
5814 f->close_section();
5815 f->flush(rdata);
5816 } else {
5817 ds << "osdmap e" << osdmap.get_epoch()
5818 << " pg " << pgid << " (" << mpgid << ")"
5819 << " -> up " << up << " acting " << acting;
5820 rdata.append(ds);
5821 }
5822 goto reply;
5823
7c673cae 5824 } else if (prefix == "osd lspools") {
7c673cae
FG
5825 if (f)
5826 f->open_array_section("pools");
5827 for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
5828 p != osdmap.pools.end();
5829 ++p) {
11fdf7f2
TL
5830 if (f) {
5831 f->open_object_section("pool");
5832 f->dump_int("poolnum", p->first);
5833 f->dump_string("poolname", osdmap.pool_name[p->first]);
5834 f->close_section();
5835 } else {
5836 ds << p->first << ' ' << osdmap.pool_name[p->first];
5837 if (next(p) != osdmap.pools.end()) {
5838 ds << '\n';
7c673cae
FG
5839 }
5840 }
5841 }
5842 if (f) {
5843 f->close_section();
5844 f->flush(ds);
5845 }
5846 rdata.append(ds);
5847 } else if (prefix == "osd blacklist ls") {
5848 if (f)
5849 f->open_array_section("blacklist");
5850
5851 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
5852 p != osdmap.blacklist.end();
5853 ++p) {
5854 if (f) {
5855 f->open_object_section("entry");
11fdf7f2 5856 f->dump_string("addr", p->first.get_legacy_str());
7c673cae
FG
5857 f->dump_stream("until") << p->second;
5858 f->close_section();
5859 } else {
5860 stringstream ss;
5861 string s;
5862 ss << p->first << " " << p->second;
5863 getline(ss, s);
5864 s += "\n";
5865 rdata.append(s);
5866 }
5867 }
5868 if (f) {
5869 f->close_section();
5870 f->flush(rdata);
5871 }
5872 ss << "listed " << osdmap.blacklist.size() << " entries";
5873
5874 } else if (prefix == "osd pool ls") {
5875 string detail;
9f95a23c 5876 cmd_getval(cmdmap, "detail", detail);
7c673cae
FG
5877 if (!f && detail == "detail") {
5878 ostringstream ss;
5879 osdmap.print_pools(ss);
5880 rdata.append(ss.str());
5881 } else {
5882 if (f)
5883 f->open_array_section("pools");
5884 for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
5885 it != osdmap.get_pools().end();
5886 ++it) {
5887 if (f) {
5888 if (detail == "detail") {
5889 f->open_object_section("pool");
eafe8130 5890 f->dump_int("pool_id", it->first);
7c673cae
FG
5891 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
5892 it->second.dump(f.get());
5893 f->close_section();
5894 } else {
5895 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
5896 }
5897 } else {
5898 rdata.append(osdmap.get_pool_name(it->first) + "\n");
5899 }
5900 }
5901 if (f) {
5902 f->close_section();
5903 f->flush(rdata);
5904 }
5905 }
5906
5907 } else if (prefix == "osd crush get-tunable") {
5908 string tunable;
9f95a23c 5909 cmd_getval(cmdmap, "tunable", tunable);
7c673cae
FG
5910 ostringstream rss;
5911 if (f)
5912 f->open_object_section("tunable");
5913 if (tunable == "straw_calc_version") {
5914 if (f)
5915 f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
5916 else
5917 rss << osdmap.crush->get_straw_calc_version() << "\n";
5918 } else {
5919 r = -EINVAL;
5920 goto reply;
5921 }
5922 if (f) {
5923 f->close_section();
5924 f->flush(rdata);
5925 } else {
5926 rdata.append(rss.str());
5927 }
5928 r = 0;
5929
5930 } else if (prefix == "osd pool get") {
5931 string poolstr;
9f95a23c 5932 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
5933 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5934 if (pool < 0) {
5935 ss << "unrecognized pool '" << poolstr << "'";
5936 r = -ENOENT;
5937 goto reply;
5938 }
5939
5940 const pg_pool_t *p = osdmap.get_pg_pool(pool);
5941 string var;
9f95a23c 5942 cmd_getval(cmdmap, "var", var);
7c673cae
FG
5943
5944 typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
5945 const choices_map_t ALL_CHOICES = {
5946 {"size", SIZE},
5947 {"min_size", MIN_SIZE},
7c673cae 5948 {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
28e407b8
AA
5949 {"crush_rule", CRUSH_RULE}, {"hashpspool", HASHPSPOOL},
5950 {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
7c673cae
FG
5951 {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
5952 {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
5953 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
5954 {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
5955 {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
5956 {"use_gmt_hitset", USE_GMT_HITSET},
11fdf7f2 5957 {"target_max_objects", TARGET_MAX_OBJECTS},
7c673cae
FG
5958 {"target_max_bytes", TARGET_MAX_BYTES},
5959 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
5960 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
5961 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
5962 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
5963 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
5964 {"erasure_code_profile", ERASURE_CODE_PROFILE},
5965 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
5966 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
5967 {"fast_read", FAST_READ},
5968 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
5969 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
5970 {"scrub_min_interval", SCRUB_MIN_INTERVAL},
5971 {"scrub_max_interval", SCRUB_MAX_INTERVAL},
5972 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
5973 {"recovery_priority", RECOVERY_PRIORITY},
5974 {"recovery_op_priority", RECOVERY_OP_PRIORITY},
5975 {"scrub_priority", SCRUB_PRIORITY},
5976 {"compression_mode", COMPRESSION_MODE},
5977 {"compression_algorithm", COMPRESSION_ALGORITHM},
5978 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
5979 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
5980 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
5981 {"csum_type", CSUM_TYPE},
5982 {"csum_max_block", CSUM_MAX_BLOCK},
5983 {"csum_min_block", CSUM_MIN_BLOCK},
11fdf7f2
TL
5984 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM},
5985 {"pg_autoscale_mode", PG_AUTOSCALE_MODE},
5986 {"pg_num_min", PG_NUM_MIN},
5987 {"target_size_bytes", TARGET_SIZE_BYTES},
5988 {"target_size_ratio", TARGET_SIZE_RATIO},
5989 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS},
7c673cae
FG
5990 };
5991
5992 typedef std::set<osd_pool_get_choices> choices_set_t;
5993
5994 const choices_set_t ONLY_TIER_CHOICES = {
5995 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
5996 TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
5997 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
5998 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
5999 MIN_READ_RECENCY_FOR_PROMOTE,
c07f9fc5 6000 MIN_WRITE_RECENCY_FOR_PROMOTE,
7c673cae
FG
6001 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
6002 };
6003 const choices_set_t ONLY_ERASURE_CHOICES = {
28e407b8 6004 EC_OVERWRITES, ERASURE_CODE_PROFILE
7c673cae
FG
6005 };
6006
6007 choices_set_t selected_choices;
6008 if (var == "all") {
6009 for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
6010 it != ALL_CHOICES.end(); ++it) {
6011 selected_choices.insert(it->second);
6012 }
6013
6014 if(!p->is_tier()) {
6015 selected_choices = subtract_second_from_first(selected_choices,
6016 ONLY_TIER_CHOICES);
6017 }
6018
6019 if(!p->is_erasure()) {
6020 selected_choices = subtract_second_from_first(selected_choices,
6021 ONLY_ERASURE_CHOICES);
6022 }
6023 } else /* var != "all" */ {
6024 choices_map_t::const_iterator found = ALL_CHOICES.find(var);
6025 osd_pool_get_choices selected = found->second;
6026
6027 if (!p->is_tier() &&
6028 ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
6029 ss << "pool '" << poolstr
6030 << "' is not a tier pool: variable not applicable";
6031 r = -EACCES;
6032 goto reply;
6033 }
6034
6035 if (!p->is_erasure() &&
6036 ONLY_ERASURE_CHOICES.find(selected)
6037 != ONLY_ERASURE_CHOICES.end()) {
6038 ss << "pool '" << poolstr
6039 << "' is not a erasure pool: variable not applicable";
6040 r = -EACCES;
6041 goto reply;
6042 }
6043
94b18763
FG
6044 if (pool_opts_t::is_opt_name(var) &&
6045 !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
6046 ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
6047 r = -ENOENT;
6048 goto reply;
6049 }
6050
7c673cae
FG
6051 selected_choices.insert(selected);
6052 }
6053
6054 if (f) {
94b18763
FG
6055 f->open_object_section("pool");
6056 f->dump_string("pool", poolstr);
6057 f->dump_int("pool_id", pool);
7c673cae
FG
6058 for(choices_set_t::const_iterator it = selected_choices.begin();
6059 it != selected_choices.end(); ++it) {
6060 choices_map_t::const_iterator i;
c07f9fc5
FG
6061 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6062 if (i->second == *it) {
6063 break;
6064 }
6065 }
11fdf7f2 6066 ceph_assert(i != ALL_CHOICES.end());
7c673cae
FG
6067 switch(*it) {
6068 case PG_NUM:
6069 f->dump_int("pg_num", p->get_pg_num());
6070 break;
6071 case PGP_NUM:
6072 f->dump_int("pgp_num", p->get_pgp_num());
6073 break;
7c673cae
FG
6074 case SIZE:
6075 f->dump_int("size", p->get_size());
6076 break;
6077 case MIN_SIZE:
6078 f->dump_int("min_size", p->get_min_size());
6079 break;
7c673cae 6080 case CRUSH_RULE:
31f18b77 6081 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
7c673cae 6082 f->dump_string("crush_rule", osdmap.crush->get_rule_name(
31f18b77 6083 p->get_crush_rule()));
7c673cae 6084 } else {
31f18b77 6085 f->dump_string("crush_rule", stringify(p->get_crush_rule()));
7c673cae
FG
6086 }
6087 break;
28e407b8
AA
6088 case EC_OVERWRITES:
6089 f->dump_bool("allow_ec_overwrites",
6090 p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
6091 break;
11fdf7f2
TL
6092 case PG_AUTOSCALE_MODE:
6093 f->dump_string("pg_autoscale_mode",
6094 pg_pool_t::get_pg_autoscale_mode_name(
6095 p->pg_autoscale_mode));
6096 break;
7c673cae
FG
6097 case HASHPSPOOL:
6098 case NODELETE:
6099 case NOPGCHANGE:
6100 case NOSIZECHANGE:
6101 case WRITE_FADVISE_DONTNEED:
6102 case NOSCRUB:
6103 case NODEEP_SCRUB:
94b18763
FG
6104 f->dump_bool(i->first.c_str(),
6105 p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
7c673cae
FG
6106 break;
6107 case HIT_SET_PERIOD:
6108 f->dump_int("hit_set_period", p->hit_set_period);
6109 break;
6110 case HIT_SET_COUNT:
6111 f->dump_int("hit_set_count", p->hit_set_count);
6112 break;
6113 case HIT_SET_TYPE:
6114 f->dump_string("hit_set_type",
6115 HitSet::get_type_name(p->hit_set_params.get_type()));
6116 break;
6117 case HIT_SET_FPP:
6118 {
6119 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6120 BloomHitSet::Params *bloomp =
6121 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6122 f->dump_float("hit_set_fpp", bloomp->get_fpp());
6123 } else if(var != "all") {
6124 f->close_section();
6125 ss << "hit set is not of type Bloom; " <<
6126 "invalid to get a false positive rate!";
6127 r = -EINVAL;
6128 goto reply;
6129 }
6130 }
6131 break;
6132 case USE_GMT_HITSET:
6133 f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
6134 break;
6135 case TARGET_MAX_OBJECTS:
6136 f->dump_unsigned("target_max_objects", p->target_max_objects);
6137 break;
6138 case TARGET_MAX_BYTES:
6139 f->dump_unsigned("target_max_bytes", p->target_max_bytes);
6140 break;
6141 case CACHE_TARGET_DIRTY_RATIO:
6142 f->dump_unsigned("cache_target_dirty_ratio_micro",
6143 p->cache_target_dirty_ratio_micro);
6144 f->dump_float("cache_target_dirty_ratio",
6145 ((float)p->cache_target_dirty_ratio_micro/1000000));
6146 break;
6147 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6148 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
6149 p->cache_target_dirty_high_ratio_micro);
6150 f->dump_float("cache_target_dirty_high_ratio",
6151 ((float)p->cache_target_dirty_high_ratio_micro/1000000));
6152 break;
6153 case CACHE_TARGET_FULL_RATIO:
6154 f->dump_unsigned("cache_target_full_ratio_micro",
6155 p->cache_target_full_ratio_micro);
6156 f->dump_float("cache_target_full_ratio",
6157 ((float)p->cache_target_full_ratio_micro/1000000));
6158 break;
6159 case CACHE_MIN_FLUSH_AGE:
6160 f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
6161 break;
6162 case CACHE_MIN_EVICT_AGE:
6163 f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
6164 break;
6165 case ERASURE_CODE_PROFILE:
6166 f->dump_string("erasure_code_profile", p->erasure_code_profile);
6167 break;
6168 case MIN_READ_RECENCY_FOR_PROMOTE:
6169 f->dump_int("min_read_recency_for_promote",
6170 p->min_read_recency_for_promote);
6171 break;
6172 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6173 f->dump_int("min_write_recency_for_promote",
6174 p->min_write_recency_for_promote);
6175 break;
6176 case FAST_READ:
6177 f->dump_int("fast_read", p->fast_read);
6178 break;
6179 case HIT_SET_GRADE_DECAY_RATE:
6180 f->dump_int("hit_set_grade_decay_rate",
6181 p->hit_set_grade_decay_rate);
6182 break;
6183 case HIT_SET_SEARCH_LAST_N:
6184 f->dump_int("hit_set_search_last_n",
6185 p->hit_set_search_last_n);
6186 break;
6187 case SCRUB_MIN_INTERVAL:
6188 case SCRUB_MAX_INTERVAL:
6189 case DEEP_SCRUB_INTERVAL:
6190 case RECOVERY_PRIORITY:
6191 case RECOVERY_OP_PRIORITY:
6192 case SCRUB_PRIORITY:
6193 case COMPRESSION_MODE:
6194 case COMPRESSION_ALGORITHM:
6195 case COMPRESSION_REQUIRED_RATIO:
6196 case COMPRESSION_MAX_BLOB_SIZE:
6197 case COMPRESSION_MIN_BLOB_SIZE:
6198 case CSUM_TYPE:
6199 case CSUM_MAX_BLOCK:
6200 case CSUM_MIN_BLOCK:
11fdf7f2
TL
6201 case FINGERPRINT_ALGORITHM:
6202 case PG_NUM_MIN:
6203 case TARGET_SIZE_BYTES:
6204 case TARGET_SIZE_RATIO:
6205 case PG_AUTOSCALE_BIAS:
c07f9fc5
FG
6206 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6207 if (p->opts.is_set(key)) {
c07f9fc5 6208 if(*it == CSUM_TYPE) {
11fdf7f2 6209 int64_t val;
c07f9fc5
FG
6210 p->opts.get(pool_opts_t::CSUM_TYPE, &val);
6211 f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
6212 } else {
6213 p->opts.dump(i->first, f.get());
6214 }
94b18763 6215 }
7c673cae
FG
6216 break;
6217 }
7c673cae 6218 }
94b18763
FG
6219 f->close_section();
6220 f->flush(rdata);
7c673cae
FG
6221 } else /* !f */ {
6222 for(choices_set_t::const_iterator it = selected_choices.begin();
6223 it != selected_choices.end(); ++it) {
6224 choices_map_t::const_iterator i;
6225 switch(*it) {
6226 case PG_NUM:
6227 ss << "pg_num: " << p->get_pg_num() << "\n";
6228 break;
6229 case PGP_NUM:
6230 ss << "pgp_num: " << p->get_pgp_num() << "\n";
6231 break;
7c673cae
FG
6232 case SIZE:
6233 ss << "size: " << p->get_size() << "\n";
6234 break;
6235 case MIN_SIZE:
6236 ss << "min_size: " << p->get_min_size() << "\n";
6237 break;
7c673cae 6238 case CRUSH_RULE:
31f18b77 6239 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
7c673cae 6240 ss << "crush_rule: " << osdmap.crush->get_rule_name(
31f18b77 6241 p->get_crush_rule()) << "\n";
7c673cae 6242 } else {
31f18b77 6243 ss << "crush_rule: " << p->get_crush_rule() << "\n";
7c673cae
FG
6244 }
6245 break;
11fdf7f2
TL
6246 case PG_AUTOSCALE_MODE:
6247 ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
6248 p->pg_autoscale_mode) <<"\n";
6249 break;
7c673cae
FG
6250 case HIT_SET_PERIOD:
6251 ss << "hit_set_period: " << p->hit_set_period << "\n";
6252 break;
6253 case HIT_SET_COUNT:
6254 ss << "hit_set_count: " << p->hit_set_count << "\n";
6255 break;
6256 case HIT_SET_TYPE:
6257 ss << "hit_set_type: " <<
6258 HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
6259 break;
6260 case HIT_SET_FPP:
6261 {
6262 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6263 BloomHitSet::Params *bloomp =
6264 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6265 ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
6266 } else if(var != "all") {
6267 ss << "hit set is not of type Bloom; " <<
6268 "invalid to get a false positive rate!";
6269 r = -EINVAL;
6270 goto reply;
6271 }
6272 }
6273 break;
6274 case USE_GMT_HITSET:
6275 ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
6276 break;
6277 case TARGET_MAX_OBJECTS:
6278 ss << "target_max_objects: " << p->target_max_objects << "\n";
6279 break;
6280 case TARGET_MAX_BYTES:
6281 ss << "target_max_bytes: " << p->target_max_bytes << "\n";
6282 break;
6283 case CACHE_TARGET_DIRTY_RATIO:
6284 ss << "cache_target_dirty_ratio: "
6285 << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
6286 break;
6287 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6288 ss << "cache_target_dirty_high_ratio: "
6289 << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
6290 break;
6291 case CACHE_TARGET_FULL_RATIO:
6292 ss << "cache_target_full_ratio: "
6293 << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
6294 break;
6295 case CACHE_MIN_FLUSH_AGE:
6296 ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
6297 break;
6298 case CACHE_MIN_EVICT_AGE:
6299 ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
6300 break;
6301 case ERASURE_CODE_PROFILE:
6302 ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
6303 break;
6304 case MIN_READ_RECENCY_FOR_PROMOTE:
6305 ss << "min_read_recency_for_promote: " <<
6306 p->min_read_recency_for_promote << "\n";
6307 break;
6308 case HIT_SET_GRADE_DECAY_RATE:
6309 ss << "hit_set_grade_decay_rate: " <<
6310 p->hit_set_grade_decay_rate << "\n";
6311 break;
6312 case HIT_SET_SEARCH_LAST_N:
6313 ss << "hit_set_search_last_n: " <<
6314 p->hit_set_search_last_n << "\n";
6315 break;
28e407b8
AA
6316 case EC_OVERWRITES:
6317 ss << "allow_ec_overwrites: " <<
6318 (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
6319 "\n";
6320 break;
7c673cae
FG
6321 case HASHPSPOOL:
6322 case NODELETE:
6323 case NOPGCHANGE:
6324 case NOSIZECHANGE:
6325 case WRITE_FADVISE_DONTNEED:
6326 case NOSCRUB:
6327 case NODEEP_SCRUB:
6328 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6329 if (i->second == *it)
6330 break;
6331 }
11fdf7f2 6332 ceph_assert(i != ALL_CHOICES.end());
7c673cae
FG
6333 ss << i->first << ": " <<
6334 (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
6335 "true" : "false") << "\n";
6336 break;
6337 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6338 ss << "min_write_recency_for_promote: " <<
6339 p->min_write_recency_for_promote << "\n";
6340 break;
6341 case FAST_READ:
6342 ss << "fast_read: " << p->fast_read << "\n";
6343 break;
6344 case SCRUB_MIN_INTERVAL:
6345 case SCRUB_MAX_INTERVAL:
6346 case DEEP_SCRUB_INTERVAL:
6347 case RECOVERY_PRIORITY:
6348 case RECOVERY_OP_PRIORITY:
6349 case SCRUB_PRIORITY:
6350 case COMPRESSION_MODE:
6351 case COMPRESSION_ALGORITHM:
6352 case COMPRESSION_REQUIRED_RATIO:
6353 case COMPRESSION_MAX_BLOB_SIZE:
6354 case COMPRESSION_MIN_BLOB_SIZE:
6355 case CSUM_TYPE:
6356 case CSUM_MAX_BLOCK:
6357 case CSUM_MIN_BLOCK:
11fdf7f2
TL
6358 case FINGERPRINT_ALGORITHM:
6359 case PG_NUM_MIN:
6360 case TARGET_SIZE_BYTES:
6361 case TARGET_SIZE_RATIO:
6362 case PG_AUTOSCALE_BIAS:
7c673cae
FG
6363 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6364 if (i->second == *it)
6365 break;
6366 }
11fdf7f2 6367 ceph_assert(i != ALL_CHOICES.end());
7c673cae
FG
6368 {
6369 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6370 if (p->opts.is_set(key)) {
6371 if(key == pool_opts_t::CSUM_TYPE) {
11fdf7f2 6372 int64_t val;
7c673cae
FG
6373 p->opts.get(key, &val);
6374 ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
6375 } else {
6376 ss << i->first << ": " << p->opts.get(key) << "\n";
6377 }
6378 }
6379 }
6380 break;
6381 }
6382 rdata.append(ss.str());
6383 ss.str("");
6384 }
6385 }
6386 r = 0;
7c673cae
FG
6387 } else if (prefix == "osd pool get-quota") {
6388 string pool_name;
9f95a23c 6389 cmd_getval(cmdmap, "pool", pool_name);
7c673cae
FG
6390
6391 int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
6392 if (poolid < 0) {
11fdf7f2 6393 ceph_assert(poolid == -ENOENT);
7c673cae
FG
6394 ss << "unrecognized pool '" << pool_name << "'";
6395 r = -ENOENT;
6396 goto reply;
6397 }
6398 const pg_pool_t *p = osdmap.get_pg_pool(poolid);
9f95a23c
TL
6399 const pool_stat_t* pstat = mon->mgrstatmon()->get_pool_stat(poolid);
6400 const object_stat_sum_t& sum = pstat->stats.sum;
7c673cae
FG
6401 if (f) {
6402 f->open_object_section("pool_quotas");
6403 f->dump_string("pool_name", pool_name);
6404 f->dump_unsigned("pool_id", poolid);
6405 f->dump_unsigned("quota_max_objects", p->quota_max_objects);
9f95a23c 6406 f->dump_int("current_num_objects", sum.num_objects);
7c673cae 6407 f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
9f95a23c 6408 f->dump_int("current_num_bytes", sum.num_bytes);
7c673cae
FG
6409 f->close_section();
6410 f->flush(rdata);
6411 } else {
6412 stringstream rs;
6413 rs << "quotas for pool '" << pool_name << "':\n"
6414 << " max objects: ";
6415 if (p->quota_max_objects == 0)
6416 rs << "N/A";
9f95a23c 6417 else {
1adf2230 6418 rs << si_u_t(p->quota_max_objects) << " objects";
9f95a23c
TL
6419 rs << " (current num objects: " << sum.num_objects << " objects)";
6420 }
7c673cae
FG
6421 rs << "\n"
6422 << " max bytes : ";
6423 if (p->quota_max_bytes == 0)
6424 rs << "N/A";
9f95a23c 6425 else {
1adf2230 6426 rs << byte_u_t(p->quota_max_bytes);
9f95a23c
TL
6427 rs << " (current num bytes: " << sum.num_bytes << " bytes)";
6428 }
7c673cae
FG
6429 rdata.append(rs.str());
6430 }
6431 rdata.append("\n");
6432 r = 0;
6433 } else if (prefix == "osd crush rule list" ||
6434 prefix == "osd crush rule ls") {
c07f9fc5
FG
6435 if (f) {
6436 f->open_array_section("rules");
6437 osdmap.crush->list_rules(f.get());
6438 f->close_section();
6439 f->flush(rdata);
6440 } else {
6441 ostringstream ss;
6442 osdmap.crush->list_rules(&ss);
6443 rdata.append(ss.str());
6444 }
b5b8bbf5
FG
6445 } else if (prefix == "osd crush rule ls-by-class") {
6446 string class_name;
9f95a23c 6447 cmd_getval(cmdmap, "class", class_name);
b5b8bbf5
FG
6448 if (class_name.empty()) {
6449 ss << "no class specified";
6450 r = -EINVAL;
6451 goto reply;
6452 }
6453 set<int> rules;
6454 r = osdmap.crush->get_rules_by_class(class_name, &rules);
6455 if (r < 0) {
6456 ss << "failed to get rules by class '" << class_name << "'";
6457 goto reply;
6458 }
6459 if (f) {
6460 f->open_array_section("rules");
6461 for (auto &rule: rules) {
6462 f->dump_string("name", osdmap.crush->get_rule_name(rule));
6463 }
6464 f->close_section();
6465 f->flush(rdata);
6466 } else {
6467 ostringstream rs;
6468 for (auto &rule: rules) {
6469 rs << osdmap.crush->get_rule_name(rule) << "\n";
6470 }
6471 rdata.append(rs.str());
6472 }
7c673cae
FG
6473 } else if (prefix == "osd crush rule dump") {
6474 string name;
9f95a23c 6475 cmd_getval(cmdmap, "name", name);
7c673cae 6476 string format;
9f95a23c 6477 cmd_getval(cmdmap, "format", format);
7c673cae
FG
6478 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6479 if (name == "") {
6480 f->open_array_section("rules");
6481 osdmap.crush->dump_rules(f.get());
6482 f->close_section();
6483 } else {
6484 int ruleno = osdmap.crush->get_rule_id(name);
6485 if (ruleno < 0) {
31f18b77 6486 ss << "unknown crush rule '" << name << "'";
7c673cae
FG
6487 r = ruleno;
6488 goto reply;
6489 }
6490 osdmap.crush->dump_rule(ruleno, f.get());
6491 }
6492 ostringstream rs;
6493 f->flush(rs);
6494 rs << "\n";
6495 rdata.append(rs.str());
6496 } else if (prefix == "osd crush dump") {
6497 string format;
9f95a23c 6498 cmd_getval(cmdmap, "format", format);
7c673cae
FG
6499 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6500 f->open_object_section("crush_map");
6501 osdmap.crush->dump(f.get());
6502 f->close_section();
6503 ostringstream rs;
6504 f->flush(rs);
6505 rs << "\n";
6506 rdata.append(rs.str());
6507 } else if (prefix == "osd crush show-tunables") {
6508 string format;
9f95a23c 6509 cmd_getval(cmdmap, "format", format);
7c673cae
FG
6510 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6511 f->open_object_section("crush_map_tunables");
6512 osdmap.crush->dump_tunables(f.get());
6513 f->close_section();
6514 ostringstream rs;
6515 f->flush(rs);
6516 rs << "\n";
6517 rdata.append(rs.str());
6518 } else if (prefix == "osd crush tree") {
c07f9fc5 6519 string shadow;
9f95a23c 6520 cmd_getval(cmdmap, "shadow", shadow);
c07f9fc5
FG
6521 bool show_shadow = shadow == "--show-shadow";
6522 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6523 if (f) {
91327a77 6524 f->open_object_section("crush_tree");
c07f9fc5
FG
6525 osdmap.crush->dump_tree(nullptr,
6526 f.get(),
6527 osdmap.get_pool_names(),
6528 show_shadow);
91327a77 6529 f->close_section();
c07f9fc5
FG
6530 f->flush(rdata);
6531 } else {
6532 ostringstream ss;
6533 osdmap.crush->dump_tree(&ss,
6534 nullptr,
6535 osdmap.get_pool_names(),
6536 show_shadow);
6537 rdata.append(ss.str());
6538 }
d2e6a577
FG
6539 } else if (prefix == "osd crush ls") {
6540 string name;
9f95a23c 6541 if (!cmd_getval(cmdmap, "node", name)) {
d2e6a577
FG
6542 ss << "no node specified";
6543 r = -EINVAL;
6544 goto reply;
6545 }
6546 if (!osdmap.crush->name_exists(name)) {
6547 ss << "node '" << name << "' does not exist";
6548 r = -ENOENT;
6549 goto reply;
6550 }
6551 int id = osdmap.crush->get_item_id(name);
6552 list<int> result;
6553 if (id >= 0) {
6554 result.push_back(id);
6555 } else {
6556 int num = osdmap.crush->get_bucket_size(id);
6557 for (int i = 0; i < num; ++i) {
6558 result.push_back(osdmap.crush->get_bucket_item(id, i));
6559 }
6560 }
6561 if (f) {
6562 f->open_array_section("items");
6563 for (auto i : result) {
6564 f->dump_string("item", osdmap.crush->get_item_name(i));
6565 }
6566 f->close_section();
6567 f->flush(rdata);
6568 } else {
6569 ostringstream ss;
6570 for (auto i : result) {
6571 ss << osdmap.crush->get_item_name(i) << "\n";
6572 }
6573 rdata.append(ss.str());
6574 }
6575 r = 0;
7c673cae
FG
6576 } else if (prefix == "osd crush class ls") {
6577 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6578 f->open_array_section("crush_classes");
6579 for (auto i : osdmap.crush->class_name)
6580 f->dump_string("class", i.second);
6581 f->close_section();
6582 f->flush(rdata);
224ce89b
WB
6583 } else if (prefix == "osd crush class ls-osd") {
6584 string name;
9f95a23c 6585 cmd_getval(cmdmap, "class", name);
224ce89b
WB
6586 set<int> osds;
6587 osdmap.crush->get_devices_by_class(name, &osds);
b5b8bbf5
FG
6588 if (f) {
6589 f->open_array_section("osds");
6590 for (auto &osd: osds)
6591 f->dump_int("osd", osd);
6592 f->close_section();
6593 f->flush(rdata);
6594 } else {
6595 bool first = true;
6596 for (auto &osd : osds) {
6597 if (!first)
6598 ds << "\n";
6599 first = false;
6600 ds << osd;
6601 }
6602 rdata.append(ds);
6603 }
11fdf7f2
TL
6604 } else if (prefix == "osd crush get-device-class") {
6605 vector<string> idvec;
9f95a23c 6606 cmd_getval(cmdmap, "ids", idvec);
11fdf7f2
TL
6607 map<int, string> class_by_osd;
6608 for (auto& id : idvec) {
6609 ostringstream ts;
6610 long osd = parse_osd_id(id.c_str(), &ts);
6611 if (osd < 0) {
6612 ss << "unable to parse osd id:'" << id << "'";
6613 r = -EINVAL;
6614 goto reply;
6615 }
6616 auto device_class = osdmap.crush->get_item_class(osd);
6617 if (device_class)
6618 class_by_osd[osd] = device_class;
6619 else
6620 class_by_osd[osd] = ""; // no class
6621 }
6622 if (f) {
6623 f->open_array_section("osd_device_classes");
6624 for (auto& i : class_by_osd) {
6625 f->open_object_section("osd_device_class");
6626 f->dump_int("osd", i.first);
6627 f->dump_string("device_class", i.second);
6628 f->close_section();
6629 }
6630 f->close_section();
6631 f->flush(rdata);
6632 } else {
6633 if (class_by_osd.size() == 1) {
6634 // for single input, make a clean output
6635 ds << class_by_osd.begin()->second;
6636 } else {
6637 // note that we do not group osds by class here
6638 for (auto it = class_by_osd.begin();
6639 it != class_by_osd.end();
6640 it++) {
6641 ds << "osd." << it->first << ' ' << it->second;
6642 if (next(it) != class_by_osd.end())
6643 ds << '\n';
6644 }
6645 }
6646 rdata.append(ds);
6647 }
7c673cae
FG
6648 } else if (prefix == "osd erasure-code-profile ls") {
6649 const auto &profiles = osdmap.get_erasure_code_profiles();
6650 if (f)
6651 f->open_array_section("erasure-code-profiles");
6652 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
6653 if (f)
6654 f->dump_string("profile", i->first.c_str());
6655 else
6656 rdata.append(i->first + "\n");
6657 }
6658 if (f) {
6659 f->close_section();
6660 ostringstream rs;
6661 f->flush(rs);
6662 rs << "\n";
6663 rdata.append(rs.str());
6664 }
c07f9fc5
FG
6665 } else if (prefix == "osd crush weight-set ls") {
6666 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6667 if (f) {
6668 f->open_array_section("weight_sets");
6669 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6670 f->dump_string("pool", "(compat)");
6671 }
6672 for (auto& i : osdmap.crush->choose_args) {
6673 if (i.first >= 0) {
6674 f->dump_string("pool", osdmap.get_pool_name(i.first));
6675 }
6676 }
6677 f->close_section();
6678 f->flush(rdata);
6679 } else {
6680 ostringstream rs;
6681 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6682 rs << "(compat)\n";
6683 }
6684 for (auto& i : osdmap.crush->choose_args) {
6685 if (i.first >= 0) {
6686 rs << osdmap.get_pool_name(i.first) << "\n";
6687 }
6688 }
6689 rdata.append(rs.str());
6690 }
6691 } else if (prefix == "osd crush weight-set dump") {
6692 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6693 "json-pretty"));
6694 osdmap.crush->dump_choose_args(f.get());
6695 f->flush(rdata);
7c673cae
FG
6696 } else if (prefix == "osd erasure-code-profile get") {
6697 string name;
9f95a23c 6698 cmd_getval(cmdmap, "name", name);
7c673cae
FG
6699 if (!osdmap.has_erasure_code_profile(name)) {
6700 ss << "unknown erasure code profile '" << name << "'";
6701 r = -ENOENT;
6702 goto reply;
6703 }
6704 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
6705 if (f)
6706 f->open_object_section("profile");
6707 for (map<string,string>::const_iterator i = profile.begin();
6708 i != profile.end();
6709 ++i) {
6710 if (f)
6711 f->dump_string(i->first.c_str(), i->second.c_str());
6712 else
6713 rdata.append(i->first + "=" + i->second + "\n");
6714 }
6715 if (f) {
6716 f->close_section();
6717 ostringstream rs;
6718 f->flush(rs);
6719 rs << "\n";
6720 rdata.append(rs.str());
6721 }
181888fb
FG
6722 } else if (prefix == "osd pool application get") {
6723 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6724 "json-pretty"));
6725 string pool_name;
9f95a23c 6726 cmd_getval(cmdmap, "pool", pool_name);
181888fb 6727 string app;
9f95a23c 6728 cmd_getval(cmdmap, "app", app);
181888fb 6729 string key;
9f95a23c 6730 cmd_getval(cmdmap, "key", key);
181888fb
FG
6731
6732 if (pool_name.empty()) {
6733 // all
6734 f->open_object_section("pools");
6735 for (const auto &pool : osdmap.pools) {
6736 std::string name("<unknown>");
6737 const auto &pni = osdmap.pool_name.find(pool.first);
6738 if (pni != osdmap.pool_name.end())
6739 name = pni->second;
6740 f->open_object_section(name.c_str());
6741 for (auto &app_pair : pool.second.application_metadata) {
6742 f->open_object_section(app_pair.first.c_str());
6743 for (auto &kv_pair : app_pair.second) {
6744 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6745 }
6746 f->close_section();
6747 }
6748 f->close_section(); // name
6749 }
6750 f->close_section(); // pools
6751 f->flush(rdata);
6752 } else {
6753 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6754 if (pool < 0) {
6755 ss << "unrecognized pool '" << pool_name << "'";
6756 r = -ENOENT;
6757 goto reply;
6758 }
6759 auto p = osdmap.get_pg_pool(pool);
6760 // filter by pool
6761 if (app.empty()) {
6762 f->open_object_section(pool_name.c_str());
6763 for (auto &app_pair : p->application_metadata) {
6764 f->open_object_section(app_pair.first.c_str());
6765 for (auto &kv_pair : app_pair.second) {
6766 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6767 }
6768 f->close_section(); // application
6769 }
6770 f->close_section(); // pool_name
6771 f->flush(rdata);
6772 goto reply;
6773 }
6774
6775 auto app_it = p->application_metadata.find(app);
6776 if (app_it == p->application_metadata.end()) {
6777 ss << "pool '" << pool_name << "' has no application '" << app << "'";
6778 r = -ENOENT;
6779 goto reply;
6780 }
6781 // filter by pool + app
6782 if (key.empty()) {
6783 f->open_object_section(app_it->first.c_str());
6784 for (auto &kv_pair : app_it->second) {
6785 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6786 }
6787 f->close_section(); // application
6788 f->flush(rdata);
6789 goto reply;
6790 }
6791 // filter by pool + app + key
6792 auto key_it = app_it->second.find(key);
6793 if (key_it == app_it->second.end()) {
6794 ss << "application '" << app << "' on pool '" << pool_name
6795 << "' does not have key '" << key << "'";
6796 r = -ENOENT;
6797 goto reply;
6798 }
6799 ss << key_it->second << "\n";
6800 rdata.append(ss.str());
6801 ss.str("");
6802 }
11fdf7f2 6803 } else if (prefix == "osd get-require-min-compat-client") {
9f95a23c 6804 ss << osdmap.require_min_compat_client << std::endl;
11fdf7f2
TL
6805 rdata.append(ss.str());
6806 ss.str("");
6807 goto reply;
6808 } else if (prefix == "osd pool application enable" ||
6809 prefix == "osd pool application disable" ||
6810 prefix == "osd pool application set" ||
6811 prefix == "osd pool application rm") {
6812 bool changed = false;
6813 r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed);
6814 if (r != 0) {
6815 // Error, reply.
6816 goto reply;
6817 } else if (changed) {
6818 // Valid mutation, proceed to prepare phase
6819 return false;
6820 } else {
6821 // Idempotent case, reply
6822 goto reply;
6823 }
7c673cae
FG
6824 } else {
6825 // try prepare update
6826 return false;
6827 }
6828
6829 reply:
6830 string rs;
6831 getline(ss, rs);
6832 mon->reply_command(op, r, rs, rdata, get_last_committed());
6833 return true;
6834}
6835
3efd9988
FG
6836void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
6837{
6838 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6839 osdmap.get_pg_pool(pool_id));
11fdf7f2 6840 ceph_assert(pool);
3efd9988
FG
6841 pool->set_flag(flags);
6842}
6843
6844void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
7c673cae 6845{
3efd9988
FG
6846 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6847 osdmap.get_pg_pool(pool_id));
11fdf7f2 6848 ceph_assert(pool);
3efd9988 6849 pool->unset_flag(flags);
7c673cae
FG
6850}
6851
9f95a23c 6852string OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch)
11fdf7f2
TL
6853{
6854 char k[80];
9f95a23c 6855 snprintf(k, sizeof(k), "purged_epoch_%08lx", (unsigned long)epoch);
11fdf7f2
TL
6856 return k;
6857}
6858
9f95a23c 6859string OSDMonitor::make_purged_snap_key(int64_t pool, snapid_t snap)
11fdf7f2
TL
6860{
6861 char k[80];
9f95a23c 6862 snprintf(k, sizeof(k), "purged_snap_%llu_%016llx",
11fdf7f2
TL
6863 (unsigned long long)pool, (unsigned long long)snap);
6864 return k;
6865}
6866
// Serialize a purged-snap record for the interval [snap, snap+num) of
// `pool`: the value (*v) holds begin, end, and the epoch; the returned
// key is indexed by the interval's *last* snapid.  Do not reorder the
// encode() calls — they define the on-disk value format decoded in
// lookup_purged_snap().
string OSDMonitor::make_purged_snap_key_value(
  int64_t pool, snapid_t snap, snapid_t num,
  epoch_t epoch, bufferlist *v)
{
  // encode the *last* epoch in the key so that we can use forward
  // iteration only to search for an epoch in an interval.
  // NOTE(review): the key actually embeds the last *snapid* of the run
  // (snap + num - 1), not an epoch.
  encode(snap, *v);
  encode(snap + num, *v);
  encode(epoch, *v);
  return make_purged_snap_key(pool, snap + num - 1);
}
6878
11fdf7f2 6879
9f95a23c
TL
6880int OSDMonitor::lookup_purged_snap(
6881 int64_t pool, snapid_t snap,
6882 snapid_t *begin, snapid_t *end)
11fdf7f2 6883{
9f95a23c 6884 string k = make_purged_snap_key(pool, snap);
11fdf7f2
TL
6885 auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
6886 it->lower_bound(k);
6887 if (!it->valid()) {
9f95a23c
TL
6888 dout(20) << __func__
6889 << " pool " << pool << " snap " << snap
6890 << " - key '" << k << "' not found" << dendl;
6891 return -ENOENT;
6892 }
6893 if (it->key().find("purged_snap_") != 0) {
6894 dout(20) << __func__
6895 << " pool " << pool << " snap " << snap
6896 << " - key '" << k << "' got '" << it->key()
6897 << "', wrong prefix" << dendl;
11fdf7f2
TL
6898 return -ENOENT;
6899 }
9f95a23c
TL
6900 string gotk = it->key();
6901 const char *format = "purged_snap_%llu_";
6902 long long int keypool;
6903 int n = sscanf(gotk.c_str(), format, &keypool);
6904 if (n != 1) {
6905 derr << __func__ << " invalid k '" << gotk << "'" << dendl;
6906 return -ENOENT;
6907 }
6908 if (pool != keypool) {
6909 dout(20) << __func__
6910 << " pool " << pool << " snap " << snap
6911 << " - key '" << k << "' got '" << gotk
6912 << "', wrong pool " << keypool
6913 << dendl;
11fdf7f2
TL
6914 return -ENOENT;
6915 }
6916 bufferlist v = it->value();
6917 auto p = v.cbegin();
6918 decode(*begin, p);
6919 decode(*end, p);
6920 if (snap < *begin || snap >= *end) {
9f95a23c
TL
6921 dout(20) << __func__
6922 << " pool " << pool << " snap " << snap
6923 << " - found [" << *begin << "," << *end << "), no overlap"
6924 << dendl;
11fdf7f2
TL
6925 return -ENOENT;
6926 }
6927 return 0;
6928}
6929
9f95a23c
TL
// Record that snaps [start, end) of `pool` have been purged, merging
// the new interval with any adjacent already-recorded intervals so the
// store keeps exactly one record per maximal contiguous run.
// NOTE(review): the `epoch` parameter is never read — all branches
// encode pending_inc.epoch into the record instead; confirm intended.
void OSDMonitor::insert_purged_snap_update(
  int64_t pool,
  snapid_t start, snapid_t end,
  epoch_t epoch,
  MonitorDBStore::TransactionRef t)
{
  snapid_t before_begin, before_end;
  snapid_t after_begin, after_end;
  // is there a recorded interval ending right where ours starts?
  int b = lookup_purged_snap(pool, start - 1,
                             &before_begin, &before_end);
  // ...and one containing/starting at our end?
  int a = lookup_purged_snap(pool, end,
                             &after_begin, &after_end);
  if (!b && !a) {
    // bridges two existing intervals: collapse all three into one record
    dout(10) << __func__
             << " [" << start << "," << end << ") - joins ["
             << before_begin << "," << before_end << ") and ["
             << after_begin << "," << after_end << ")" << dendl;
    // erase only the begin record; we'll overwrite the end one.
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
                                          before_begin, after_end - before_begin,
                                          pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!b) {
    // extend the earlier interval forward through [start, end)
    dout(10) << __func__
             << " [" << start << "," << end << ") - join with earlier ["
             << before_begin << "," << before_end << ")" << dendl;
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
                                          before_begin, end - before_begin,
                                          pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!a) {
    // extend the later interval backward to `start`
    dout(10) << __func__
             << " [" << start << "," << end << ") - join with later ["
             << after_begin << "," << after_end << ")" << dendl;
    // overwrite after record (same key: it is indexed by the last snap)
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
                                          start, after_end - start,
                                          pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else {
    // no neighbors: brand-new standalone record
    dout(10) << __func__
             << " [" << start << "," << end << ") - new"
             << dendl;
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
                                          start, end - start,
                                          pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  }
}
6985
11fdf7f2
TL
// Move snap intervals that the OSDs report as purged (via the mgr stat
// digest) into pending_inc.new_purged_snaps, throttled to at most
// mon_max_snap_prune_per_epoch snaps per epoch.
// Returns true if anything was queued for pruning in this epoch.
bool OSDMonitor::try_prune_purged_snaps()
{
  if (!mon->mgrstatmon()->is_readable()) {
    return false;
  }
  if (!pending_inc.new_purged_snaps.empty()) {
    return false; // we already pruned for this epoch
  }

  unsigned max_prune = cct->_conf.get_val<uint64_t>(
    "mon_max_snap_prune_per_epoch");
  if (!max_prune) {
    max_prune = 100000;  // 0 means "no limit"; use a large batch instead
  }
  dout(10) << __func__ << " max_prune " << max_prune << dendl;

  unsigned actually_pruned = 0;
  auto& purged_snaps = mon->mgrstatmon()->get_digest().purged_snaps;
  for (auto& p : osdmap.get_pools()) {
    auto q = purged_snaps.find(p.first);
    if (q == purged_snaps.end()) {
      continue;
    }
    auto& purged = q->second;
    if (purged.empty()) {
      dout(20) << __func__ << " " << p.first << " nothing purged" << dendl;
      continue;
    }
    dout(20) << __func__ << " pool " << p.first << " purged " << purged << dendl;
    snap_interval_set_t to_prune;
    unsigned maybe_pruned = actually_pruned;
    for (auto i = purged.begin(); i != purged.end(); ++i) {
      snapid_t begin = i.get_start();
      auto end = i.get_start() + i.get_len();
      snapid_t pbegin = 0, pend = 0;
      int r = lookup_purged_snap(p.first, begin, &pbegin, &pend);
      if (r == 0) {
	// already purged.
	// be a bit aggressive about backing off here, because the mon may
	// do a lot of work going through this set, and if we know the
	// purged set from the OSDs is at least *partly* stale we may as
	// well wait for it to be fresh.
	dout(20) << __func__ << " we've already purged " << pbegin
		 << "~" << (pend - pbegin) << dendl;
	break; // next pool
      }
      if (pbegin && pbegin > begin && pbegin < end) {
	// the tail of [begin,end) is purged; shorten the range
	end = pbegin;
      }
      to_prune.insert(begin, end - begin);
      maybe_pruned += end - begin;
      if (maybe_pruned >= max_prune) {
	break;
      }
    }
    if (!to_prune.empty()) {
      // PGs may still be reporting things as purged that we have already
      // pruned from removed_snaps_queue.
      snap_interval_set_t actual;
      auto r = osdmap.removed_snaps_queue.find(p.first);
      if (r != osdmap.removed_snaps_queue.end()) {
	actual.intersection_of(to_prune, r->second);
      }
      actually_pruned += actual.size();
      dout(10) << __func__ << " pool " << p.first << " reports pruned " << to_prune
	       << ", actual pruned " << actual << dendl;
      if (!actual.empty()) {
	pending_inc.new_purged_snaps[p.first].swap(actual);
      }
    }
    if (actually_pruned >= max_prune) {
      break;  // per-epoch budget exhausted
    }
  }
  dout(10) << __func__ << " actually pruned " << actually_pruned << dendl;
  return !!actually_pruned;
}
7064
7c673cae
FG
// Reconcile each pool's FULL_QUOTA/FULL flags with the latest usage
// statistics from the mgr stat monitor.  Returns true if any pool flag
// was staged in pending_inc (i.e. a map update should be proposed).
bool OSDMonitor::update_pools_status()
{
  if (!mon->mgrstatmon()->is_readable())
    return false;

  bool ret = false;

  auto& pools = osdmap.get_pools();
  for (auto it = pools.begin(); it != pools.end(); ++it) {
    // skip pools for which the mgr has no stats yet
    const pool_stat_t *pstat = mon->mgrstatmon()->get_pool_stat(it->first);
    if (!pstat)
      continue;
    const object_stat_sum_t& sum = pstat->stats.sum;
    const pg_pool_t &pool = it->second;
    const string& pool_name = osdmap.get_pool_name(it->first);

    // a pool is "full" once it hits either its byte or object quota
    bool pool_is_full =
      (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
      (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);

    if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
      // currently marked full-by-quota: clear it if usage dropped
      if (pool_is_full)
	continue;

      // NOTE(review): message mentions "NO_QUOTA" but the flags cleared
      // below are FULL_QUOTA|FULL — confirm the wording is intentional.
      mon->clog->info() << "pool '" << pool_name
		       << "' no longer out of quota; removing NO_QUOTA flag";
      // below we cancel FLAG_FULL too, we'll set it again in
      // OSDMonitor::encode_pending if it still fails the osd-full checking.
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      ret = true;
    } else {
      // not yet marked: set the flags if a quota was just exceeded
      if (!pool_is_full)
	continue;

      if (pool.quota_max_bytes > 0 &&
	  (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
	mon->clog->warn() << "pool '" << pool_name << "' is full"
			 << " (reached quota's max_bytes: "
			 << byte_u_t(pool.quota_max_bytes) << ")";
      }
      if (pool.quota_max_objects > 0 &&
	  (uint64_t)sum.num_objects >= pool.quota_max_objects) {
	mon->clog->warn() << "pool '" << pool_name << "' is full"
			 << " (reached quota's max_objects: "
			 << pool.quota_max_objects << ")";
      }
      // set both FLAG_FULL_QUOTA and FLAG_FULL
      // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
      // since FLAG_FULL should always take precedence
      set_pool_flags(it->first,
		     pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_NEARFULL |
		       pg_pool_t::FLAG_BACKFILLFULL);
      ret = true;
    }
  }
  return ret;
}
7125
7c673cae
FG
7126int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
7127{
7128 op->mark_osdmon_event(__func__);
9f95a23c 7129 auto m = op->get_req<MPoolOp>();
7c673cae 7130 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
11fdf7f2 7131 MonSession *session = op->get_session();
7c673cae
FG
7132 if (!session)
7133 return -EPERM;
7134 string erasure_code_profile;
7135 stringstream ss;
31f18b77 7136 string rule_name;
94b18763 7137 int ret = 0;
11fdf7f2
TL
7138 ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
7139 0, 0, 0, 0, 0, 0.0,
7140 erasure_code_profile,
9f95a23c
TL
7141 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, {},
7142 &ss);
94b18763
FG
7143
7144 if (ret < 0) {
7145 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
7146 }
7147 return ret;
7c673cae
FG
7148}
7149
7150int OSDMonitor::crush_rename_bucket(const string& srcname,
7151 const string& dstname,
7152 ostream *ss)
7153{
7154 int ret;
7155 //
7156 // Avoid creating a pending crush if it does not already exists and
7157 // the rename would fail.
7158 //
7159 if (!_have_pending_crush()) {
7160 ret = _get_stable_crush().can_rename_bucket(srcname,
7161 dstname,
7162 ss);
7163 if (ret)
7164 return ret;
7165 }
7166
7167 CrushWrapper newcrush;
7168 _get_pending_crush(newcrush);
7169
7170 ret = newcrush.rename_bucket(srcname,
7171 dstname,
7172 ss);
7173 if (ret)
7174 return ret;
7175
7176 pending_inc.crush.clear();
7177 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7178 *ss << "renamed bucket " << srcname << " into " << dstname;
7179 return 0;
7180}
7181
7182void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
7183{
7184 string replacement = "";
7185
7186 if (plugin == "jerasure_generic" ||
7187 plugin == "jerasure_sse3" ||
7188 plugin == "jerasure_sse4" ||
7189 plugin == "jerasure_neon") {
7190 replacement = "jerasure";
7191 } else if (plugin == "shec_generic" ||
7192 plugin == "shec_sse3" ||
7193 plugin == "shec_sse4" ||
7194 plugin == "shec_neon") {
7195 replacement = "shec";
7196 }
7197
7198 if (replacement != "") {
7199 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
7200 << plugin << " that has been deprecated. Please use "
7201 << replacement << " instead." << dendl;
7202 }
7203}
7204
// Validate and normalize an erasure-code profile: instantiate the
// plugin, run its init() (which fills in plugin defaults in `profile`),
// and sanity-check a user-supplied stripe_unit against the plugin's
// chunk alignment.  Returns 0 on success or a negative errno; error
// text goes to *ss.  `force` bypasses only the 4096-multiple check.
int OSDMonitor::normalize_profile(const string& profilename,
				  ErasureCodeProfile &profile,
				  bool force,
				  ostream *ss)
{
  ErasureCodeInterfaceRef erasure_code;
  ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
  // NOTE(review): `plugin` is dereferenced below without an end() check
  // — assumes a "plugin" entry is always present; confirm callers
  // guarantee this.
  ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
  check_legacy_ec_plugin(plugin->second, profilename);
  int err = instance.factory(plugin->second,
			     g_conf().get_val<std::string>("erasure_code_dir"),
			     profile, &erasure_code, ss);
  if (err) {
    return err;
  }

  err = erasure_code->init(profile, ss);
  if (err) {
    return err;
  }

  auto it = profile.find("stripe_unit");
  if (it != profile.end()) {
    string err_str;
    // accepts IEC suffixes (e.g. "4K")
    uint32_t stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
    if (!err_str.empty()) {
      *ss << "could not parse stripe_unit '" << it->second
	  << "': " << err_str << std::endl;
      return -EINVAL;
    }
    uint32_t data_chunks = erasure_code->get_data_chunk_count();
    uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
    // the plugin may pad the stripe to its alignment; reject a
    // stripe_unit that would not round-trip unchanged
    if (chunk_size != stripe_unit) {
      *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
	  << "alignment. Would be padded to " << chunk_size
	  << std::endl;
      return -EINVAL;
    }
    if ((stripe_unit % 4096) != 0 && !force) {
      *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
	  << "use --force to override this check" << std::endl;
      return -EINVAL;
    }
  }
  return 0;
}
7251
// Find or create a CRUSH rule named `name` for an erasure pool built
// from `profile`.  Return values callers rely on:
//   -EEXIST   rule already exists in the committed map (*rule set)
//   -EALREADY rule already exists in the pending map (*rule set)
//   0         rule newly created in the pending crush map (*rule set)
//   <0        plugin load / rule creation error
int OSDMonitor::crush_rule_create_erasure(const string &name,
					  const string &profile,
					  int *rule,
					  ostream *ss)
{
  int ruleid = osdmap.crush->get_rule_id(name);
  if (ruleid != -ENOENT) {
    // already committed
    *rule = osdmap.crush->get_rule_mask_ruleset(ruleid);
    return -EEXIST;
  }

  CrushWrapper newcrush;
  _get_pending_crush(newcrush);

  ruleid = newcrush.get_rule_id(name);
  if (ruleid != -ENOENT) {
    // already staged, waiting to be proposed
    *rule = newcrush.get_rule_mask_ruleset(ruleid);
    return -EALREADY;
  } else {
    ErasureCodeInterfaceRef erasure_code;
    int err = get_erasure_code(profile, &erasure_code, ss);
    if (err) {
      *ss << "failed to load plugin using profile " << profile << std::endl;
      return err;
    }

    // create_rule returns the new rule id (>= 0) on success
    err = erasure_code->create_rule(name, newcrush, ss);
    erasure_code.reset();
    if (err < 0)
      return err;
    *rule = err;
    // stage the updated crush map
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    return 0;
  }
}
7288
7289int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
7290 ErasureCodeInterfaceRef *erasure_code,
7291 ostream *ss) const
7292{
7293 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
7294 return -EAGAIN;
7295 ErasureCodeProfile profile =
7296 osdmap.get_erasure_code_profile(erasure_code_profile);
7297 ErasureCodeProfile::const_iterator plugin =
7298 profile.find("plugin");
7299 if (plugin == profile.end()) {
7300 *ss << "cannot determine the erasure code plugin"
7301 << " because there is no 'plugin' entry in the erasure_code_profile "
7302 << profile << std::endl;
7303 return -EINVAL;
7304 }
7305 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
7306 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
7307 return instance.factory(plugin->second,
11fdf7f2 7308 g_conf().get_val<std::string>("erasure_code_dir"),
7c673cae
FG
7309 profile, erasure_code, ss);
7310}
7311
7312int OSDMonitor::check_cluster_features(uint64_t features,
7313 stringstream &ss)
7314{
7315 stringstream unsupported_ss;
7316 int unsupported_count = 0;
7317 if ((mon->get_quorum_con_features() & features) != features) {
7318 unsupported_ss << "the monitor cluster";
7319 ++unsupported_count;
7320 }
7321
7322 set<int32_t> up_osds;
7323 osdmap.get_up_osds(up_osds);
7324 for (set<int32_t>::iterator it = up_osds.begin();
7325 it != up_osds.end(); ++it) {
7326 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
7327 if ((xi.features & features) != features) {
7328 if (unsupported_count > 0)
7329 unsupported_ss << ", ";
7330 unsupported_ss << "osd." << *it;
7331 unsupported_count ++;
7332 }
7333 }
7334
7335 if (unsupported_count > 0) {
7336 ss << "features " << features << " unsupported by: "
7337 << unsupported_ss.str();
7338 return -ENOTSUP;
7339 }
7340
7341 // check pending osd state, too!
7342 for (map<int32_t,osd_xinfo_t>::const_iterator p =
7343 pending_inc.new_xinfo.begin();
7344 p != pending_inc.new_xinfo.end(); ++p) {
7345 const osd_xinfo_t &xi = p->second;
7346 if ((xi.features & features) != features) {
7347 dout(10) << __func__ << " pending osd." << p->first
7348 << " features are insufficient; retry" << dendl;
7349 return -EAGAIN;
7350 }
7351 }
7352
7353 return 0;
7354}
7355
// Check that applying `newcrush` would not demand features beyond what
// connected clients (require_min_compat_client) and the current mon/osd
// population support.  Returns false with an explanation in `ss` when
// the change must be rejected.
bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
						 stringstream& ss)
{
  // apply the candidate crush map to a scratch copy of the osdmap
  OSDMap::Incremental new_pending = pending_inc;
  encode(*newcrush, new_pending.crush, mon->get_quorum_con_features());
  OSDMap newmap;
  newmap.deepish_copy_from(osdmap);
  newmap.apply_incremental(new_pending);

  // client compat
  if (newmap.require_min_compat_client != ceph_release_t::unknown) {
    auto mv = newmap.get_min_compat_client();
    if (mv > newmap.require_min_compat_client) {
      ss << "new crush map requires client version " << mv
	 << " but require_min_compat_client is "
	 << newmap.require_min_compat_client;
      return false;
    }
  }

  // osd compat
  uint64_t features =
    newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
    newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
  stringstream features_ss;
  int r = check_cluster_features(features, features_ss);
  if (r) {
    ss << "Could not change CRUSH: " << features_ss.str();
    return false;
  }

  return true;
}
7389
7390bool OSDMonitor::erasure_code_profile_in_use(
7391 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
7392 const string &profile,
7393 ostream *ss)
7394{
7395 bool found = false;
7396 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
7397 p != pools.end();
7398 ++p) {
11fdf7f2 7399 if (p->second.erasure_code_profile == profile && p->second.is_erasure()) {
7c673cae
FG
7400 *ss << osdmap.pool_name[p->first] << " ";
7401 found = true;
7402 }
7403 }
7404 if (found) {
7405 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
7406 }
7407 return found;
7408}
7409
// Parse "key=value" items from `erasure_code_profile` into
// *erasure_code_profile_map, layered on top of the configured default
// profile (osd_pool_default_erasure_code_profile).  Items without '='
// get an empty value; legacy "ruleset-*" keys are rejected (-EINVAL).
// If the user selects a plugin different from the default one, the
// defaults are discarded and only the user-supplied pairs are kept.
int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
					   map<string,string> *erasure_code_profile_map,
					   ostream *ss)
{
  // seed the map from the configured default profile
  int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile",
				    get_json_str_map,
				    *ss,
				    erasure_code_profile_map,
				    true);
  if (r)
    return r;
  ceph_assert((*erasure_code_profile_map).count("plugin"));
  string default_plugin = (*erasure_code_profile_map)["plugin"];
  map<string,string> user_map;
  for (vector<string>::const_iterator i = erasure_code_profile.begin();
       i != erasure_code_profile.end();
       ++i) {
    size_t equal = i->find('=');
    if (equal == string::npos) {
      // bare key: record it with an empty value
      user_map[*i] = string();
      (*erasure_code_profile_map)[*i] = string();
    } else {
      const string key = i->substr(0, equal);
      equal++;
      const string value = i->substr(equal);
      // "ruleset-*" properties were renamed to "crush-*"
      if (key.find("ruleset-") == 0) {
	*ss << "property '" << key << "' is no longer supported; try "
	    << "'crush-" << key.substr(8) << "' instead";
	return -EINVAL;
      }
      user_map[key] = value;
      (*erasure_code_profile_map)[key] = value;
    }
  }

  // a different plugin invalidates the defaults: keep only user input
  if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
    (*erasure_code_profile_map) = user_map;

  return 0;
}
7450
7451int OSDMonitor::prepare_pool_size(const unsigned pool_type,
7452 const string &erasure_code_profile,
11fdf7f2 7453 uint8_t repl_size,
7c673cae
FG
7454 unsigned *size, unsigned *min_size,
7455 ostream *ss)
7456{
7457 int err = 0;
7458 switch (pool_type) {
7459 case pg_pool_t::TYPE_REPLICATED:
11fdf7f2
TL
7460 if (repl_size == 0) {
7461 repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
7462 }
7463 *size = repl_size;
7464 *min_size = g_conf().get_osd_pool_default_min_size(repl_size);
7c673cae
FG
7465 break;
7466 case pg_pool_t::TYPE_ERASURE:
7467 {
7468 ErasureCodeInterfaceRef erasure_code;
7469 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
7470 if (err == 0) {
7471 *size = erasure_code->get_chunk_count();
11fdf7f2
TL
7472 *min_size =
7473 erasure_code->get_data_chunk_count() +
7474 std::min<int>(1, erasure_code->get_coding_chunk_count() - 1);
7475 assert(*min_size <= *size);
7476 assert(*min_size >= erasure_code->get_data_chunk_count());
7c673cae
FG
7477 }
7478 }
7479 break;
7480 default:
7481 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
7482 err = -EINVAL;
7483 break;
7484 }
7485 return err;
7486}
7487
// Compute the stripe width for a new pool.  Replicated pools ignore it
// (*stripe_width untouched); erasure pools use
// data_chunks * chunk_size(stripe_unit * data_chunks), taking
// stripe_unit from the profile when present, else from the
// osd_pool_erasure_code_stripe_unit config option.
int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
					  const string &erasure_code_profile,
					  uint32_t *stripe_width,
					  ostream *ss)
{
  int err = 0;
  switch (pool_type) {
  case pg_pool_t::TYPE_REPLICATED:
    // ignored
    break;
  case pg_pool_t::TYPE_ERASURE:
    {
      ErasureCodeProfile profile =
	osdmap.get_erasure_code_profile(erasure_code_profile);
      ErasureCodeInterfaceRef erasure_code;
      err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
      if (err)
	break;
      uint32_t data_chunks = erasure_code->get_data_chunk_count();
      uint32_t stripe_unit = g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit");
      auto it = profile.find("stripe_unit");
      if (it != profile.end()) {
	string err_str;
	stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
	// presumably the stored profile value was already validated
	// (see normalize_profile) — hence the assert rather than an
	// error return; TODO confirm
	ceph_assert(err_str.empty());
      }
      *stripe_width = data_chunks *
	erasure_code->get_chunk_size(stripe_unit * data_chunks);
    }
    break;
  default:
    *ss << "prepare_pool_stripe_width: "
	<< pool_type << " is not a known pool type";
    err = -EINVAL;
    break;
  }
  return err;
}
7526
// Resolve the CRUSH rule for a new pool.  On entry, *crush_rule < 0
// means "pick one": replicated pools use `rule_name` (or the configured
// default when empty); erasure pools look up / create the rule for
// `erasure_code_profile`, returning -EAGAIN until the newly created
// rule has been proposed.  A non-negative *crush_rule is only validated
// for existence.
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  // Use default rule
	  *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(cct);
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
		<< "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  // look up the explicitly requested rule by name
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	// see crush_rule_create_erasure() for the return-code contract
	switch (err) {
	case -EALREADY:
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  // rule already committed: usable immediately
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
      break;
    }
  } else {
    // caller supplied a rule id: just verify it exists
    if (!osdmap.crush->ruleset_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
7588
31f18b77 7589int OSDMonitor::get_crush_rule(const string &rule_name,
224ce89b
WB
7590 int *crush_rule,
7591 ostream *ss)
7c673cae
FG
7592{
7593 int ret;
31f18b77 7594 ret = osdmap.crush->get_rule_id(rule_name);
7c673cae
FG
7595 if (ret != -ENOENT) {
7596 // found it, use it
31f18b77 7597 *crush_rule = ret;
7c673cae
FG
7598 } else {
7599 CrushWrapper newcrush;
7600 _get_pending_crush(newcrush);
7601
31f18b77 7602 ret = newcrush.get_rule_id(rule_name);
7c673cae
FG
7603 if (ret != -ENOENT) {
7604 // found it, wait for it to be proposed
31f18b77 7605 dout(20) << __func__ << ": rule " << rule_name
7c673cae
FG
7606 << " try again" << dendl;
7607 return -EAGAIN;
7608 } else {
224ce89b 7609 // Cannot find it , return error
31f18b77 7610 *ss << "specified rule " << rule_name << " doesn't exist";
7c673cae
FG
7611 return ret;
7612 }
7613 }
7614 return 0;
7615}
7616
3efd9988
FG
7617int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, ostream *ss)
7618{
11fdf7f2 7619 auto max_pgs_per_osd = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
3efd9988
FG
7620 auto num_osds = std::max(osdmap.get_num_in_osds(), 3u); // assume min cluster size 3
7621 auto max_pgs = max_pgs_per_osd * num_osds;
7622 uint64_t projected = 0;
7623 if (pool < 0) {
7624 projected += pg_num * size;
7625 }
7626 for (const auto& i : osdmap.get_pools()) {
7627 if (i.first == pool) {
7628 projected += pg_num * size;
7629 } else {
11fdf7f2 7630 projected += i.second.get_pg_num_target() * i.second.get_size();
3efd9988
FG
7631 }
7632 }
7633 if (projected > max_pgs) {
7634 if (pool >= 0) {
7635 *ss << "pool id " << pool;
7636 }
7637 *ss << " pg_num " << pg_num << " size " << size
7638 << " would mean " << projected
7639 << " total pgs, which exceeds max " << max_pgs
7640 << " (mon_max_pg_per_osd " << max_pgs_per_osd
7641 << " * num_in_osds " << num_osds << ")";
7642 return -ERANGE;
7643 }
7644 return 0;
7645}
7646
7c673cae
FG
7647/**
7648 * @param name The name of the new pool
31f18b77
FG
7649 * @param crush_rule The crush rule to use. If <0, will use the system default
7650 * @param crush_rule_name The crush rule to use, if crush_rule <0
7c673cae
FG
7651 * @param pg_num The pg_num to use. If set to 0, will use the system default
7652 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
11fdf7f2 7653 * @param repl_size Replication factor, or 0 for default
7c673cae
FG
7654 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
7655 * @param pool_type TYPE_ERASURE, or TYPE_REP
7656 * @param expected_num_objects expected number of objects on the pool
7657 * @param fast_read fast read type.
7658 * @param ss human readable error message, if any.
7659 *
7660 * @return 0 on success, negative errno on failure.
7661 */
11fdf7f2 7662int OSDMonitor::prepare_new_pool(string& name,
31f18b77
FG
7663 int crush_rule,
7664 const string &crush_rule_name,
7c673cae 7665 unsigned pg_num, unsigned pgp_num,
11fdf7f2
TL
7666 unsigned pg_num_min,
7667 const uint64_t repl_size,
7668 const uint64_t target_size_bytes,
7669 const float target_size_ratio,
7c673cae
FG
7670 const string &erasure_code_profile,
7671 const unsigned pool_type,
7672 const uint64_t expected_num_objects,
7673 FastReadType fast_read,
9f95a23c 7674 const string& pg_autoscale_mode,
7c673cae
FG
7675 ostream *ss)
7676{
7677 if (name.length() == 0)
7678 return -EINVAL;
7679 if (pg_num == 0)
11fdf7f2 7680 pg_num = g_conf().get_val<uint64_t>("osd_pool_default_pg_num");
7c673cae 7681 if (pgp_num == 0)
11fdf7f2
TL
7682 pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num");
7683 if (!pgp_num)
7684 pgp_num = pg_num;
7685 if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
7c673cae 7686 *ss << "'pg_num' must be greater than 0 and less than or equal to "
11fdf7f2 7687 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
7c673cae
FG
7688 << " (you may adjust 'mon max pool pg num' for higher values)";
7689 return -ERANGE;
7690 }
7691 if (pgp_num > pg_num) {
7692 *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
7693 << ", which in this case is " << pg_num;
7694 return -ERANGE;
7695 }
7696 if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
7697 *ss << "'fast_read' can only apply to erasure coding pool";
7698 return -EINVAL;
7699 }
7700 int r;
31f18b77
FG
7701 r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
7702 crush_rule_name, &crush_rule, ss);
7c673cae 7703 if (r) {
94b18763 7704 dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
7c673cae
FG
7705 return r;
7706 }
11fdf7f2 7707 if (g_conf()->mon_osd_crush_smoke_test) {
224ce89b
WB
7708 CrushWrapper newcrush;
7709 _get_pending_crush(newcrush);
7710 ostringstream err;
7711 CrushTester tester(newcrush, err);
b5b8bbf5 7712 tester.set_min_x(0);
224ce89b
WB
7713 tester.set_max_x(50);
7714 tester.set_rule(crush_rule);
b5b8bbf5 7715 auto start = ceph::coarse_mono_clock::now();
11fdf7f2 7716 r = tester.test_with_fork(g_conf()->mon_lease);
b5b8bbf5 7717 auto duration = ceph::coarse_mono_clock::now() - start;
224ce89b 7718 if (r < 0) {
94b18763 7719 dout(10) << "tester.test_with_fork returns " << r
224ce89b
WB
7720 << ": " << err.str() << dendl;
7721 *ss << "crush test failed with " << r << ": " << err.str();
7722 return r;
7723 }
181888fb 7724 dout(10) << __func__ << " crush smoke test duration: "
b5b8bbf5 7725 << duration << dendl;
7c673cae
FG
7726 }
7727 unsigned size, min_size;
11fdf7f2
TL
7728 r = prepare_pool_size(pool_type, erasure_code_profile, repl_size,
7729 &size, &min_size, ss);
7c673cae 7730 if (r) {
94b18763 7731 dout(10) << "prepare_pool_size returns " << r << dendl;
7c673cae
FG
7732 return r;
7733 }
3efd9988
FG
7734 r = check_pg_num(-1, pg_num, size, ss);
7735 if (r) {
94b18763 7736 dout(10) << "check_pg_num returns " << r << dendl;
3efd9988
FG
7737 return r;
7738 }
7c673cae 7739
31f18b77 7740 if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) {
7c673cae
FG
7741 return -EINVAL;
7742 }
7743
7744 uint32_t stripe_width = 0;
7745 r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
7746 if (r) {
94b18763 7747 dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
7c673cae
FG
7748 return r;
7749 }
7750
7751 bool fread = false;
7752 if (pool_type == pg_pool_t::TYPE_ERASURE) {
7753 switch (fast_read) {
7754 case FAST_READ_OFF:
7755 fread = false;
7756 break;
7757 case FAST_READ_ON:
7758 fread = true;
7759 break;
7760 case FAST_READ_DEFAULT:
11fdf7f2 7761 fread = g_conf()->osd_pool_default_ec_fast_read;
7c673cae
FG
7762 break;
7763 default:
7764 *ss << "invalid fast_read setting: " << fast_read;
7765 return -EINVAL;
7766 }
7767 }
7768
7769 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
7770 p != pending_inc.new_pool_names.end();
7771 ++p) {
7772 if (p->second == name)
7773 return 0;
7774 }
7775
7776 if (-1 == pending_inc.new_pool_max)
7777 pending_inc.new_pool_max = osdmap.pool_max;
7778 int64_t pool = ++pending_inc.new_pool_max;
7779 pg_pool_t empty;
7780 pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
11fdf7f2 7781 pi->create_time = ceph_clock_now();
7c673cae
FG
7782 pi->type = pool_type;
7783 pi->fast_read = fread;
11fdf7f2
TL
7784 pi->flags = g_conf()->osd_pool_default_flags;
7785 if (g_conf()->osd_pool_default_flag_hashpspool)
7c673cae 7786 pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
11fdf7f2 7787 if (g_conf()->osd_pool_default_flag_nodelete)
7c673cae 7788 pi->set_flag(pg_pool_t::FLAG_NODELETE);
11fdf7f2 7789 if (g_conf()->osd_pool_default_flag_nopgchange)
7c673cae 7790 pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
11fdf7f2 7791 if (g_conf()->osd_pool_default_flag_nosizechange)
7c673cae 7792 pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
11fdf7f2
TL
7793 pi->set_flag(pg_pool_t::FLAG_CREATING);
7794 if (g_conf()->osd_pool_use_gmt_hitset)
7c673cae
FG
7795 pi->use_gmt_hitset = true;
7796 else
7797 pi->use_gmt_hitset = false;
7798
7799 pi->size = size;
7800 pi->min_size = min_size;
31f18b77 7801 pi->crush_rule = crush_rule;
7c673cae
FG
7802 pi->expected_num_objects = expected_num_objects;
7803 pi->object_hash = CEPH_STR_HASH_RJENKINS;
11fdf7f2 7804
9f95a23c
TL
7805 if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
7806 g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode"));
7807 m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
7808 pi->pg_autoscale_mode = m;
7809 } else {
7810 pi->pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
11fdf7f2
TL
7811 }
7812 auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs");
7813 pi->set_pg_num(
7814 max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max))
7815 : pg_num);
7816 pi->set_pg_num_pending(pi->get_pg_num());
7817 pi->set_pg_num_target(pg_num);
7818 pi->set_pgp_num(pi->get_pg_num());
7819 pi->set_pgp_num_target(pgp_num);
9f95a23c 7820 if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
11fdf7f2
TL
7821 pg_num_min) {
7822 pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min));
7823 }
9f95a23c
TL
7824 if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
7825 pg_autoscale_mode); m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
7826 pi->pg_autoscale_mode = m;
7827 }
11fdf7f2 7828
7c673cae 7829 pi->last_change = pending_inc.epoch;
11fdf7f2
TL
7830 pi->auid = 0;
7831
7832 if (pool_type == pg_pool_t::TYPE_ERASURE) {
7833 pi->erasure_code_profile = erasure_code_profile;
7834 } else {
7835 pi->erasure_code_profile = "";
7836 }
7c673cae 7837 pi->stripe_width = stripe_width;
11fdf7f2 7838
9f95a23c 7839 if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
11fdf7f2
TL
7840 target_size_bytes) {
7841 // only store for nautilus+ because TARGET_SIZE_BYTES may be
7842 // larger than int32_t max.
7843 pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes));
7844 }
7845 if (target_size_ratio > 0.0 &&
9f95a23c 7846 osdmap.require_osd_release >= ceph_release_t::nautilus) {
11fdf7f2
TL
7847 // only store for nautilus+, just to be consistent and tidy.
7848 pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio);
7849 }
7850
7c673cae 7851 pi->cache_target_dirty_ratio_micro =
11fdf7f2 7852 g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000;
7c673cae 7853 pi->cache_target_dirty_high_ratio_micro =
11fdf7f2 7854 g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
7c673cae 7855 pi->cache_target_full_ratio_micro =
11fdf7f2
TL
7856 g_conf()->osd_pool_default_cache_target_full_ratio * 1000000;
7857 pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
7858 pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;
7859
7c673cae
FG
7860 pending_inc.new_pool_names[pool] = name;
7861 return 0;
7862}
7863
7864bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
7865{
7866 op->mark_osdmon_event(__func__);
7867 ostringstream ss;
7868 if (pending_inc.new_flags < 0)
7869 pending_inc.new_flags = osdmap.get_flags();
7870 pending_inc.new_flags |= flag;
7871 ss << OSDMap::get_flag_string(flag) << " is set";
7872 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7873 get_last_committed() + 1));
7874 return true;
7875}
7876
7877bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
7878{
7879 op->mark_osdmon_event(__func__);
7880 ostringstream ss;
7881 if (pending_inc.new_flags < 0)
7882 pending_inc.new_flags = osdmap.get_flags();
7883 pending_inc.new_flags &= ~flag;
7884 ss << OSDMap::get_flag_string(flag) << " is unset";
7885 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7886 get_last_committed() + 1));
7887 return true;
7888}
7889
11fdf7f2 7890int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
7c673cae
FG
7891 stringstream& ss)
7892{
7893 string poolstr;
9f95a23c 7894 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
7895 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
7896 if (pool < 0) {
7897 ss << "unrecognized pool '" << poolstr << "'";
7898 return -ENOENT;
7899 }
7900 string var;
9f95a23c 7901 cmd_getval(cmdmap, "var", var);
7c673cae
FG
7902
7903 pg_pool_t p = *osdmap.get_pg_pool(pool);
7904 if (pending_inc.new_pools.count(pool))
7905 p = pending_inc.new_pools[pool];
7906
7907 // accept val as a json string in the normal case (current
7908 // generation monitor). parse out int or float values from the
7909 // string as needed. however, if it is not a string, try to pull
7910 // out an int, in case an older monitor with an older json schema is
7911 // forwarding a request.
7912 string val;
7913 string interr, floaterr;
7914 int64_t n = 0;
7915 double f = 0;
7916 int64_t uf = 0; // micro-f
9f95a23c 7917 cmd_getval(cmdmap, "val", val);
f64942e4 7918
9f95a23c
TL
7919 auto si_options = {
7920 "target_max_objects"
7921 };
7922 auto iec_options = {
7923 "target_max_bytes",
7924 "target_size_bytes",
7925 "compression_max_blob_size",
7926 "compression_min_blob_size",
7927 "csum_max_block",
7928 "csum_min_block",
7929 };
7930 if (count(begin(si_options), end(si_options), var)) {
92f5a8d4 7931 n = strict_si_cast<int64_t>(val.c_str(), &interr);
9f95a23c 7932 } else if (count(begin(iec_options), end(iec_options), var)) {
92f5a8d4
TL
7933 n = strict_iec_cast<int64_t>(val.c_str(), &interr);
7934 } else {
7935 // parse string as both int and float; different fields use different types.
7936 n = strict_strtoll(val.c_str(), 10, &interr);
7937 f = strict_strtod(val.c_str(), &floaterr);
7938 uf = llrintl(f * (double)1000000.0);
7939 }
7c673cae
FG
7940
7941 if (!p.is_tier() &&
7942 (var == "hit_set_type" || var == "hit_set_period" ||
7943 var == "hit_set_count" || var == "hit_set_fpp" ||
7944 var == "target_max_objects" || var == "target_max_bytes" ||
7945 var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
7946 var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
7947 var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
7948 var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
7949 var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
7950 return -EACCES;
7951 }
7952
7953 if (var == "size") {
7954 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
7955 ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
7956 return -EPERM;
7957 }
7958 if (p.type == pg_pool_t::TYPE_ERASURE) {
7959 ss << "can not change the size of an erasure-coded pool";
7960 return -ENOTSUP;
7961 }
7962 if (interr.length()) {
7963 ss << "error parsing integer value '" << val << "': " << interr;
7964 return -EINVAL;
7965 }
7966 if (n <= 0 || n > 10) {
7967 ss << "pool size must be between 1 and 10";
7968 return -EINVAL;
7969 }
eafe8130
TL
7970 if (!osdmap.crush->check_crush_rule(p.get_crush_rule(), p.type, n, ss)) {
7971 return -EINVAL;
7972 }
3efd9988
FG
7973 int r = check_pg_num(pool, p.get_pg_num(), n, &ss);
7974 if (r < 0) {
7975 return r;
7976 }
7c673cae 7977 p.size = n;
1911f103 7978 p.min_size = g_conf().get_osd_pool_default_min_size(p.size);
7c673cae
FG
7979 } else if (var == "min_size") {
7980 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
7981 ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
7982 return -EPERM;
7983 }
7984 if (interr.length()) {
7985 ss << "error parsing integer value '" << val << "': " << interr;
7986 return -EINVAL;
7987 }
7988
7989 if (p.type != pg_pool_t::TYPE_ERASURE) {
7990 if (n < 1 || n > p.size) {
494da23a 7991 ss << "pool min_size must be between 1 and size, which is set to " << (int)p.size;
7c673cae
FG
7992 return -EINVAL;
7993 }
7994 } else {
7995 ErasureCodeInterfaceRef erasure_code;
7996 int k;
7997 stringstream tmp;
7998 int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
7999 if (err == 0) {
8000 k = erasure_code->get_data_chunk_count();
8001 } else {
b32b8144 8002 ss << __func__ << " get_erasure_code failed: " << tmp.str();
7c673cae
FG
8003 return err;
8004 }
8005
8006 if (n < k || n > p.size) {
494da23a 8007 ss << "pool min_size must be between " << k << " and size, which is set to " << (int)p.size;
7c673cae
FG
8008 return -EINVAL;
8009 }
8010 }
8011 p.min_size = n;
11fdf7f2 8012 } else if (var == "pg_num_actual") {
7c673cae
FG
8013 if (interr.length()) {
8014 ss << "error parsing integer value '" << val << "': " << interr;
8015 return -EINVAL;
8016 }
11fdf7f2
TL
8017 if (n == (int)p.get_pg_num()) {
8018 return 0;
8019 }
8020 if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
8021 ss << "'pg_num' must be greater than 0 and less than or equal to "
8022 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
8023 << " (you may adjust 'mon max pool pg num' for higher values)";
8024 return -ERANGE;
8025 }
8026 if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
8027 ss << "cannot adjust pg_num while initial PGs are being created";
8028 return -EBUSY;
8029 }
8030 if (n > (int)p.get_pg_num()) {
8031 if (p.get_pg_num() != p.get_pg_num_pending()) {
8032 // force pre-nautilus clients to resend their ops, since they
8033 // don't understand pg_num_pending changes form a new interval
8034 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8035 }
8036 p.set_pg_num(n);
8037 } else {
9f95a23c 8038 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
11fdf7f2
TL
8039 ss << "nautilus OSDs are required to adjust pg_num_pending";
8040 return -EPERM;
8041 }
8042 if (n < (int)p.get_pgp_num()) {
8043 ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
8044 return -EINVAL;
8045 }
8046 if (n < (int)p.get_pg_num() - 1) {
8047 ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
8048 << ") - 1; only single pg decrease is currently supported";
8049 return -EINVAL;
8050 }
8051 p.set_pg_num_pending(n);
8052 // force pre-nautilus clients to resend their ops, since they
8053 // don't understand pg_num_pending changes form a new interval
8054 p.last_force_op_resend_prenautilus = pending_inc.epoch;
7c673cae 8055 }
11fdf7f2
TL
8056 // force pre-luminous clients to resend their ops, since they
8057 // don't understand that split PGs now form a new interval.
8058 p.last_force_op_resend_preluminous = pending_inc.epoch;
7c673cae
FG
8059 } else if (var == "pg_num") {
8060 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8061 ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8062 return -EPERM;
8063 }
8064 if (interr.length()) {
8065 ss << "error parsing integer value '" << val << "': " << interr;
8066 return -EINVAL;
8067 }
11fdf7f2 8068 if (n == (int)p.get_pg_num_target()) {
7c673cae
FG
8069 return 0;
8070 }
11fdf7f2
TL
8071 if (n <= 0 || static_cast<uint64_t>(n) >
8072 g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
c07f9fc5 8073 ss << "'pg_num' must be greater than 0 and less than or equal to "
11fdf7f2 8074 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
c07f9fc5
FG
8075 << " (you may adjust 'mon max pool pg num' for higher values)";
8076 return -ERANGE;
8077 }
11fdf7f2
TL
8078 if (n > (int)p.get_pg_num_target()) {
8079 int r = check_pg_num(pool, n, p.get_size(), &ss);
8080 if (r) {
8081 return r;
8082 }
8083 bool force = false;
9f95a23c 8084 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
11fdf7f2
TL
8085 if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
8086 ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
8087 return -EPERM;
8088 }
8089 } else {
9f95a23c 8090 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
11fdf7f2
TL
8091 ss << "nautilus OSDs are required to decrease pg_num";
8092 return -EPERM;
8093 }
7c673cae 8094 }
9f95a23c 8095 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
494da23a
TL
8096 // pre-nautilus osdmap format; increase pg_num directly
8097 assert(n > (int)p.get_pg_num());
8098 // force pre-nautilus clients to resend their ops, since they
8099 // don't understand pg_num_target changes form a new interval
8100 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8101 // force pre-luminous clients to resend their ops, since they
8102 // don't understand that split PGs now form a new interval.
8103 p.last_force_op_resend_preluminous = pending_inc.epoch;
8104 p.set_pg_num(n);
8105 } else {
8106 // set targets; mgr will adjust pg_num_actual and pgp_num later.
8107 // make pgp_num track pg_num if it already matches. if it is set
8108 // differently, leave it different and let the user control it
8109 // manually.
8110 if (p.get_pg_num_target() == p.get_pgp_num_target()) {
8111 p.set_pgp_num_target(n);
8112 }
8113 p.set_pg_num_target(n);
7c673cae 8114 }
11fdf7f2 8115 } else if (var == "pgp_num_actual") {
7c673cae
FG
8116 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8117 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8118 return -EPERM;
8119 }
8120 if (interr.length()) {
8121 ss << "error parsing integer value '" << val << "': " << interr;
8122 return -EINVAL;
8123 }
8124 if (n <= 0) {
8125 ss << "specified pgp_num must > 0, but you set to " << n;
8126 return -EINVAL;
8127 }
8128 if (n > (int)p.get_pg_num()) {
8129 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
8130 return -EINVAL;
8131 }
11fdf7f2
TL
8132 if (n > (int)p.get_pg_num_pending()) {
8133 ss << "specified pgp_num " << n
8134 << " > pg_num_pending " << p.get_pg_num_pending();
8135 return -EINVAL;
8136 }
7c673cae 8137 p.set_pgp_num(n);
11fdf7f2
TL
8138 } else if (var == "pgp_num") {
8139 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8140 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8141 return -EPERM;
8142 }
8143 if (interr.length()) {
8144 ss << "error parsing integer value '" << val << "': " << interr;
8145 return -EINVAL;
8146 }
8147 if (n <= 0) {
8148 ss << "specified pgp_num must > 0, but you set to " << n;
8149 return -EINVAL;
8150 }
8151 if (n > (int)p.get_pg_num_target()) {
8152 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target();
8153 return -EINVAL;
8154 }
9f95a23c 8155 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
494da23a
TL
8156 // pre-nautilus osdmap format; increase pgp_num directly
8157 p.set_pgp_num(n);
8158 } else {
8159 p.set_pgp_num_target(n);
8160 }
11fdf7f2 8161 } else if (var == "pg_autoscale_mode") {
9f95a23c
TL
8162 auto m = pg_pool_t::get_pg_autoscale_mode_by_name(val);
8163 if (m == pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
11fdf7f2
TL
8164 ss << "specified invalid mode " << val;
8165 return -EINVAL;
8166 }
9f95a23c 8167 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
494da23a
TL
8168 ss << "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
8169 return -EINVAL;
8170 }
9f95a23c 8171 p.pg_autoscale_mode = m;
7c673cae
FG
8172 } else if (var == "crush_rule") {
8173 int id = osdmap.crush->get_rule_id(val);
8174 if (id == -ENOENT) {
8175 ss << "crush rule " << val << " does not exist";
8176 return -ENOENT;
8177 }
8178 if (id < 0) {
8179 ss << cpp_strerror(id);
8180 return -ENOENT;
8181 }
8182 if (!osdmap.crush->check_crush_rule(id, p.get_type(), p.get_size(), ss)) {
8183 return -EINVAL;
8184 }
31f18b77 8185 p.crush_rule = id;
7c673cae
FG
8186 } else if (var == "nodelete" || var == "nopgchange" ||
8187 var == "nosizechange" || var == "write_fadvise_dontneed" ||
8188 var == "noscrub" || var == "nodeep-scrub") {
8189 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8190 // make sure we only compare against 'n' if we didn't receive a string
8191 if (val == "true" || (interr.empty() && n == 1)) {
8192 p.set_flag(flag);
8193 } else if (val == "false" || (interr.empty() && n == 0)) {
8194 p.unset_flag(flag);
8195 } else {
8196 ss << "expecting value 'true', 'false', '0', or '1'";
8197 return -EINVAL;
8198 }
8199 } else if (var == "hashpspool") {
8200 uint64_t flag = pg_pool_t::get_flag_by_name(var);
11fdf7f2 8201 bool force = false;
9f95a23c 8202 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
11fdf7f2
TL
8203
8204 if (!force) {
7c673cae
FG
8205 ss << "are you SURE? this will remap all placement groups in this pool,"
8206 " this triggers large data movement,"
8207 " pass --yes-i-really-mean-it if you really do.";
8208 return -EPERM;
8209 }
8210 // make sure we only compare against 'n' if we didn't receive a string
8211 if (val == "true" || (interr.empty() && n == 1)) {
8212 p.set_flag(flag);
8213 } else if (val == "false" || (interr.empty() && n == 0)) {
8214 p.unset_flag(flag);
8215 } else {
8216 ss << "expecting value 'true', 'false', '0', or '1'";
8217 return -EINVAL;
8218 }
8219 } else if (var == "hit_set_type") {
8220 if (val == "none")
8221 p.hit_set_params = HitSet::Params();
8222 else {
8223 int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
8224 if (err)
8225 return err;
8226 if (val == "bloom") {
8227 BloomHitSet::Params *bsp = new BloomHitSet::Params;
11fdf7f2 8228 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
7c673cae
FG
8229 p.hit_set_params = HitSet::Params(bsp);
8230 } else if (val == "explicit_hash")
8231 p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
8232 else if (val == "explicit_object")
8233 p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
8234 else {
8235 ss << "unrecognized hit_set type '" << val << "'";
8236 return -EINVAL;
8237 }
8238 }
8239 } else if (var == "hit_set_period") {
8240 if (interr.length()) {
8241 ss << "error parsing integer value '" << val << "': " << interr;
8242 return -EINVAL;
11fdf7f2
TL
8243 } else if (n < 0) {
8244 ss << "hit_set_period should be non-negative";
8245 return -EINVAL;
7c673cae
FG
8246 }
8247 p.hit_set_period = n;
8248 } else if (var == "hit_set_count") {
8249 if (interr.length()) {
8250 ss << "error parsing integer value '" << val << "': " << interr;
8251 return -EINVAL;
11fdf7f2
TL
8252 } else if (n < 0) {
8253 ss << "hit_set_count should be non-negative";
8254 return -EINVAL;
7c673cae
FG
8255 }
8256 p.hit_set_count = n;
8257 } else if (var == "hit_set_fpp") {
8258 if (floaterr.length()) {
8259 ss << "error parsing floating point value '" << val << "': " << floaterr;
8260 return -EINVAL;
11fdf7f2
TL
8261 } else if (f < 0 || f > 1.0) {
8262 ss << "hit_set_fpp should be in the range 0..1";
8263 return -EINVAL;
7c673cae
FG
8264 }
8265 if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
8266 ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
8267 return -EINVAL;
8268 }
8269 BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
8270 bloomp->set_fpp(f);
8271 } else if (var == "use_gmt_hitset") {
8272 if (val == "true" || (interr.empty() && n == 1)) {
7c673cae
FG
8273 p.use_gmt_hitset = true;
8274 } else {
8275 ss << "expecting value 'true' or '1'";
8276 return -EINVAL;
8277 }
8278 } else if (var == "allow_ec_overwrites") {
8279 if (!p.is_erasure()) {
8280 ss << "ec overwrites can only be enabled for an erasure coded pool";
8281 return -EINVAL;
8282 }
224ce89b 8283 stringstream err;
11fdf7f2 8284 if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites &&
224ce89b
WB
8285 !is_pool_currently_all_bluestore(pool, p, &err)) {
8286 ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
8287 return -EINVAL;
8288 }
7c673cae
FG
8289 if (val == "true" || (interr.empty() && n == 1)) {
8290 p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
8291 } else if (val == "false" || (interr.empty() && n == 0)) {
8292 ss << "ec overwrites cannot be disabled once enabled";
8293 return -EINVAL;
8294 } else {
8295 ss << "expecting value 'true', 'false', '0', or '1'";
8296 return -EINVAL;
8297 }
7c673cae
FG
8298 } else if (var == "target_max_objects") {
8299 if (interr.length()) {
8300 ss << "error parsing int '" << val << "': " << interr;
8301 return -EINVAL;
8302 }
8303 p.target_max_objects = n;
8304 } else if (var == "target_max_bytes") {
8305 if (interr.length()) {
8306 ss << "error parsing int '" << val << "': " << interr;
8307 return -EINVAL;
8308 }
8309 p.target_max_bytes = n;
8310 } else if (var == "cache_target_dirty_ratio") {
8311 if (floaterr.length()) {
8312 ss << "error parsing float '" << val << "': " << floaterr;
8313 return -EINVAL;
8314 }
8315 if (f < 0 || f > 1.0) {
8316 ss << "value must be in the range 0..1";
8317 return -ERANGE;
8318 }
8319 p.cache_target_dirty_ratio_micro = uf;
8320 } else if (var == "cache_target_dirty_high_ratio") {
8321 if (floaterr.length()) {
8322 ss << "error parsing float '" << val << "': " << floaterr;
8323 return -EINVAL;
8324 }
8325 if (f < 0 || f > 1.0) {
8326 ss << "value must be in the range 0..1";
8327 return -ERANGE;
8328 }
8329 p.cache_target_dirty_high_ratio_micro = uf;
8330 } else if (var == "cache_target_full_ratio") {
8331 if (floaterr.length()) {
8332 ss << "error parsing float '" << val << "': " << floaterr;
8333 return -EINVAL;
8334 }
8335 if (f < 0 || f > 1.0) {
8336 ss << "value must be in the range 0..1";
8337 return -ERANGE;
8338 }
8339 p.cache_target_full_ratio_micro = uf;
8340 } else if (var == "cache_min_flush_age") {
8341 if (interr.length()) {
8342 ss << "error parsing int '" << val << "': " << interr;
8343 return -EINVAL;
8344 }
8345 p.cache_min_flush_age = n;
8346 } else if (var == "cache_min_evict_age") {
8347 if (interr.length()) {
8348 ss << "error parsing int '" << val << "': " << interr;
8349 return -EINVAL;
8350 }
8351 p.cache_min_evict_age = n;
8352 } else if (var == "min_read_recency_for_promote") {
8353 if (interr.length()) {
8354 ss << "error parsing integer value '" << val << "': " << interr;
8355 return -EINVAL;
8356 }
8357 p.min_read_recency_for_promote = n;
8358 } else if (var == "hit_set_grade_decay_rate") {
8359 if (interr.length()) {
8360 ss << "error parsing integer value '" << val << "': " << interr;
8361 return -EINVAL;
8362 }
8363 if (n > 100 || n < 0) {
8364 ss << "value out of range,valid range is 0 - 100";
8365 return -EINVAL;
8366 }
8367 p.hit_set_grade_decay_rate = n;
8368 } else if (var == "hit_set_search_last_n") {
8369 if (interr.length()) {
8370 ss << "error parsing integer value '" << val << "': " << interr;
8371 return -EINVAL;
8372 }
8373 if (n > p.hit_set_count || n < 0) {
8374 ss << "value out of range,valid range is 0 - hit_set_count";
8375 return -EINVAL;
8376 }
8377 p.hit_set_search_last_n = n;
8378 } else if (var == "min_write_recency_for_promote") {
8379 if (interr.length()) {
8380 ss << "error parsing integer value '" << val << "': " << interr;
8381 return -EINVAL;
8382 }
8383 p.min_write_recency_for_promote = n;
8384 } else if (var == "fast_read") {
8385 if (p.is_replicated()) {
8386 ss << "fast read is not supported in replication pool";
8387 return -EINVAL;
8388 }
8389 if (val == "true" || (interr.empty() && n == 1)) {
8390 p.fast_read = true;
8391 } else if (val == "false" || (interr.empty() && n == 0)) {
8392 p.fast_read = false;
8393 } else {
8394 ss << "expecting value 'true', 'false', '0', or '1'";
8395 return -EINVAL;
8396 }
8397 } else if (pool_opts_t::is_opt_name(var)) {
224ce89b 8398 bool unset = val == "unset";
7c673cae 8399 if (var == "compression_mode") {
224ce89b
WB
8400 if (!unset) {
8401 auto cmode = Compressor::get_comp_mode_type(val);
8402 if (!cmode) {
8403 ss << "unrecognized compression mode '" << val << "'";
8404 return -EINVAL;
8405 }
7c673cae
FG
8406 }
8407 } else if (var == "compression_algorithm") {
224ce89b
WB
8408 if (!unset) {
8409 auto alg = Compressor::get_comp_alg_type(val);
8410 if (!alg) {
8411 ss << "unrecognized compression_algorithm '" << val << "'";
8412 return -EINVAL;
8413 }
7c673cae
FG
8414 }
8415 } else if (var == "compression_required_ratio") {
8416 if (floaterr.length()) {
8417 ss << "error parsing float value '" << val << "': " << floaterr;
8418 return -EINVAL;
8419 }
224ce89b 8420 if (f < 0 || f > 1) {
7c673cae 8421 ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
224ce89b 8422 return -EINVAL;
7c673cae
FG
8423 }
8424 } else if (var == "csum_type") {
224ce89b 8425 auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
7c673cae
FG
8426 if (t < 0 ) {
8427 ss << "unrecognized csum_type '" << val << "'";
224ce89b 8428 return -EINVAL;
7c673cae
FG
8429 }
8430 //preserve csum_type numeric value
8431 n = t;
8432 interr.clear();
8433 } else if (var == "compression_max_blob_size" ||
8434 var == "compression_min_blob_size" ||
8435 var == "csum_max_block" ||
8436 var == "csum_min_block") {
8437 if (interr.length()) {
8438 ss << "error parsing int value '" << val << "': " << interr;
8439 return -EINVAL;
8440 }
11fdf7f2
TL
8441 } else if (var == "fingerprint_algorithm") {
8442 if (!unset) {
8443 auto alg = pg_pool_t::get_fingerprint_from_str(val);
8444 if (!alg) {
8445 ss << "unrecognized fingerprint_algorithm '" << val << "'";
8446 return -EINVAL;
8447 }
8448 }
92f5a8d4
TL
8449 } else if (var == "target_size_bytes") {
8450 if (interr.length()) {
8451 ss << "error parsing unit value '" << val << "': " << interr;
8452 return -EINVAL;
8453 }
9f95a23c 8454 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
92f5a8d4
TL
8455 ss << "must set require_osd_release to nautilus or "
8456 << "later before setting target_size_bytes";
8457 return -EINVAL;
8458 }
11fdf7f2
TL
8459 } else if (var == "pg_num_min") {
8460 if (interr.length()) {
8461 ss << "error parsing int value '" << val << "': " << interr;
8462 return -EINVAL;
8463 }
8464 if (n > (int)p.get_pg_num_target()) {
8465 ss << "specified pg_num_min " << n
8466 << " > pg_num " << p.get_pg_num_target();
8467 return -EINVAL;
8468 }
8469 } else if (var == "recovery_priority") {
8470 if (interr.length()) {
8471 ss << "error parsing int value '" << val << "': " << interr;
8472 return -EINVAL;
8473 }
81eedcae
TL
8474 if (!g_conf()->debug_allow_any_pool_priority) {
8475 if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
8476 ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
8477 << " and " << OSD_POOL_PRIORITY_MAX;
8478 return -EINVAL;
8479 }
11fdf7f2
TL
8480 }
8481 } else if (var == "pg_autoscale_bias") {
8482 if (f < 0.0 || f > 1000.0) {
8483 ss << "pg_autoscale_bias must be between 0 and 1000";
8484 return -EINVAL;
8485 }
7c673cae
FG
8486 }
8487
8488 pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
8489 switch (desc.type) {
8490 case pool_opts_t::STR:
224ce89b 8491 if (unset) {
7c673cae
FG
8492 p.opts.unset(desc.key);
8493 } else {
8494 p.opts.set(desc.key, static_cast<std::string>(val));
8495 }
8496 break;
8497 case pool_opts_t::INT:
8498 if (interr.length()) {
8499 ss << "error parsing integer value '" << val << "': " << interr;
8500 return -EINVAL;
8501 }
8502 if (n == 0) {
8503 p.opts.unset(desc.key);
8504 } else {
11fdf7f2 8505 p.opts.set(desc.key, static_cast<int64_t>(n));
7c673cae
FG
8506 }
8507 break;
8508 case pool_opts_t::DOUBLE:
8509 if (floaterr.length()) {
8510 ss << "error parsing floating point value '" << val << "': " << floaterr;
8511 return -EINVAL;
8512 }
8513 if (f == 0) {
8514 p.opts.unset(desc.key);
8515 } else {
8516 p.opts.set(desc.key, static_cast<double>(f));
8517 }
8518 break;
8519 default:
11fdf7f2 8520 ceph_assert(!"unknown type");
7c673cae
FG
8521 }
8522 } else {
8523 ss << "unrecognized variable '" << var << "'";
8524 return -EINVAL;
8525 }
224ce89b
WB
8526 if (val != "unset") {
8527 ss << "set pool " << pool << " " << var << " to " << val;
8528 } else {
8529 ss << "unset pool " << pool << " " << var;
8530 }
7c673cae
FG
8531 p.last_change = pending_inc.epoch;
8532 pending_inc.new_pools[pool] = p;
8533 return 0;
8534}
8535
c07f9fc5 8536int OSDMonitor::prepare_command_pool_application(const string &prefix,
11fdf7f2 8537 const cmdmap_t& cmdmap,
c07f9fc5 8538 stringstream& ss)
11fdf7f2
TL
8539{
8540 return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
8541}
8542
8543int OSDMonitor::preprocess_command_pool_application(const string &prefix,
8544 const cmdmap_t& cmdmap,
8545 stringstream& ss,
8546 bool *modified)
8547{
8548 return _command_pool_application(prefix, cmdmap, ss, modified, false);
8549}
8550
8551
8552/**
8553 * Common logic for preprocess and prepare phases of pool application
8554 * tag commands. In preprocess mode we're only detecting invalid
8555 * commands, and determining whether it was a modification or a no-op.
8556 * In prepare mode we're actually updating the pending state.
8557 */
8558int OSDMonitor::_command_pool_application(const string &prefix,
8559 const cmdmap_t& cmdmap,
8560 stringstream& ss,
8561 bool *modified,
8562 bool preparing)
c07f9fc5
FG
8563{
8564 string pool_name;
9f95a23c 8565 cmd_getval(cmdmap, "pool", pool_name);
c07f9fc5
FG
8566 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
8567 if (pool < 0) {
8568 ss << "unrecognized pool '" << pool_name << "'";
8569 return -ENOENT;
8570 }
8571
8572 pg_pool_t p = *osdmap.get_pg_pool(pool);
11fdf7f2
TL
8573 if (preparing) {
8574 if (pending_inc.new_pools.count(pool)) {
8575 p = pending_inc.new_pools[pool];
8576 }
c07f9fc5
FG
8577 }
8578
8579 string app;
9f95a23c 8580 cmd_getval(cmdmap, "app", app);
c07f9fc5
FG
8581 bool app_exists = (p.application_metadata.count(app) > 0);
8582
11fdf7f2 8583 string key;
9f95a23c 8584 cmd_getval(cmdmap, "key", key);
11fdf7f2
TL
8585 if (key == "all") {
8586 ss << "key cannot be 'all'";
8587 return -EINVAL;
8588 }
8589
8590 string value;
9f95a23c 8591 cmd_getval(cmdmap, "value", value);
11fdf7f2
TL
8592 if (value == "all") {
8593 ss << "value cannot be 'all'";
8594 return -EINVAL;
8595 }
8596
c07f9fc5
FG
8597 if (boost::algorithm::ends_with(prefix, "enable")) {
8598 if (app.empty()) {
8599 ss << "application name must be provided";
8600 return -EINVAL;
8601 }
8602
8603 if (p.is_tier()) {
8604 ss << "application must be enabled on base tier";
8605 return -EINVAL;
8606 }
8607
11fdf7f2 8608 bool force = false;
9f95a23c 8609 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
c07f9fc5 8610
11fdf7f2 8611 if (!app_exists && !p.application_metadata.empty() && !force) {
c07f9fc5
FG
8612 ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
8613 << "application; pass --yes-i-really-mean-it to proceed anyway";
8614 return -EPERM;
8615 }
8616
8617 if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
8618 ss << "too many enabled applications on pool '" << pool_name << "'; "
8619 << "max " << MAX_POOL_APPLICATIONS;
8620 return -EINVAL;
8621 }
8622
8623 if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
8624 ss << "application name '" << app << "' too long; max length "
8625 << MAX_POOL_APPLICATION_LENGTH;
8626 return -EINVAL;
8627 }
8628
8629 if (!app_exists) {
8630 p.application_metadata[app] = {};
8631 }
8632 ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
8633
8634 } else if (boost::algorithm::ends_with(prefix, "disable")) {
11fdf7f2 8635 bool force = false;
9f95a23c 8636 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
c07f9fc5 8637
11fdf7f2 8638 if (!force) {
c07f9fc5
FG
8639 ss << "Are you SURE? Disabling an application within a pool might result "
8640 << "in loss of application functionality; pass "
8641 << "--yes-i-really-mean-it to proceed anyway";
8642 return -EPERM;
8643 }
8644
8645 if (!app_exists) {
8646 ss << "application '" << app << "' is not enabled on pool '" << pool_name
8647 << "'";
8648 return 0; // idempotent
8649 }
8650
8651 p.application_metadata.erase(app);
8652 ss << "disable application '" << app << "' on pool '" << pool_name << "'";
8653
8654 } else if (boost::algorithm::ends_with(prefix, "set")) {
8655 if (p.is_tier()) {
8656 ss << "application metadata must be set on base tier";
8657 return -EINVAL;
8658 }
8659
8660 if (!app_exists) {
8661 ss << "application '" << app << "' is not enabled on pool '" << pool_name
8662 << "'";
8663 return -ENOENT;
8664 }
8665
8666 string key;
9f95a23c 8667 cmd_getval(cmdmap, "key", key);
c07f9fc5
FG
8668
8669 if (key.empty()) {
8670 ss << "key must be provided";
8671 return -EINVAL;
8672 }
8673
8674 auto &app_keys = p.application_metadata[app];
8675 if (app_keys.count(key) == 0 &&
8676 app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
8677 ss << "too many keys set for application '" << app << "' on pool '"
8678 << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
8679 return -EINVAL;
8680 }
8681
8682 if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
8683 ss << "key '" << app << "' too long; max length "
8684 << MAX_POOL_APPLICATION_LENGTH;
8685 return -EINVAL;
8686 }
8687
8688 string value;
9f95a23c 8689 cmd_getval(cmdmap, "value", value);
c07f9fc5
FG
8690 if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
8691 ss << "value '" << value << "' too long; max length "
8692 << MAX_POOL_APPLICATION_LENGTH;
8693 return -EINVAL;
8694 }
8695
8696 p.application_metadata[app][key] = value;
8697 ss << "set application '" << app << "' key '" << key << "' to '"
8698 << value << "' on pool '" << pool_name << "'";
8699 } else if (boost::algorithm::ends_with(prefix, "rm")) {
8700 if (!app_exists) {
8701 ss << "application '" << app << "' is not enabled on pool '" << pool_name
8702 << "'";
8703 return -ENOENT;
8704 }
8705
8706 string key;
9f95a23c 8707 cmd_getval(cmdmap, "key", key);
c07f9fc5
FG
8708 auto it = p.application_metadata[app].find(key);
8709 if (it == p.application_metadata[app].end()) {
8710 ss << "application '" << app << "' on pool '" << pool_name
8711 << "' does not have key '" << key << "'";
8712 return 0; // idempotent
8713 }
8714
8715 p.application_metadata[app].erase(it);
8716 ss << "removed application '" << app << "' key '" << key << "' on pool '"
8717 << pool_name << "'";
8718 } else {
11fdf7f2
TL
8719 ceph_abort();
8720 }
8721
8722 if (preparing) {
8723 p.last_change = pending_inc.epoch;
8724 pending_inc.new_pools[pool] = p;
8725 }
8726
8727 // Because we fell through this far, we didn't hit no-op cases,
8728 // so pool was definitely modified
8729 if (modified != nullptr) {
8730 *modified = true;
c07f9fc5
FG
8731 }
8732
c07f9fc5
FG
8733 return 0;
8734}
8735
31f18b77
FG
8736int OSDMonitor::_prepare_command_osd_crush_remove(
8737 CrushWrapper &newcrush,
8738 int32_t id,
8739 int32_t ancestor,
8740 bool has_ancestor,
8741 bool unlink_only)
8742{
8743 int err = 0;
8744
8745 if (has_ancestor) {
11fdf7f2 8746 err = newcrush.remove_item_under(cct, id, ancestor,
31f18b77
FG
8747 unlink_only);
8748 } else {
11fdf7f2 8749 err = newcrush.remove_item(cct, id, unlink_only);
31f18b77
FG
8750 }
8751 return err;
8752}
8753
8754void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
8755{
8756 pending_inc.crush.clear();
8757 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8758}
8759
8760int OSDMonitor::prepare_command_osd_crush_remove(
8761 CrushWrapper &newcrush,
8762 int32_t id,
8763 int32_t ancestor,
8764 bool has_ancestor,
8765 bool unlink_only)
8766{
8767 int err = _prepare_command_osd_crush_remove(
8768 newcrush, id, ancestor,
8769 has_ancestor, unlink_only);
8770
8771 if (err < 0)
8772 return err;
8773
11fdf7f2 8774 ceph_assert(err == 0);
31f18b77
FG
8775 do_osd_crush_remove(newcrush);
8776
8777 return 0;
8778}
8779
8780int OSDMonitor::prepare_command_osd_remove(int32_t id)
8781{
8782 if (osdmap.is_up(id)) {
8783 return -EBUSY;
8784 }
8785
8786 pending_inc.new_state[id] = osdmap.get_state(id);
8787 pending_inc.new_uuid[id] = uuid_d();
8788 pending_metadata_rm.insert(id);
8789 pending_metadata.erase(id);
8790
8791 return 0;
8792}
8793
8794int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
8795{
11fdf7f2 8796 ceph_assert(existing_id);
31f18b77
FG
8797 *existing_id = -1;
8798
8799 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
8800 if (!osdmap.exists(i) &&
8801 pending_inc.new_up_client.count(i) == 0 &&
8802 (pending_inc.new_state.count(i) == 0 ||
8803 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
8804 *existing_id = i;
8805 return -1;
8806 }
8807 }
8808
8809 if (pending_inc.new_max_osd < 0) {
8810 return osdmap.get_max_osd();
8811 }
8812 return pending_inc.new_max_osd;
8813}
8814
8815void OSDMonitor::do_osd_create(
8816 const int32_t id,
8817 const uuid_d& uuid,
3a9019d9 8818 const string& device_class,
31f18b77
FG
8819 int32_t* new_id)
8820{
8821 dout(10) << __func__ << " uuid " << uuid << dendl;
11fdf7f2 8822 ceph_assert(new_id);
31f18b77
FG
8823
8824 // We presume validation has been performed prior to calling this
8825 // function. We assert with prejudice.
8826
8827 int32_t allocated_id = -1; // declare here so we can jump
8828 int32_t existing_id = -1;
8829 if (!uuid.is_zero()) {
8830 existing_id = osdmap.identify_osd(uuid);
8831 if (existing_id >= 0) {
11fdf7f2 8832 ceph_assert(id < 0 || id == existing_id);
31f18b77
FG
8833 *new_id = existing_id;
8834 goto out;
8835 } else if (id >= 0) {
8836 // uuid does not exist, and id has been provided, so just create
8837 // the new osd.id
8838 *new_id = id;
8839 goto out;
8840 }
8841 }
8842
8843 // allocate a new id
8844 allocated_id = _allocate_osd_id(&existing_id);
8845 dout(10) << __func__ << " allocated id " << allocated_id
8846 << " existing id " << existing_id << dendl;
8847 if (existing_id >= 0) {
11fdf7f2
TL
8848 ceph_assert(existing_id < osdmap.get_max_osd());
8849 ceph_assert(allocated_id < 0);
31f18b77
FG
8850 pending_inc.new_weight[existing_id] = CEPH_OSD_OUT;
8851 *new_id = existing_id;
31f18b77 8852 } else if (allocated_id >= 0) {
11fdf7f2 8853 ceph_assert(existing_id < 0);
31f18b77
FG
8854 // raise max_osd
8855 if (pending_inc.new_max_osd < 0) {
8856 pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
8857 } else {
8858 ++pending_inc.new_max_osd;
8859 }
8860 *new_id = pending_inc.new_max_osd - 1;
11fdf7f2 8861 ceph_assert(*new_id == allocated_id);
31f18b77 8862 } else {
11fdf7f2 8863 ceph_abort_msg("unexpected condition");
31f18b77
FG
8864 }
8865
8866out:
3a9019d9
FG
8867 if (device_class.size()) {
8868 CrushWrapper newcrush;
8869 _get_pending_crush(newcrush);
8870 if (newcrush.get_max_devices() < *new_id + 1) {
8871 newcrush.set_max_devices(*new_id + 1);
8872 }
8873 string name = string("osd.") + stringify(*new_id);
8874 if (!newcrush.item_exists(*new_id)) {
8875 newcrush.set_item_name(*new_id, name);
8876 }
8877 ostringstream ss;
8878 int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
8879 if (r < 0) {
8880 derr << __func__ << " failed to set " << name << " device_class "
8881 << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
8882 << dendl;
8883 // non-fatal... this might be a replay and we want to be idempotent.
8884 } else {
8885 dout(20) << __func__ << " set " << name << " device_class " << device_class
8886 << dendl;
8887 pending_inc.crush.clear();
8888 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8889 }
8890 } else {
8891 dout(20) << __func__ << " no device_class" << dendl;
8892 }
8893
31f18b77
FG
8894 dout(10) << __func__ << " using id " << *new_id << dendl;
8895 if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
8896 pending_inc.new_max_osd = *new_id + 1;
8897 }
8898
8899 pending_inc.new_state[*new_id] |= CEPH_OSD_EXISTS | CEPH_OSD_NEW;
8900 if (!uuid.is_zero())
8901 pending_inc.new_uuid[*new_id] = uuid;
8902}
8903
8904int OSDMonitor::validate_osd_create(
8905 const int32_t id,
8906 const uuid_d& uuid,
8907 const bool check_osd_exists,
8908 int32_t* existing_id,
8909 stringstream& ss)
8910{
8911
8912 dout(10) << __func__ << " id " << id << " uuid " << uuid
8913 << " check_osd_exists " << check_osd_exists << dendl;
8914
11fdf7f2 8915 ceph_assert(existing_id);
31f18b77
FG
8916
8917 if (id < 0 && uuid.is_zero()) {
8918 // we have nothing to validate
8919 *existing_id = -1;
8920 return 0;
8921 } else if (uuid.is_zero()) {
8922 // we have an id but we will ignore it - because that's what
8923 // `osd create` does.
8924 return 0;
8925 }
8926
8927 /*
8928 * This function will be used to validate whether we are able to
8929 * create a new osd when the `uuid` is specified.
8930 *
8931 * It will be used by both `osd create` and `osd new`, as the checks
8932 * are basically the same when it pertains to osd id and uuid validation.
8933 * However, `osd create` presumes an `uuid` is optional, for legacy
8934 * reasons, while `osd new` requires the `uuid` to be provided. This
8935 * means that `osd create` will not be idempotent if an `uuid` is not
8936 * provided, but we will always guarantee the idempotency of `osd new`.
8937 */
8938
11fdf7f2 8939 ceph_assert(!uuid.is_zero());
31f18b77
FG
8940 if (pending_inc.identify_osd(uuid) >= 0) {
8941 // osd is about to exist
8942 return -EAGAIN;
8943 }
8944
8945 int32_t i = osdmap.identify_osd(uuid);
8946 if (i >= 0) {
8947 // osd already exists
8948 if (id >= 0 && i != id) {
8949 ss << "uuid " << uuid << " already in use for different id " << i;
8950 return -EEXIST;
8951 }
8952 // return a positive errno to distinguish between a blocking error
8953 // and an error we consider to not be a problem (i.e., this would be
8954 // an idempotent operation).
8955 *existing_id = i;
8956 return EEXIST;
8957 }
8958 // i < 0
8959 if (id >= 0) {
8960 if (pending_inc.new_state.count(id)) {
8961 // osd is about to exist
8962 return -EAGAIN;
8963 }
8964 // we may not care if an osd exists if we are recreating a previously
8965 // destroyed osd.
8966 if (check_osd_exists && osdmap.exists(id)) {
8967 ss << "id " << id << " already in use and does not match uuid "
8968 << uuid;
8969 return -EINVAL;
8970 }
8971 }
8972 return 0;
8973}
8974
8975int OSDMonitor::prepare_command_osd_create(
8976 const int32_t id,
8977 const uuid_d& uuid,
8978 int32_t* existing_id,
8979 stringstream& ss)
8980{
8981 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
11fdf7f2 8982 ceph_assert(existing_id);
b5b8bbf5
FG
8983 if (osdmap.is_destroyed(id)) {
8984 ss << "ceph osd create has been deprecated. Please use ceph osd new "
8985 "instead.";
8986 return -EINVAL;
8987 }
31f18b77
FG
8988
8989 if (uuid.is_zero()) {
8990 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
8991 }
8992
8993 return validate_osd_create(id, uuid, true, existing_id, ss);
8994}
8995
8996int OSDMonitor::prepare_command_osd_new(
8997 MonOpRequestRef op,
11fdf7f2 8998 const cmdmap_t& cmdmap,
3a9019d9 8999 const map<string,string>& params,
31f18b77
FG
9000 stringstream &ss,
9001 Formatter *f)
9002{
9003 uuid_d uuid;
9004 string uuidstr;
9005 int64_t id = -1;
9006
11fdf7f2 9007 ceph_assert(paxos->is_plugged());
31f18b77
FG
9008
9009 dout(10) << __func__ << " " << op << dendl;
9010
9011 /* validate command. abort now if something's wrong. */
9012
9013 /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
9014 *
9015 * If `id` is not specified, we will identify any existing osd based
9016 * on `uuid`. Operation will be idempotent iff secrets match.
9017 *
9018 * If `id` is specified, we will identify any existing osd based on
9019 * `uuid` and match against `id`. If they match, operation will be
9020 * idempotent iff secrets match.
9021 *
9022 * `-i secrets.json` will be optional. If supplied, will be used
9023 * to check for idempotency when `id` and `uuid` match.
9024 *
9025 * If `id` is not specified, and `uuid` does not exist, an id will
9026 * be found or allocated for the osd.
9027 *
9028 * If `id` is specified, and the osd has been previously marked
9029 * as destroyed, then the `id` will be reused.
9030 */
9f95a23c 9031 if (!cmd_getval(cmdmap, "uuid", uuidstr)) {
31f18b77
FG
9032 ss << "requires the OSD's UUID to be specified.";
9033 return -EINVAL;
9034 } else if (!uuid.parse(uuidstr.c_str())) {
9035 ss << "invalid UUID value '" << uuidstr << "'.";
9036 return -EINVAL;
9037 }
9038
9f95a23c 9039 if (cmd_getval(cmdmap, "id", id) &&
31f18b77
FG
9040 (id < 0)) {
9041 ss << "invalid OSD id; must be greater or equal than zero.";
9042 return -EINVAL;
9043 }
9044
9045 // are we running an `osd create`-like command, or recreating
9046 // a previously destroyed osd?
9047
9048 bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));
9049
9050 // we will care about `id` to assess whether osd is `destroyed`, or
9051 // to create a new osd.
9052 // we will need an `id` by the time we reach auth.
9053
9054 int32_t existing_id = -1;
9055 int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
9056 &existing_id, ss);
9057
9058 bool may_be_idempotent = false;
9059 if (err == EEXIST) {
9060 // this is idempotent from the osdmon's point-of-view
9061 may_be_idempotent = true;
11fdf7f2 9062 ceph_assert(existing_id >= 0);
31f18b77
FG
9063 id = existing_id;
9064 } else if (err < 0) {
9065 return err;
9066 }
9067
9068 if (!may_be_idempotent) {
9069 // idempotency is out of the window. We are either creating a new
9070 // osd or recreating a destroyed osd.
9071 //
9072 // We now need to figure out if we have an `id` (and if it's valid),
9073 // of find an `id` if we don't have one.
9074
9075 // NOTE: we need to consider the case where the `id` is specified for
9076 // `osd create`, and we must honor it. So this means checking if
9077 // the `id` is destroyed, and if so assume the destroy; otherwise,
9078 // check if it `exists` - in which case we complain about not being
9079 // `destroyed`. In the end, if nothing fails, we must allow the
9080 // creation, so that we are compatible with `create`.
9081 if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
9082 dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
9083 ss << "OSD " << id << " has not yet been destroyed";
9084 return -EINVAL;
9085 } else if (id < 0) {
9086 // find an `id`
9087 id = _allocate_osd_id(&existing_id);
9088 if (id < 0) {
11fdf7f2 9089 ceph_assert(existing_id >= 0);
31f18b77
FG
9090 id = existing_id;
9091 }
9092 dout(10) << __func__ << " found id " << id << " to use" << dendl;
9093 } else if (id >= 0 && osdmap.is_destroyed(id)) {
9094 dout(10) << __func__ << " recreating osd." << id << dendl;
9095 } else {
9096 dout(10) << __func__ << " creating new osd." << id << dendl;
9097 }
9098 } else {
11fdf7f2
TL
9099 ceph_assert(id >= 0);
9100 ceph_assert(osdmap.exists(id));
31f18b77
FG
9101 }
9102
9103 // we are now able to either create a brand new osd or reuse an existing
9104 // osd that has been previously destroyed.
9105
9106 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
9107
3a9019d9 9108 if (may_be_idempotent && params.empty()) {
31f18b77 9109 // nothing to do, really.
3a9019d9 9110 dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
11fdf7f2 9111 ceph_assert(id >= 0);
31f18b77
FG
9112 if (f) {
9113 f->open_object_section("created_osd");
9114 f->dump_int("osdid", id);
9115 f->close_section();
9116 } else {
9117 ss << id;
9118 }
9119 return EEXIST;
9120 }
9121
3a9019d9
FG
9122 string device_class;
9123 auto p = params.find("crush_device_class");
9124 if (p != params.end()) {
9125 device_class = p->second;
9126 dout(20) << __func__ << " device_class will be " << device_class << dendl;
9127 }
31f18b77
FG
9128 string cephx_secret, lockbox_secret, dmcrypt_key;
9129 bool has_lockbox = false;
3a9019d9
FG
9130 bool has_secrets = params.count("cephx_secret")
9131 || params.count("cephx_lockbox_secret")
9132 || params.count("dmcrypt_key");
31f18b77
FG
9133
9134 ConfigKeyService *svc = nullptr;
9135 AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;
9136
9137 if (has_secrets) {
3a9019d9 9138 if (params.count("cephx_secret") == 0) {
31f18b77
FG
9139 ss << "requires a cephx secret.";
9140 return -EINVAL;
9141 }
3a9019d9 9142 cephx_secret = params.at("cephx_secret");
31f18b77 9143
3a9019d9
FG
9144 bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
9145 bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);
31f18b77
FG
9146
9147 dout(10) << __func__ << " has lockbox " << has_lockbox_secret
9148 << " dmcrypt " << has_dmcrypt_key << dendl;
9149
9150 if (has_lockbox_secret && has_dmcrypt_key) {
9151 has_lockbox = true;
3a9019d9
FG
9152 lockbox_secret = params.at("cephx_lockbox_secret");
9153 dmcrypt_key = params.at("dmcrypt_key");
31f18b77
FG
9154 } else if (!has_lockbox_secret != !has_dmcrypt_key) {
9155 ss << "requires both a cephx lockbox secret and a dm-crypt key.";
9156 return -EINVAL;
9157 }
9158
9159 dout(10) << __func__ << " validate secrets using osd id " << id << dendl;
9160
9161 err = mon->authmon()->validate_osd_new(id, uuid,
9162 cephx_secret,
9163 lockbox_secret,
9164 cephx_entity,
9165 lockbox_entity,
9166 ss);
9167 if (err < 0) {
9168 return err;
9169 } else if (may_be_idempotent && err != EEXIST) {
9170 // for this to be idempotent, `id` should already be >= 0; no need
9171 // to use validate_id.
11fdf7f2 9172 ceph_assert(id >= 0);
31f18b77
FG
9173 ss << "osd." << id << " exists but secrets do not match";
9174 return -EEXIST;
9175 }
9176
9177 if (has_lockbox) {
9178 svc = (ConfigKeyService*)mon->config_key_service;
9179 err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
9180 if (err < 0) {
9181 return err;
9182 } else if (may_be_idempotent && err != EEXIST) {
11fdf7f2 9183 ceph_assert(id >= 0);
31f18b77
FG
9184 ss << "osd." << id << " exists but dm-crypt key does not match.";
9185 return -EEXIST;
9186 }
9187 }
9188 }
11fdf7f2
TL
9189 ceph_assert(!has_secrets || !cephx_secret.empty());
9190 ceph_assert(!has_lockbox || !lockbox_secret.empty());
31f18b77
FG
9191
9192 if (may_be_idempotent) {
9193 // we have nothing to do for either the osdmon or the authmon,
9194 // and we have no lockbox - so the config key service will not be
9195 // touched. This is therefore an idempotent operation, and we can
9196 // just return right away.
9197 dout(10) << __func__ << " idempotent -- no op." << dendl;
11fdf7f2 9198 ceph_assert(id >= 0);
31f18b77
FG
9199 if (f) {
9200 f->open_object_section("created_osd");
9201 f->dump_int("osdid", id);
9202 f->close_section();
9203 } else {
9204 ss << id;
9205 }
9206 return EEXIST;
9207 }
11fdf7f2 9208 ceph_assert(!may_be_idempotent);
31f18b77
FG
9209
9210 // perform updates.
9211 if (has_secrets) {
11fdf7f2
TL
9212 ceph_assert(!cephx_secret.empty());
9213 ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
31f18b77
FG
9214 (!lockbox_secret.empty() && !dmcrypt_key.empty()));
9215
9216 err = mon->authmon()->do_osd_new(cephx_entity,
9217 lockbox_entity,
9218 has_lockbox);
11fdf7f2 9219 ceph_assert(0 == err);
31f18b77
FG
9220
9221 if (has_lockbox) {
11fdf7f2 9222 ceph_assert(nullptr != svc);
31f18b77
FG
9223 svc->do_osd_new(uuid, dmcrypt_key);
9224 }
9225 }
9226
9227 if (is_recreate_destroyed) {
11fdf7f2
TL
9228 ceph_assert(id >= 0);
9229 ceph_assert(osdmap.is_destroyed(id));
31f18b77 9230 pending_inc.new_weight[id] = CEPH_OSD_OUT;
11fdf7f2
TL
9231 pending_inc.new_state[id] |= CEPH_OSD_DESTROYED;
9232 if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
9233 pending_inc.new_state[id] |= CEPH_OSD_NEW;
9234 }
c07f9fc5
FG
9235 if (osdmap.get_state(id) & CEPH_OSD_UP) {
9236 // due to http://tracker.ceph.com/issues/20751 some clusters may
9237 // have UP set for non-existent OSDs; make sure it is cleared
9238 // for a newly created osd.
9239 pending_inc.new_state[id] |= CEPH_OSD_UP;
9240 }
31f18b77
FG
9241 pending_inc.new_uuid[id] = uuid;
9242 } else {
11fdf7f2 9243 ceph_assert(id >= 0);
31f18b77 9244 int32_t new_id = -1;
3a9019d9 9245 do_osd_create(id, uuid, device_class, &new_id);
11fdf7f2
TL
9246 ceph_assert(new_id >= 0);
9247 ceph_assert(id == new_id);
31f18b77
FG
9248 }
9249
9250 if (f) {
9251 f->open_object_section("created_osd");
9252 f->dump_int("osdid", id);
9253 f->close_section();
9254 } else {
9255 ss << id;
9256 }
9257
9258 return 0;
9259}
9260
7c673cae
FG
9261bool OSDMonitor::prepare_command(MonOpRequestRef op)
9262{
9263 op->mark_osdmon_event(__func__);
9f95a23c 9264 auto m = op->get_req<MMonCommand>();
7c673cae 9265 stringstream ss;
11fdf7f2 9266 cmdmap_t cmdmap;
7c673cae
FG
9267 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
9268 string rs = ss.str();
9269 mon->reply_command(op, -EINVAL, rs, get_last_committed());
9270 return true;
9271 }
9272
11fdf7f2 9273 MonSession *session = op->get_session();
7c673cae 9274 if (!session) {
11fdf7f2 9275 derr << __func__ << " no session" << dendl;
7c673cae
FG
9276 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
9277 return true;
9278 }
9279
9280 return prepare_command_impl(op, cmdmap);
9281}
9282
9283static int parse_reweights(CephContext *cct,
11fdf7f2 9284 const cmdmap_t& cmdmap,
7c673cae
FG
9285 const OSDMap& osdmap,
9286 map<int32_t, uint32_t>* weights)
9287{
9288 string weights_str;
9f95a23c 9289 if (!cmd_getval(cmdmap, "weights", weights_str)) {
7c673cae
FG
9290 return -EINVAL;
9291 }
9292 std::replace(begin(weights_str), end(weights_str), '\'', '"');
9293 json_spirit::mValue json_value;
9294 if (!json_spirit::read(weights_str, json_value)) {
9295 return -EINVAL;
9296 }
9297 if (json_value.type() != json_spirit::obj_type) {
9298 return -EINVAL;
9299 }
9300 const auto obj = json_value.get_obj();
9301 try {
9302 for (auto& osd_weight : obj) {
9303 auto osd_id = std::stoi(osd_weight.first);
9304 if (!osdmap.exists(osd_id)) {
9305 return -ENOENT;
9306 }
9307 if (osd_weight.second.type() != json_spirit::str_type) {
9308 return -EINVAL;
9309 }
9310 auto weight = std::stoul(osd_weight.second.get_str());
9311 weights->insert({osd_id, weight});
9312 }
9313 } catch (const std::logic_error& e) {
9314 return -EINVAL;
9315 }
9316 return 0;
9317}
9318
31f18b77
FG
/**
 * Mark osd.<id> as destroyed in the pending map and remove its cephx
 * entities and config-key entries.
 *
 * Must be called with paxos plugged: this function only stages changes
 * (pending_inc, auth, config-key) and deliberately does NOT propose —
 * the caller (e.g. 'osd purge') owns the proposal.
 *
 * @param id  id of the osd to destroy
 * @param ss  stream receiving a human-readable status message
 * @return 0 on success (including the already-destroyed idempotent case),
 *         -ENOENT if the osd does not exist in the osdmap, or a negative
 *         error from auth validation.
 */
int OSDMonitor::prepare_command_osd_destroy(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos->is_plugged());

  // we check if the osd exists for the benefit of `osd purge`, which may
  // have previously removed the osd. If the osd does not exist, return
  // -ENOENT to convey this, and let the caller deal with it.
  //
  // we presume that all auth secrets and config keys were removed prior
  // to this command being called. if they exist by now, we also assume
  // they must have been created by some other command and do not pertain
  // to this non-existent osd.
  if (!osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
    return -ENOENT;
  }

  uuid_d uuid = osdmap.get_uuid(id);
  dout(10) << __func__ << " destroying osd." << id
	   << " uuid " << uuid << dendl;

  // if it has been destroyed, we assume our work here is done.
  if (osdmap.is_destroyed(id)) {
    ss << "destroyed osd." << id;
    return 0;
  }

  EntityName cephx_entity, lockbox_entity;
  // idempotent_* track whether the auth / config-key cleanup already
  // happened (e.g. on a replayed command), so we skip the redundant work.
  bool idempotent_auth = false, idempotent_cks = false;

  int err = mon->authmon()->validate_osd_destroy(id, uuid,
                                                 cephx_entity,
                                                 lockbox_entity,
                                                 ss);
  if (err < 0) {
    if (err == -ENOENT) {
      // auth entities already gone: nothing to remove there.
      idempotent_auth = true;
    } else {
      return err;
    }
  }

  ConfigKeyService *svc = (ConfigKeyService*)mon->config_key_service;
  err = svc->validate_osd_destroy(id, uuid);
  if (err < 0) {
    // the only expected failure is "no such keys", which is fine.
    ceph_assert(err == -ENOENT);
    err = 0;
    idempotent_cks = true;
  }

  if (!idempotent_auth) {
    err = mon->authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
    ceph_assert(0 == err);
  }

  if (!idempotent_cks) {
    svc->do_osd_destroy(id, uuid);
  }

  // stage the osdmap change: flag the osd destroyed and clear its uuid so
  // the id can be reused by a future 'osd new'.
  pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
  pending_inc.new_uuid[id] = uuid_d();

  // we can only propose_pending() once per service, otherwise we'll be
  // defying PaxosService and all laws of nature. Therefore, as we may
  // be used during 'osd purge', let's keep the caller responsible for
  // proposing.
  ceph_assert(err == 0);
  return 0;
}
9390
/**
 * Remove every trace of osd.<id>: its crush item, its destroy-time state
 * (auth entities, config keys, osdmap destroyed flag), and finally the
 * osd itself from the osdmap.
 *
 * Must be called with paxos plugged and the osd down; staged changes are
 * accumulated in pending state and the caller is responsible for
 * proposing them.
 *
 * @param id  id of the osd to purge
 * @param ss  stream receiving a human-readable status message
 * @return 0 on success, -ENOENT if the osd was already fully purged
 *         (idempotent replay), or a negative error code on failure.
 */
int OSDMonitor::prepare_command_osd_purge(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos->is_plugged());
  dout(10) << __func__ << " purging osd." << id << dendl;

  ceph_assert(!osdmap.is_up(id));

  /*
   * This may look a bit weird, but this is what's going to happen:
   *
   *  1. we make sure that removing from crush works
   *  2. we call `prepare_command_osd_destroy()`. If it returns an
   *     error, then we abort the whole operation, as no updates
   *     have been made. However, this function will have
   *     side-effects, thus we need to make sure that all operations
   *     performed henceforth will *always* succeed.
   *  3. we call `prepare_command_osd_remove()`. Although this
   *     function can return an error, it currently only checks if the
   *     osd is up - and we have made sure that it is not so, so there
   *     is no conflict, and it is effectively an update.
   *  4. finally, we call `do_osd_crush_remove()`, which will perform
   *     the crush update we delayed from before.
   */

  CrushWrapper newcrush;
  _get_pending_crush(newcrush);

  // may_be_idempotent: true while every step so far was a no-op, i.e.
  // this looks like a replay of an already-completed purge.
  bool may_be_idempotent = false;

  // step 1: dry-run the crush removal against a copy; the real crush
  // update is applied last (step 4), once nothing can fail anymore.
  int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
  if (err == -ENOENT) {
    err = 0;
    may_be_idempotent = true;
  } else if (err < 0) {
    ss << "error removing osd." << id << " from crush";
    return err;
  }

  // no point destroying the osd again if it has already been marked destroyed
  if (!osdmap.is_destroyed(id)) {
    err = prepare_command_osd_destroy(id, ss);
    if (err < 0) {
      if (err == -ENOENT) {
	err = 0;
      } else {
	return err;
      }
    } else {
      may_be_idempotent = false;
    }
  }
  ceph_assert(0 == err);

  // every step was a no-op and the osd is gone: report -ENOENT so the
  // caller can treat this replayed purge as already done.
  if (may_be_idempotent && !osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist and "
             << "we are idempotent." << dendl;
    return -ENOENT;
  }

  err = prepare_command_osd_remove(id);
  // we should not be busy, as we should have made sure this id is not up.
  ceph_assert(0 == err);

  // step 4: apply the crush removal validated in step 1.
  do_osd_crush_remove(newcrush);
  return 0;
}
9459
7c673cae 9460bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
11fdf7f2 9461 const cmdmap_t& cmdmap)
7c673cae
FG
9462{
9463 op->mark_osdmon_event(__func__);
9f95a23c 9464 auto m = op->get_req<MMonCommand>();
7c673cae
FG
9465 bool ret = false;
9466 stringstream ss;
9467 string rs;
9468 bufferlist rdata;
9469 int err = 0;
9470
9471 string format;
9f95a23c 9472 cmd_getval(cmdmap, "format", format, string("plain"));
7c673cae
FG
9473 boost::scoped_ptr<Formatter> f(Formatter::create(format));
9474
9475 string prefix;
9f95a23c 9476 cmd_getval(cmdmap, "prefix", prefix);
7c673cae
FG
9477
9478 int64_t osdid;
11fdf7f2 9479 string osd_name;
b32b8144
FG
9480 bool osdid_present = false;
9481 if (prefix != "osd pg-temp" &&
9482 prefix != "osd pg-upmap" &&
9483 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
9f95a23c 9484 osdid_present = cmd_getval(cmdmap, "id", osdid);
b32b8144 9485 }
7c673cae
FG
9486 if (osdid_present) {
9487 ostringstream oss;
9488 oss << "osd." << osdid;
11fdf7f2 9489 osd_name = oss.str();
7c673cae
FG
9490 }
9491
9492 // Even if there's a pending state with changes that could affect
9493 // a command, considering that said state isn't yet committed, we
9494 // just don't care about those changes if the command currently being
9495 // handled acts as a no-op against the current committed state.
9496 // In a nutshell, we assume this command happens *before*.
9497 //
9498 // Let me make this clearer:
9499 //
9500 // - If we have only one client, and that client issues some
9501 // operation that would conflict with this operation but is
9502 // still on the pending state, then we would be sure that said
9503 // operation wouldn't have returned yet, so the client wouldn't
9504 // issue this operation (unless the client didn't wait for the
9505 // operation to finish, and that would be the client's own fault).
9506 //
9507 // - If we have more than one client, each client will observe
9508 // whatever is the state at the moment of the commit. So, if we
9509 // have two clients, one issuing an unlink and another issuing a
9510 // link, and if the link happens while the unlink is still on the
9511 // pending state, from the link's point-of-view this is a no-op.
9512 // If different clients are issuing conflicting operations and
9513 // they care about that, then the clients should make sure they
9514 // enforce some kind of concurrency mechanism -- from our
9515 // perspective that's what Douglas Adams would call an SEP.
9516 //
9517 // This should be used as a general guideline for most commands handled
9518 // in this function. Adapt as you see fit, but please bear in mind that
9519 // this is the expected behavior.
9520
9521
9522 if (prefix == "osd setcrushmap" ||
9523 (prefix == "osd crush set" && !osdid_present)) {
31f18b77
FG
9524 if (pending_inc.crush.length()) {
9525 dout(10) << __func__ << " waiting for pending crush update " << dendl;
9526 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9527 return true;
9528 }
7c673cae
FG
9529 dout(10) << "prepare_command setting new crush map" << dendl;
9530 bufferlist data(m->get_data());
9531 CrushWrapper crush;
9532 try {
11fdf7f2 9533 auto bl = data.cbegin();
7c673cae
FG
9534 crush.decode(bl);
9535 }
9536 catch (const std::exception &e) {
9537 err = -EINVAL;
9538 ss << "Failed to parse crushmap: " << e.what();
9539 goto reply;
9540 }
31f18b77
FG
9541
9542 int64_t prior_version = 0;
9f95a23c 9543 if (cmd_getval(cmdmap, "prior_version", prior_version)) {
31f18b77
FG
9544 if (prior_version == osdmap.get_crush_version() - 1) {
9545 // see if we are a resend of the last update. this is imperfect
9546 // (multiple racing updaters may not both get reliable success)
9547 // but we expect crush updaters (via this interface) to be rare-ish.
9548 bufferlist current, proposed;
9549 osdmap.crush->encode(current, mon->get_quorum_con_features());
9550 crush.encode(proposed, mon->get_quorum_con_features());
9551 if (current.contents_equal(proposed)) {
9552 dout(10) << __func__
9553 << " proposed matches current and version equals previous"
9554 << dendl;
9555 err = 0;
9556 ss << osdmap.get_crush_version();
9557 goto reply;
9558 }
9559 }
9560 if (prior_version != osdmap.get_crush_version()) {
9561 err = -EPERM;
9562 ss << "prior_version " << prior_version << " != crush version "
9563 << osdmap.get_crush_version();
9564 goto reply;
9565 }
9566 }
7c673cae 9567
3efd9988 9568 if (crush.has_legacy_rule_ids()) {
31f18b77
FG
9569 err = -EINVAL;
9570 ss << "crush maps with ruleset != ruleid are no longer allowed";
9571 goto reply;
9572 }
7c673cae
FG
9573 if (!validate_crush_against_features(&crush, ss)) {
9574 err = -EINVAL;
9575 goto reply;
9576 }
31f18b77 9577
3efd9988
FG
9578 err = osdmap.validate_crush_rules(&crush, &ss);
9579 if (err < 0) {
9580 goto reply;
7c673cae
FG
9581 }
9582
11fdf7f2 9583 if (g_conf()->mon_osd_crush_smoke_test) {
224ce89b
WB
9584 // sanity check: test some inputs to make sure this map isn't
9585 // totally broken
9586 dout(10) << " testing map" << dendl;
9587 stringstream ess;
9588 CrushTester tester(crush, ess);
b5b8bbf5 9589 tester.set_min_x(0);
224ce89b 9590 tester.set_max_x(50);
b5b8bbf5 9591 auto start = ceph::coarse_mono_clock::now();
11fdf7f2 9592 int r = tester.test_with_fork(g_conf()->mon_lease);
b5b8bbf5 9593 auto duration = ceph::coarse_mono_clock::now() - start;
224ce89b
WB
9594 if (r < 0) {
9595 dout(10) << " tester.test_with_fork returns " << r
9596 << ": " << ess.str() << dendl;
9597 ss << "crush smoke test failed with " << r << ": " << ess.str();
9598 err = r;
9599 goto reply;
9600 }
b5b8bbf5
FG
9601 dout(10) << __func__ << " crush somke test duration: "
9602 << duration << ", result: " << ess.str() << dendl;
7c673cae
FG
9603 }
9604
7c673cae 9605 pending_inc.crush = data;
31f18b77 9606 ss << osdmap.get_crush_version() + 1;
7c673cae
FG
9607 goto update;
9608
3efd9988
FG
9609 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
9610 CrushWrapper newcrush;
9611 _get_pending_crush(newcrush);
9612 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
9613 int bid = -1 - b;
9614 if (newcrush.bucket_exists(bid) &&
11fdf7f2 9615 newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
3efd9988
FG
9616 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
9617 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
9618 }
9619 }
9620 if (!validate_crush_against_features(&newcrush, ss)) {
9621 err = -EINVAL;
9622 goto reply;
9623 }
9624 pending_inc.crush.clear();
9625 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9626 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9627 get_last_committed() + 1));
9628 return true;
7c673cae 9629 } else if (prefix == "osd crush set-device-class") {
7c673cae 9630 string device_class;
9f95a23c 9631 if (!cmd_getval(cmdmap, "class", device_class)) {
7c673cae
FG
9632 err = -EINVAL; // no value!
9633 goto reply;
9634 }
9635
224ce89b
WB
9636 bool stop = false;
9637 vector<string> idvec;
9f95a23c 9638 cmd_getval(cmdmap, "ids", idvec);
7c673cae
FG
9639 CrushWrapper newcrush;
9640 _get_pending_crush(newcrush);
224ce89b
WB
9641 set<int> updated;
9642 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9643 set<int> osds;
9644 // wildcard?
9645 if (j == 0 &&
9646 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9647 osdmap.get_all_osds(osds);
9648 stop = true;
9649 } else {
9650 // try traditional single osd way
9651 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9652 if (osd < 0) {
9653 // ss has reason for failure
9654 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9655 err = -EINVAL;
9656 continue;
9657 }
9658 osds.insert(osd);
9659 }
7c673cae 9660
224ce89b
WB
9661 for (auto &osd : osds) {
9662 if (!osdmap.exists(osd)) {
9663 ss << "osd." << osd << " does not exist. ";
9664 continue;
9665 }
7c673cae 9666
224ce89b
WB
9667 ostringstream oss;
9668 oss << "osd." << osd;
9669 string name = oss.str();
7c673cae 9670
3a9019d9
FG
9671 if (newcrush.get_max_devices() < osd + 1) {
9672 newcrush.set_max_devices(osd + 1);
9673 }
224ce89b
WB
9674 string action;
9675 if (newcrush.item_exists(osd)) {
9676 action = "updating";
9677 } else {
9678 action = "creating";
9679 newcrush.set_item_name(osd, name);
9680 }
7c673cae 9681
224ce89b
WB
9682 dout(5) << action << " crush item id " << osd << " name '" << name
9683 << "' device_class '" << device_class << "'"
9684 << dendl;
9685 err = newcrush.update_device_class(osd, device_class, name, &ss);
9686 if (err < 0) {
9687 goto reply;
9688 }
9689 if (err == 0 && !_have_pending_crush()) {
9690 if (!stop) {
9691 // for single osd only, wildcard makes too much noise
9692 ss << "set-device-class item id " << osd << " name '" << name
11fdf7f2 9693 << "' device_class '" << device_class << "': no change. ";
224ce89b
WB
9694 }
9695 } else {
9696 updated.insert(osd);
9697 }
9698 }
7c673cae
FG
9699 }
9700
224ce89b
WB
9701 if (!updated.empty()) {
9702 pending_inc.crush.clear();
9703 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9704 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
9705 getline(ss, rs);
9706 wait_for_finished_proposal(op,
9707 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
9708 return true;
9709 }
7c673cae 9710
c07f9fc5
FG
9711 } else if (prefix == "osd crush rm-device-class") {
9712 bool stop = false;
9713 vector<string> idvec;
9f95a23c 9714 cmd_getval(cmdmap, "ids", idvec);
c07f9fc5
FG
9715 CrushWrapper newcrush;
9716 _get_pending_crush(newcrush);
9717 set<int> updated;
9718
9719 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9720 set<int> osds;
9721
9722 // wildcard?
9723 if (j == 0 &&
9724 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9725 osdmap.get_all_osds(osds);
9726 stop = true;
9727 } else {
9728 // try traditional single osd way
9729 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9730 if (osd < 0) {
9731 // ss has reason for failure
9732 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9733 err = -EINVAL;
9734 goto reply;
9735 }
9736 osds.insert(osd);
9737 }
9738
9739 for (auto &osd : osds) {
9740 if (!osdmap.exists(osd)) {
9741 ss << "osd." << osd << " does not exist. ";
9742 continue;
9743 }
9744
9745 auto class_name = newcrush.get_item_class(osd);
c07f9fc5
FG
9746 if (!class_name) {
9747 ss << "osd." << osd << " belongs to no class, ";
9748 continue;
9749 }
9750 // note that we do not verify if class_is_in_use here
9751 // in case the device is misclassified and user wants
9752 // to overridely reset...
9753
11fdf7f2 9754 err = newcrush.remove_device_class(cct, osd, &ss);
c07f9fc5
FG
9755 if (err < 0) {
9756 // ss has reason for failure
9757 goto reply;
9758 }
9759 updated.insert(osd);
9760 }
9761 }
9762
9763 if (!updated.empty()) {
9764 pending_inc.crush.clear();
9765 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9766 ss << "done removing class of osd(s): " << updated;
9767 getline(ss, rs);
9768 wait_for_finished_proposal(op,
9769 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
9770 return true;
9771 }
11fdf7f2
TL
9772 } else if (prefix == "osd crush class create") {
9773 string device_class;
9f95a23c 9774 if (!cmd_getval(cmdmap, "class", device_class)) {
11fdf7f2
TL
9775 err = -EINVAL; // no value!
9776 goto reply;
9777 }
9f95a23c 9778 if (osdmap.require_osd_release < ceph_release_t::luminous) {
11fdf7f2
TL
9779 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
9780 << "luminous' before using crush device classes";
9781 err = -EPERM;
9782 goto reply;
9783 }
9784 if (!_have_pending_crush() &&
9785 _get_stable_crush().class_exists(device_class)) {
9786 ss << "class '" << device_class << "' already exists";
9787 goto reply;
9788 }
9789 CrushWrapper newcrush;
9790 _get_pending_crush(newcrush);
9791 if (newcrush.class_exists(device_class)) {
9792 ss << "class '" << device_class << "' already exists";
9793 goto update;
9794 }
9795 int class_id = newcrush.get_or_create_class_id(device_class);
9796 pending_inc.crush.clear();
9797 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9798 ss << "created class " << device_class << " with id " << class_id
9799 << " to crush map";
9800 goto update;
9801 } else if (prefix == "osd crush class rm") {
9802 string device_class;
9f95a23c 9803 if (!cmd_getval(cmdmap, "class", device_class)) {
11fdf7f2
TL
9804 err = -EINVAL; // no value!
9805 goto reply;
9806 }
9f95a23c 9807 if (osdmap.require_osd_release < ceph_release_t::luminous) {
11fdf7f2
TL
9808 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
9809 << "luminous' before using crush device classes";
9810 err = -EPERM;
9811 goto reply;
9812 }
9813
9814 if (!osdmap.crush->class_exists(device_class)) {
9815 err = 0;
9816 goto reply;
9817 }
9818
9819 CrushWrapper newcrush;
9820 _get_pending_crush(newcrush);
9821 if (!newcrush.class_exists(device_class)) {
9822 err = 0; // make command idempotent
9823 goto wait;
9824 }
9825 int class_id = newcrush.get_class_id(device_class);
9826 stringstream ts;
9827 if (newcrush.class_is_in_use(class_id, &ts)) {
9828 err = -EBUSY;
9829 ss << "class '" << device_class << "' " << ts.str();
9830 goto reply;
9831 }
9832
9833 // check if class is used by any erasure-code-profiles
9834 mempool::osdmap::map<string,map<string,string>> old_ec_profiles =
9835 osdmap.get_erasure_code_profiles();
9836 auto ec_profiles = pending_inc.get_erasure_code_profiles();
9837#ifdef HAVE_STDLIB_MAP_SPLICING
9838 ec_profiles.merge(old_ec_profiles);
9839#else
9840 ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)),
9841 make_move_iterator(end(old_ec_profiles)));
9842#endif
9843 list<string> referenced_by;
9844 for (auto &i: ec_profiles) {
9845 for (auto &j: i.second) {
9846 if ("crush-device-class" == j.first && device_class == j.second) {
9847 referenced_by.push_back(i.first);
9848 }
9849 }
9850 }
9851 if (!referenced_by.empty()) {
9852 err = -EBUSY;
9853 ss << "class '" << device_class
9854 << "' is still referenced by erasure-code-profile(s): " << referenced_by;
9855 goto reply;
9856 }
9857
9858 set<int> osds;
9859 newcrush.get_devices_by_class(device_class, &osds);
9860 for (auto& p: osds) {
9861 err = newcrush.remove_device_class(g_ceph_context, p, &ss);
9862 if (err < 0) {
9863 // ss has reason for failure
9864 goto reply;
9865 }
9866 }
9867
9868 if (osds.empty()) {
9869 // empty class, remove directly
9870 err = newcrush.remove_class_name(device_class);
9871 if (err < 0) {
9872 ss << "class '" << device_class << "' cannot be removed '"
9873 << cpp_strerror(err) << "'";
9874 goto reply;
9875 }
9876 }
9877
9878 pending_inc.crush.clear();
9879 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9880 ss << "removed class " << device_class << " with id " << class_id
9881 << " from crush map";
9882 goto update;
35e4c445
FG
9883 } else if (prefix == "osd crush class rename") {
9884 string srcname, dstname;
9f95a23c 9885 if (!cmd_getval(cmdmap, "srcname", srcname)) {
35e4c445
FG
9886 err = -EINVAL;
9887 goto reply;
9888 }
9f95a23c 9889 if (!cmd_getval(cmdmap, "dstname", dstname)) {
35e4c445
FG
9890 err = -EINVAL;
9891 goto reply;
9892 }
9893
9894 CrushWrapper newcrush;
9895 _get_pending_crush(newcrush);
181888fb
FG
9896 if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
9897 // suppose this is a replay and return success
9898 // so command is idempotent
9899 ss << "already renamed to '" << dstname << "'";
9900 err = 0;
35e4c445
FG
9901 goto reply;
9902 }
c07f9fc5 9903
35e4c445
FG
9904 err = newcrush.rename_class(srcname, dstname);
9905 if (err < 0) {
9906 ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
9907 << cpp_strerror(err);
9908 goto reply;
9909 }
9910
9911 pending_inc.crush.clear();
9912 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9913 ss << "rename class '" << srcname << "' to '" << dstname << "'";
9914 goto update;
7c673cae
FG
9915 } else if (prefix == "osd crush add-bucket") {
9916 // os crush add-bucket <name> <type>
9917 string name, typestr;
11fdf7f2 9918 vector<string> argvec;
9f95a23c
TL
9919 cmd_getval(cmdmap, "name", name);
9920 cmd_getval(cmdmap, "type", typestr);
9921 cmd_getval(cmdmap, "args", argvec);
11fdf7f2
TL
9922 map<string,string> loc;
9923 if (!argvec.empty()) {
9924 CrushWrapper::parse_loc_map(argvec, &loc);
9925 dout(0) << "will create and move bucket '" << name
9926 << "' to location " << loc << dendl;
9927 }
7c673cae
FG
9928
9929 if (!_have_pending_crush() &&
9930 _get_stable_crush().name_exists(name)) {
9931 ss << "bucket '" << name << "' already exists";
9932 goto reply;
9933 }
9934
9935 CrushWrapper newcrush;
9936 _get_pending_crush(newcrush);
9937
9938 if (newcrush.name_exists(name)) {
9939 ss << "bucket '" << name << "' already exists";
9940 goto update;
9941 }
9942 int type = newcrush.get_type_id(typestr);
9943 if (type < 0) {
9944 ss << "type '" << typestr << "' does not exist";
9945 err = -EINVAL;
9946 goto reply;
9947 }
9948 if (type == 0) {
9949 ss << "type '" << typestr << "' is for devices, not buckets";
9950 err = -EINVAL;
9951 goto reply;
9952 }
9953 int bucketno;
9954 err = newcrush.add_bucket(0, 0,
9955 CRUSH_HASH_DEFAULT, type, 0, NULL,
9956 NULL, &bucketno);
9957 if (err < 0) {
9958 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
9959 goto reply;
9960 }
9961 err = newcrush.set_item_name(bucketno, name);
9962 if (err < 0) {
9963 ss << "error setting bucket name to '" << name << "'";
9964 goto reply;
9965 }
9966
11fdf7f2
TL
9967 if (!loc.empty()) {
9968 if (!newcrush.check_item_loc(cct, bucketno, loc,
9969 (int *)NULL)) {
9970 err = newcrush.move_bucket(cct, bucketno, loc);
9971 if (err < 0) {
9972 ss << "error moving bucket '" << name << "' to location " << loc;
9973 goto reply;
9974 }
9975 } else {
9976 ss << "no need to move item id " << bucketno << " name '" << name
9977 << "' to location " << loc << " in crush map";
9978 }
9979 }
9980
7c673cae
FG
9981 pending_inc.crush.clear();
9982 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
11fdf7f2
TL
9983 if (loc.empty()) {
9984 ss << "added bucket " << name << " type " << typestr
9985 << " to crush map";
9986 } else {
9987 ss << "added bucket " << name << " type " << typestr
9988 << " to location " << loc;
9989 }
7c673cae
FG
9990 goto update;
9991 } else if (prefix == "osd crush rename-bucket") {
9992 string srcname, dstname;
9f95a23c
TL
9993 cmd_getval(cmdmap, "srcname", srcname);
9994 cmd_getval(cmdmap, "dstname", dstname);
7c673cae
FG
9995
9996 err = crush_rename_bucket(srcname, dstname, &ss);
9997 if (err == -EALREADY) // equivalent to success for idempotency
9998 err = 0;
9999 if (err)
10000 goto reply;
10001 else
10002 goto update;
c07f9fc5
FG
10003 } else if (prefix == "osd crush weight-set create" ||
10004 prefix == "osd crush weight-set create-compat") {
10005 CrushWrapper newcrush;
10006 _get_pending_crush(newcrush);
10007 int64_t pool;
10008 int positions;
10009 if (newcrush.has_non_straw2_buckets()) {
10010 ss << "crush map contains one or more bucket(s) that are not straw2";
224ce89b
WB
10011 err = -EPERM;
10012 goto reply;
10013 }
c07f9fc5 10014 if (prefix == "osd crush weight-set create") {
9f95a23c
TL
10015 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
10016 osdmap.require_min_compat_client < ceph_release_t::luminous) {
c07f9fc5 10017 ss << "require_min_compat_client "
9f95a23c 10018 << osdmap.require_min_compat_client
c07f9fc5
FG
10019 << " < luminous, which is required for per-pool weight-sets. "
10020 << "Try 'ceph osd set-require-min-compat-client luminous' "
10021 << "before using the new interface";
10022 err = -EPERM;
10023 goto reply;
10024 }
10025 string poolname, mode;
9f95a23c 10026 cmd_getval(cmdmap, "pool", poolname);
c07f9fc5
FG
10027 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10028 if (pool < 0) {
10029 ss << "pool '" << poolname << "' not found";
10030 err = -ENOENT;
10031 goto reply;
10032 }
9f95a23c 10033 cmd_getval(cmdmap, "mode", mode);
c07f9fc5
FG
10034 if (mode != "flat" && mode != "positional") {
10035 ss << "unrecognized weight-set mode '" << mode << "'";
10036 err = -EINVAL;
10037 goto reply;
10038 }
10039 positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
10040 } else {
10041 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10042 positions = 1;
224ce89b 10043 }
11fdf7f2
TL
10044 if (!newcrush.create_choose_args(pool, positions)) {
10045 if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
10046 ss << "compat weight-set already created";
10047 } else {
10048 ss << "weight-set for pool '" << osdmap.get_pool_name(pool)
10049 << "' already created";
10050 }
10051 goto reply;
10052 }
c07f9fc5
FG
10053 pending_inc.crush.clear();
10054 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10055 goto update;
224ce89b 10056
c07f9fc5
FG
10057 } else if (prefix == "osd crush weight-set rm" ||
10058 prefix == "osd crush weight-set rm-compat") {
224ce89b
WB
10059 CrushWrapper newcrush;
10060 _get_pending_crush(newcrush);
c07f9fc5
FG
10061 int64_t pool;
10062 if (prefix == "osd crush weight-set rm") {
10063 string poolname;
9f95a23c 10064 cmd_getval(cmdmap, "pool", poolname);
c07f9fc5
FG
10065 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10066 if (pool < 0) {
10067 ss << "pool '" << poolname << "' not found";
10068 err = -ENOENT;
10069 goto reply;
10070 }
10071 } else {
10072 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
224ce89b 10073 }
c07f9fc5
FG
10074 newcrush.rm_choose_args(pool);
10075 pending_inc.crush.clear();
10076 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10077 goto update;
224ce89b 10078
c07f9fc5
FG
10079 } else if (prefix == "osd crush weight-set reweight" ||
10080 prefix == "osd crush weight-set reweight-compat") {
10081 string poolname, item;
10082 vector<double> weight;
9f95a23c
TL
10083 cmd_getval(cmdmap, "pool", poolname);
10084 cmd_getval(cmdmap, "item", item);
10085 cmd_getval(cmdmap, "weight", weight);
c07f9fc5
FG
10086 CrushWrapper newcrush;
10087 _get_pending_crush(newcrush);
10088 int64_t pool;
10089 if (prefix == "osd crush weight-set reweight") {
10090 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10091 if (pool < 0) {
10092 ss << "pool '" << poolname << "' not found";
10093 err = -ENOENT;
10094 goto reply;
10095 }
10096 if (!newcrush.have_choose_args(pool)) {
10097 ss << "no weight-set for pool '" << poolname << "'";
10098 err = -ENOENT;
10099 goto reply;
10100 }
10101 auto arg_map = newcrush.choose_args_get(pool);
10102 int positions = newcrush.get_choose_args_positions(arg_map);
10103 if (weight.size() != (size_t)positions) {
10104 ss << "must specify exact " << positions << " weight values";
10105 err = -EINVAL;
10106 goto reply;
10107 }
10108 } else {
10109 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10110 if (!newcrush.have_choose_args(pool)) {
10111 ss << "no backward-compatible weight-set";
10112 err = -ENOENT;
10113 goto reply;
10114 }
224ce89b 10115 }
c07f9fc5
FG
10116 if (!newcrush.name_exists(item)) {
10117 ss << "item '" << item << "' does not exist";
10118 err = -ENOENT;
224ce89b
WB
10119 goto reply;
10120 }
c07f9fc5 10121 err = newcrush.choose_args_adjust_item_weightf(
11fdf7f2 10122 cct,
c07f9fc5
FG
10123 newcrush.choose_args_get(pool),
10124 newcrush.get_item_id(item),
10125 weight,
10126 &ss);
224ce89b 10127 if (err < 0) {
224ce89b
WB
10128 goto reply;
10129 }
c07f9fc5 10130 err = 0;
224ce89b
WB
10131 pending_inc.crush.clear();
10132 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
224ce89b 10133 goto update;
7c673cae
FG
10134 } else if (osdid_present &&
10135 (prefix == "osd crush set" || prefix == "osd crush add")) {
10136 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
10137 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
10138 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
10139
10140 if (!osdmap.exists(osdid)) {
10141 err = -ENOENT;
11fdf7f2
TL
10142 ss << osd_name
10143 << " does not exist. Create it before updating the crush map";
7c673cae
FG
10144 goto reply;
10145 }
10146
10147 double weight;
9f95a23c 10148 if (!cmd_getval(cmdmap, "weight", weight)) {
7c673cae 10149 ss << "unable to parse weight value '"
11fdf7f2 10150 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
10151 err = -EINVAL;
10152 goto reply;
10153 }
10154
10155 string args;
10156 vector<string> argvec;
9f95a23c 10157 cmd_getval(cmdmap, "args", argvec);
7c673cae
FG
10158 map<string,string> loc;
10159 CrushWrapper::parse_loc_map(argvec, &loc);
10160
10161 if (prefix == "osd crush set"
10162 && !_get_stable_crush().item_exists(osdid)) {
10163 err = -ENOENT;
11fdf7f2 10164 ss << "unable to set item id " << osdid << " name '" << osd_name
7c673cae
FG
10165 << "' weight " << weight << " at location " << loc
10166 << ": does not exist";
10167 goto reply;
10168 }
10169
10170 dout(5) << "adding/updating crush item id " << osdid << " name '"
11fdf7f2 10171 << osd_name << "' weight " << weight << " at location "
7c673cae
FG
10172 << loc << dendl;
10173 CrushWrapper newcrush;
10174 _get_pending_crush(newcrush);
10175
10176 string action;
10177 if (prefix == "osd crush set" ||
11fdf7f2 10178 newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) {
7c673cae 10179 action = "set";
11fdf7f2 10180 err = newcrush.update_item(cct, osdid, weight, osd_name, loc);
7c673cae
FG
10181 } else {
10182 action = "add";
11fdf7f2 10183 err = newcrush.insert_item(cct, osdid, weight, osd_name, loc);
7c673cae
FG
10184 if (err == 0)
10185 err = 1;
10186 }
10187
10188 if (err < 0)
10189 goto reply;
10190
10191 if (err == 0 && !_have_pending_crush()) {
11fdf7f2
TL
10192 ss << action << " item id " << osdid << " name '" << osd_name
10193 << "' weight " << weight << " at location " << loc << ": no change";
7c673cae
FG
10194 goto reply;
10195 }
10196
10197 pending_inc.crush.clear();
10198 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
11fdf7f2
TL
10199 ss << action << " item id " << osdid << " name '" << osd_name << "' weight "
10200 << weight << " at location " << loc << " to crush map";
7c673cae
FG
10201 getline(ss, rs);
10202 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10203 get_last_committed() + 1));
10204 return true;
10205
10206 } else if (prefix == "osd crush create-or-move") {
10207 do {
10208 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
10209 if (!osdmap.exists(osdid)) {
10210 err = -ENOENT;
11fdf7f2
TL
10211 ss << osd_name
10212 << " does not exist. create it before updating the crush map";
7c673cae
FG
10213 goto reply;
10214 }
10215
10216 double weight;
9f95a23c 10217 if (!cmd_getval(cmdmap, "weight", weight)) {
7c673cae 10218 ss << "unable to parse weight value '"
11fdf7f2 10219 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
10220 err = -EINVAL;
10221 goto reply;
10222 }
10223
10224 string args;
10225 vector<string> argvec;
9f95a23c 10226 cmd_getval(cmdmap, "args", argvec);
7c673cae
FG
10227 map<string,string> loc;
10228 CrushWrapper::parse_loc_map(argvec, &loc);
10229
11fdf7f2
TL
10230 dout(0) << "create-or-move crush item name '" << osd_name
10231 << "' initial_weight " << weight << " at location " << loc
10232 << dendl;
7c673cae
FG
10233
10234 CrushWrapper newcrush;
10235 _get_pending_crush(newcrush);
10236
11fdf7f2
TL
10237 err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc,
10238 g_conf()->osd_crush_update_weight_set);
7c673cae 10239 if (err == 0) {
11fdf7f2
TL
10240 ss << "create-or-move updated item name '" << osd_name
10241 << "' weight " << weight
7c673cae
FG
10242 << " at location " << loc << " to crush map";
10243 break;
10244 }
10245 if (err > 0) {
10246 pending_inc.crush.clear();
10247 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
11fdf7f2
TL
10248 ss << "create-or-move updating item name '" << osd_name
10249 << "' weight " << weight
7c673cae
FG
10250 << " at location " << loc << " to crush map";
10251 getline(ss, rs);
10252 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10253 get_last_committed() + 1));
10254 return true;
10255 }
10256 } while (false);
10257
10258 } else if (prefix == "osd crush move") {
10259 do {
10260 // osd crush move <name> <loc1> [<loc2> ...]
11fdf7f2 10261 string name;
7c673cae 10262 vector<string> argvec;
9f95a23c
TL
10263 cmd_getval(cmdmap, "name", name);
10264 cmd_getval(cmdmap, "args", argvec);
7c673cae
FG
10265 map<string,string> loc;
10266 CrushWrapper::parse_loc_map(argvec, &loc);
10267
10268 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
10269 CrushWrapper newcrush;
10270 _get_pending_crush(newcrush);
10271
10272 if (!newcrush.name_exists(name)) {
10273 err = -ENOENT;
10274 ss << "item " << name << " does not exist";
10275 break;
10276 }
10277 int id = newcrush.get_item_id(name);
10278
11fdf7f2 10279 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
7c673cae 10280 if (id >= 0) {
11fdf7f2
TL
10281 err = newcrush.create_or_move_item(
10282 cct, id, 0, name, loc,
10283 g_conf()->osd_crush_update_weight_set);
7c673cae 10284 } else {
11fdf7f2 10285 err = newcrush.move_bucket(cct, id, loc);
7c673cae
FG
10286 }
10287 if (err >= 0) {
10288 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10289 pending_inc.crush.clear();
10290 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10291 getline(ss, rs);
10292 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10293 get_last_committed() + 1));
10294 return true;
10295 }
10296 } else {
10297 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10298 err = 0;
10299 }
10300 } while (false);
31f18b77 10301 } else if (prefix == "osd crush swap-bucket") {
11fdf7f2 10302 string source, dest;
9f95a23c
TL
10303 cmd_getval(cmdmap, "source", source);
10304 cmd_getval(cmdmap, "dest", dest);
11fdf7f2
TL
10305
10306 bool force = false;
9f95a23c 10307 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
11fdf7f2 10308
31f18b77
FG
10309 CrushWrapper newcrush;
10310 _get_pending_crush(newcrush);
10311 if (!newcrush.name_exists(source)) {
10312 ss << "source item " << source << " does not exist";
10313 err = -ENOENT;
10314 goto reply;
10315 }
10316 if (!newcrush.name_exists(dest)) {
10317 ss << "dest item " << dest << " does not exist";
10318 err = -ENOENT;
10319 goto reply;
10320 }
10321 int sid = newcrush.get_item_id(source);
10322 int did = newcrush.get_item_id(dest);
10323 int sparent;
11fdf7f2 10324 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) {
31f18b77
FG
10325 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
10326 err = -EPERM;
10327 goto reply;
10328 }
10329 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
11fdf7f2 10330 !force) {
31f18b77
FG
10331 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
10332 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
10333 << "; pass --yes-i-really-mean-it to proceed anyway";
10334 err = -EPERM;
10335 goto reply;
10336 }
11fdf7f2 10337 int r = newcrush.swap_bucket(cct, sid, did);
31f18b77
FG
10338 if (r < 0) {
10339 ss << "failed to swap bucket contents: " << cpp_strerror(r);
224ce89b 10340 err = r;
31f18b77
FG
10341 goto reply;
10342 }
10343 ss << "swapped bucket of " << source << " to " << dest;
10344 pending_inc.crush.clear();
10345 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10346 wait_for_finished_proposal(op,
10347 new Monitor::C_Command(mon, op, err, ss.str(),
10348 get_last_committed() + 1));
10349 return true;
10350 } else if (prefix == "osd crush link") {
10351 // osd crush link <name> <loc1> [<loc2> ...]
10352 string name;
9f95a23c 10353 cmd_getval(cmdmap, "name", name);
31f18b77 10354 vector<string> argvec;
9f95a23c 10355 cmd_getval(cmdmap, "args", argvec);
31f18b77
FG
10356 map<string,string> loc;
10357 CrushWrapper::parse_loc_map(argvec, &loc);
10358
10359 // Need an explicit check for name_exists because get_item_id returns
10360 // 0 on unfound.
10361 int id = osdmap.crush->get_item_id(name);
7c673cae
FG
10362 if (!osdmap.crush->name_exists(name)) {
10363 err = -ENOENT;
10364 ss << "item " << name << " does not exist";
10365 goto reply;
10366 } else {
10367 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
10368 }
11fdf7f2 10369 if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) {
7c673cae
FG
10370 ss << "no need to move item id " << id << " name '" << name
10371 << "' to location " << loc << " in crush map";
10372 err = 0;
10373 goto reply;
10374 }
10375
10376 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
10377 CrushWrapper newcrush;
10378 _get_pending_crush(newcrush);
10379
10380 if (!newcrush.name_exists(name)) {
10381 err = -ENOENT;
10382 ss << "item " << name << " does not exist";
10383 goto reply;
10384 } else {
10385 int id = newcrush.get_item_id(name);
11fdf7f2
TL
10386 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
10387 err = newcrush.link_bucket(cct, id, loc);
7c673cae
FG
10388 if (err >= 0) {
10389 ss << "linked item id " << id << " name '" << name
10390 << "' to location " << loc << " in crush map";
10391 pending_inc.crush.clear();
10392 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10393 } else {
10394 ss << "cannot link item id " << id << " name '" << name
10395 << "' to location " << loc;
10396 goto reply;
10397 }
10398 } else {
10399 ss << "no need to move item id " << id << " name '" << name
10400 << "' to location " << loc << " in crush map";
10401 err = 0;
10402 }
10403 }
10404 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
10405 get_last_committed() + 1));
10406 return true;
10407 } else if (prefix == "osd crush rm" ||
10408 prefix == "osd crush remove" ||
10409 prefix == "osd crush unlink") {
10410 do {
10411 // osd crush rm <id> [ancestor]
10412 CrushWrapper newcrush;
10413 _get_pending_crush(newcrush);
10414
10415 string name;
9f95a23c 10416 cmd_getval(cmdmap, "name", name);
7c673cae
FG
10417
10418 if (!osdmap.crush->name_exists(name)) {
10419 err = 0;
10420 ss << "device '" << name << "' does not appear in the crush map";
10421 break;
10422 }
10423 if (!newcrush.name_exists(name)) {
10424 err = 0;
10425 ss << "device '" << name << "' does not appear in the crush map";
10426 getline(ss, rs);
10427 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10428 get_last_committed() + 1));
10429 return true;
10430 }
10431 int id = newcrush.get_item_id(name);
31f18b77
FG
10432 int ancestor = 0;
10433
7c673cae
FG
10434 bool unlink_only = prefix == "osd crush unlink";
10435 string ancestor_str;
9f95a23c 10436 if (cmd_getval(cmdmap, "ancestor", ancestor_str)) {
7c673cae
FG
10437 if (!newcrush.name_exists(ancestor_str)) {
10438 err = -ENOENT;
10439 ss << "ancestor item '" << ancestor_str
10440 << "' does not appear in the crush map";
10441 break;
10442 }
31f18b77 10443 ancestor = newcrush.get_item_id(ancestor_str);
7c673cae 10444 }
31f18b77
FG
10445
10446 err = prepare_command_osd_crush_remove(
10447 newcrush,
10448 id, ancestor,
10449 (ancestor < 0), unlink_only);
10450
7c673cae
FG
10451 if (err == -ENOENT) {
10452 ss << "item " << id << " does not appear in that position";
10453 err = 0;
10454 break;
10455 }
10456 if (err == 0) {
81eedcae
TL
10457 if (!unlink_only)
10458 pending_inc.new_crush_node_flags[id] = 0;
7c673cae
FG
10459 ss << "removed item id " << id << " name '" << name << "' from crush map";
10460 getline(ss, rs);
10461 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10462 get_last_committed() + 1));
10463 return true;
10464 }
10465 } while (false);
10466
10467 } else if (prefix == "osd crush reweight-all") {
7c673cae
FG
10468 CrushWrapper newcrush;
10469 _get_pending_crush(newcrush);
10470
11fdf7f2 10471 newcrush.reweight(cct);
7c673cae
FG
10472 pending_inc.crush.clear();
10473 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10474 ss << "reweighted crush hierarchy";
10475 getline(ss, rs);
10476 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10477 get_last_committed() + 1));
10478 return true;
10479 } else if (prefix == "osd crush reweight") {
10480 // osd crush reweight <name> <weight>
10481 CrushWrapper newcrush;
10482 _get_pending_crush(newcrush);
10483
10484 string name;
9f95a23c 10485 cmd_getval(cmdmap, "name", name);
7c673cae
FG
10486 if (!newcrush.name_exists(name)) {
10487 err = -ENOENT;
10488 ss << "device '" << name << "' does not appear in the crush map";
10489 goto reply;
10490 }
10491
10492 int id = newcrush.get_item_id(name);
10493 if (id < 0) {
10494 ss << "device '" << name << "' is not a leaf in the crush map";
10495 err = -EINVAL;
10496 goto reply;
10497 }
10498 double w;
9f95a23c 10499 if (!cmd_getval(cmdmap, "weight", w)) {
7c673cae 10500 ss << "unable to parse weight value '"
11fdf7f2 10501 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
10502 err = -EINVAL;
10503 goto reply;
10504 }
10505
11fdf7f2
TL
10506 err = newcrush.adjust_item_weightf(cct, id, w,
10507 g_conf()->osd_crush_update_weight_set);
7c673cae
FG
10508 if (err < 0)
10509 goto reply;
10510 pending_inc.crush.clear();
10511 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10512 ss << "reweighted item id " << id << " name '" << name << "' to " << w
10513 << " in crush map";
10514 getline(ss, rs);
10515 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10516 get_last_committed() + 1));
10517 return true;
10518 } else if (prefix == "osd crush reweight-subtree") {
10519 // osd crush reweight <name> <weight>
10520 CrushWrapper newcrush;
10521 _get_pending_crush(newcrush);
10522
10523 string name;
9f95a23c 10524 cmd_getval(cmdmap, "name", name);
7c673cae
FG
10525 if (!newcrush.name_exists(name)) {
10526 err = -ENOENT;
10527 ss << "device '" << name << "' does not appear in the crush map";
10528 goto reply;
10529 }
10530
10531 int id = newcrush.get_item_id(name);
10532 if (id >= 0) {
10533 ss << "device '" << name << "' is not a subtree in the crush map";
10534 err = -EINVAL;
10535 goto reply;
10536 }
10537 double w;
9f95a23c 10538 if (!cmd_getval(cmdmap, "weight", w)) {
7c673cae 10539 ss << "unable to parse weight value '"
11fdf7f2 10540 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
10541 err = -EINVAL;
10542 goto reply;
10543 }
10544
11fdf7f2
TL
10545 err = newcrush.adjust_subtree_weightf(cct, id, w,
10546 g_conf()->osd_crush_update_weight_set);
7c673cae
FG
10547 if (err < 0)
10548 goto reply;
10549 pending_inc.crush.clear();
10550 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10551 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
10552 << " in crush map";
10553 getline(ss, rs);
10554 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10555 get_last_committed() + 1));
10556 return true;
10557 } else if (prefix == "osd crush tunables") {
10558 CrushWrapper newcrush;
10559 _get_pending_crush(newcrush);
10560
10561 err = 0;
10562 string profile;
9f95a23c 10563 cmd_getval(cmdmap, "profile", profile);
7c673cae
FG
10564 if (profile == "legacy" || profile == "argonaut") {
10565 newcrush.set_tunables_legacy();
10566 } else if (profile == "bobtail") {
10567 newcrush.set_tunables_bobtail();
10568 } else if (profile == "firefly") {
10569 newcrush.set_tunables_firefly();
10570 } else if (profile == "hammer") {
10571 newcrush.set_tunables_hammer();
10572 } else if (profile == "jewel") {
10573 newcrush.set_tunables_jewel();
10574 } else if (profile == "optimal") {
10575 newcrush.set_tunables_optimal();
10576 } else if (profile == "default") {
10577 newcrush.set_tunables_default();
10578 } else {
10579 ss << "unrecognized profile '" << profile << "'";
10580 err = -EINVAL;
10581 goto reply;
10582 }
10583
10584 if (!validate_crush_against_features(&newcrush, ss)) {
10585 err = -EINVAL;
10586 goto reply;
10587 }
10588
10589 pending_inc.crush.clear();
10590 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10591 ss << "adjusted tunables profile to " << profile;
10592 getline(ss, rs);
10593 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10594 get_last_committed() + 1));
10595 return true;
10596 } else if (prefix == "osd crush set-tunable") {
10597 CrushWrapper newcrush;
10598 _get_pending_crush(newcrush);
10599
10600 err = 0;
10601 string tunable;
9f95a23c 10602 cmd_getval(cmdmap, "tunable", tunable);
7c673cae
FG
10603
10604 int64_t value = -1;
9f95a23c 10605 if (!cmd_getval(cmdmap, "value", value)) {
7c673cae 10606 err = -EINVAL;
11fdf7f2
TL
10607 ss << "failed to parse integer value "
10608 << cmd_vartype_stringify(cmdmap.at("value"));
7c673cae
FG
10609 goto reply;
10610 }
10611
10612 if (tunable == "straw_calc_version") {
224ce89b 10613 if (value != 0 && value != 1) {
7c673cae
FG
10614 ss << "value must be 0 or 1; got " << value;
10615 err = -EINVAL;
10616 goto reply;
10617 }
10618 newcrush.set_straw_calc_version(value);
10619 } else {
10620 ss << "unrecognized tunable '" << tunable << "'";
10621 err = -EINVAL;
10622 goto reply;
10623 }
10624
10625 if (!validate_crush_against_features(&newcrush, ss)) {
10626 err = -EINVAL;
10627 goto reply;
10628 }
10629
10630 pending_inc.crush.clear();
10631 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10632 ss << "adjusted tunable " << tunable << " to " << value;
10633 getline(ss, rs);
10634 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10635 get_last_committed() + 1));
10636 return true;
10637
10638 } else if (prefix == "osd crush rule create-simple") {
10639 string name, root, type, mode;
9f95a23c
TL
10640 cmd_getval(cmdmap, "name", name);
10641 cmd_getval(cmdmap, "root", root);
10642 cmd_getval(cmdmap, "type", type);
10643 cmd_getval(cmdmap, "mode", mode);
7c673cae
FG
10644 if (mode == "")
10645 mode = "firstn";
10646
10647 if (osdmap.crush->rule_exists(name)) {
31f18b77
FG
10648 // The name is uniquely associated to a ruleid and the rule it contains
10649 // From the user point of view, the rule is more meaningfull.
10650 ss << "rule " << name << " already exists";
7c673cae
FG
10651 err = 0;
10652 goto reply;
10653 }
10654
10655 CrushWrapper newcrush;
10656 _get_pending_crush(newcrush);
10657
10658 if (newcrush.rule_exists(name)) {
31f18b77
FG
10659 // The name is uniquely associated to a ruleid and the rule it contains
10660 // From the user point of view, the rule is more meaningfull.
10661 ss << "rule " << name << " already exists";
7c673cae
FG
10662 err = 0;
10663 } else {
224ce89b 10664 int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
7c673cae
FG
10665 pg_pool_t::TYPE_REPLICATED, &ss);
10666 if (ruleno < 0) {
10667 err = ruleno;
10668 goto reply;
10669 }
10670
10671 pending_inc.crush.clear();
10672 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10673 }
10674 getline(ss, rs);
10675 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10676 get_last_committed() + 1));
10677 return true;
10678
224ce89b
WB
10679 } else if (prefix == "osd crush rule create-replicated") {
10680 string name, root, type, device_class;
9f95a23c
TL
10681 cmd_getval(cmdmap, "name", name);
10682 cmd_getval(cmdmap, "root", root);
10683 cmd_getval(cmdmap, "type", type);
10684 cmd_getval(cmdmap, "class", device_class);
224ce89b
WB
10685
10686 if (osdmap.crush->rule_exists(name)) {
10687 // The name is uniquely associated to a ruleid and the rule it contains
10688 // From the user point of view, the rule is more meaningfull.
10689 ss << "rule " << name << " already exists";
10690 err = 0;
10691 goto reply;
10692 }
10693
10694 CrushWrapper newcrush;
10695 _get_pending_crush(newcrush);
10696
10697 if (newcrush.rule_exists(name)) {
10698 // The name is uniquely associated to a ruleid and the rule it contains
10699 // From the user point of view, the rule is more meaningfull.
10700 ss << "rule " << name << " already exists";
10701 err = 0;
10702 } else {
10703 int ruleno = newcrush.add_simple_rule(
10704 name, root, type, device_class,
10705 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
10706 if (ruleno < 0) {
10707 err = ruleno;
10708 goto reply;
10709 }
10710
10711 pending_inc.crush.clear();
10712 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10713 }
10714 getline(ss, rs);
10715 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10716 get_last_committed() + 1));
10717 return true;
10718
7c673cae
FG
10719 } else if (prefix == "osd erasure-code-profile rm") {
10720 string name;
9f95a23c 10721 cmd_getval(cmdmap, "name", name);
7c673cae
FG
10722
10723 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
10724 goto wait;
10725
10726 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
10727 err = -EBUSY;
10728 goto reply;
10729 }
10730
10731 if (osdmap.has_erasure_code_profile(name) ||
10732 pending_inc.new_erasure_code_profiles.count(name)) {
10733 if (osdmap.has_erasure_code_profile(name)) {
10734 pending_inc.old_erasure_code_profiles.push_back(name);
10735 } else {
10736 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
10737 pending_inc.new_erasure_code_profiles.erase(name);
10738 }
10739
10740 getline(ss, rs);
10741 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10742 get_last_committed() + 1));
10743 return true;
10744 } else {
10745 ss << "erasure-code-profile " << name << " does not exist";
10746 err = 0;
10747 goto reply;
10748 }
10749
10750 } else if (prefix == "osd erasure-code-profile set") {
10751 string name;
9f95a23c 10752 cmd_getval(cmdmap, "name", name);
7c673cae 10753 vector<string> profile;
9f95a23c 10754 cmd_getval(cmdmap, "profile", profile);
11fdf7f2
TL
10755
10756 bool force = false;
9f95a23c 10757 cmd_getval(cmdmap, "force", force);
11fdf7f2 10758
7c673cae
FG
10759 map<string,string> profile_map;
10760 err = parse_erasure_code_profile(profile, &profile_map, &ss);
10761 if (err)
10762 goto reply;
10763 if (profile_map.find("plugin") == profile_map.end()) {
10764 ss << "erasure-code-profile " << profile_map
10765 << " must contain a plugin entry" << std::endl;
10766 err = -EINVAL;
10767 goto reply;
10768 }
10769 string plugin = profile_map["plugin"];
10770
10771 if (pending_inc.has_erasure_code_profile(name)) {
10772 dout(20) << "erasure code profile " << name << " try again" << dendl;
10773 goto wait;
10774 } else {
7c673cae
FG
10775 err = normalize_profile(name, profile_map, force, &ss);
10776 if (err)
10777 goto reply;
10778
10779 if (osdmap.has_erasure_code_profile(name)) {
10780 ErasureCodeProfile existing_profile_map =
10781 osdmap.get_erasure_code_profile(name);
10782 err = normalize_profile(name, existing_profile_map, force, &ss);
10783 if (err)
10784 goto reply;
10785
10786 if (existing_profile_map == profile_map) {
10787 err = 0;
10788 goto reply;
10789 }
10790 if (!force) {
10791 err = -EPERM;
10792 ss << "will not override erasure code profile " << name
10793 << " because the existing profile "
10794 << existing_profile_map
10795 << " is different from the proposed profile "
10796 << profile_map;
10797 goto reply;
10798 }
10799 }
10800
10801 dout(20) << "erasure code profile set " << name << "="
10802 << profile_map << dendl;
10803 pending_inc.set_erasure_code_profile(name, profile_map);
10804 }
10805
10806 getline(ss, rs);
10807 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10808 get_last_committed() + 1));
10809 return true;
10810
10811 } else if (prefix == "osd crush rule create-erasure") {
10812 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
10813 if (err == -EAGAIN)
10814 goto wait;
10815 if (err)
10816 goto reply;
10817 string name, poolstr;
9f95a23c 10818 cmd_getval(cmdmap, "name", name);
7c673cae 10819 string profile;
9f95a23c 10820 cmd_getval(cmdmap, "profile", profile);
7c673cae
FG
10821 if (profile == "")
10822 profile = "default";
10823 if (profile == "default") {
10824 if (!osdmap.has_erasure_code_profile(profile)) {
10825 if (pending_inc.has_erasure_code_profile(profile)) {
10826 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
10827 goto wait;
10828 }
10829
10830 map<string,string> profile_map;
11fdf7f2 10831 err = osdmap.get_erasure_code_profile_default(cct,
7c673cae
FG
10832 profile_map,
10833 &ss);
10834 if (err)
10835 goto reply;
10836 err = normalize_profile(name, profile_map, true, &ss);
10837 if (err)
10838 goto reply;
10839 dout(20) << "erasure code profile set " << profile << "="
10840 << profile_map << dendl;
10841 pending_inc.set_erasure_code_profile(profile, profile_map);
10842 goto wait;
10843 }
10844 }
10845
31f18b77
FG
10846 int rule;
10847 err = crush_rule_create_erasure(name, profile, &rule, &ss);
7c673cae
FG
10848 if (err < 0) {
10849 switch(err) {
10850 case -EEXIST: // return immediately
10851 ss << "rule " << name << " already exists";
10852 err = 0;
10853 goto reply;
10854 break;
10855 case -EALREADY: // wait for pending to be proposed
10856 ss << "rule " << name << " already exists";
10857 err = 0;
10858 break;
10859 default: // non recoverable error
10860 goto reply;
10861 break;
10862 }
10863 } else {
31f18b77 10864 ss << "created rule " << name << " at " << rule;
7c673cae
FG
10865 }
10866
10867 getline(ss, rs);
10868 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10869 get_last_committed() + 1));
10870 return true;
10871
10872 } else if (prefix == "osd crush rule rm") {
10873 string name;
9f95a23c 10874 cmd_getval(cmdmap, "name", name);
7c673cae
FG
10875
10876 if (!osdmap.crush->rule_exists(name)) {
10877 ss << "rule " << name << " does not exist";
10878 err = 0;
10879 goto reply;
10880 }
10881
10882 CrushWrapper newcrush;
10883 _get_pending_crush(newcrush);
10884
10885 if (!newcrush.rule_exists(name)) {
10886 ss << "rule " << name << " does not exist";
10887 err = 0;
10888 } else {
10889 int ruleno = newcrush.get_rule_id(name);
11fdf7f2 10890 ceph_assert(ruleno >= 0);
7c673cae
FG
10891
10892 // make sure it is not in use.
10893 // FIXME: this is ok in some situations, but let's not bother with that
10894 // complexity now.
10895 int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
3efd9988 10896 if (osdmap.crush_rule_in_use(ruleset)) {
7c673cae
FG
10897 ss << "crush ruleset " << name << " " << ruleset << " is in use";
10898 err = -EBUSY;
10899 goto reply;
10900 }
10901
10902 err = newcrush.remove_rule(ruleno);
10903 if (err < 0) {
10904 goto reply;
10905 }
10906
10907 pending_inc.crush.clear();
10908 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10909 }
10910 getline(ss, rs);
10911 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10912 get_last_committed() + 1));
10913 return true;
10914
b5b8bbf5
FG
10915 } else if (prefix == "osd crush rule rename") {
10916 string srcname;
10917 string dstname;
9f95a23c
TL
10918 cmd_getval(cmdmap, "srcname", srcname);
10919 cmd_getval(cmdmap, "dstname", dstname);
b5b8bbf5
FG
10920 if (srcname.empty() || dstname.empty()) {
10921 ss << "must specify both source rule name and destination rule name";
10922 err = -EINVAL;
10923 goto reply;
10924 }
10925 if (srcname == dstname) {
10926 ss << "destination rule name is equal to source rule name";
10927 err = 0;
10928 goto reply;
10929 }
10930
10931 CrushWrapper newcrush;
10932 _get_pending_crush(newcrush);
181888fb
FG
10933 if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
10934 // srcname does not exist and dstname already exists
10935 // suppose this is a replay and return success
10936 // (so this command is idempotent)
10937 ss << "already renamed to '" << dstname << "'";
10938 err = 0;
10939 goto reply;
10940 }
10941
b5b8bbf5
FG
10942 err = newcrush.rename_rule(srcname, dstname, &ss);
10943 if (err < 0) {
10944 // ss has reason for failure
10945 goto reply;
10946 }
10947 pending_inc.crush.clear();
10948 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10949 getline(ss, rs);
10950 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10951 get_last_committed() + 1));
10952 return true;
10953
7c673cae
FG
10954 } else if (prefix == "osd setmaxosd") {
10955 int64_t newmax;
9f95a23c 10956 if (!cmd_getval(cmdmap, "newmax", newmax)) {
7c673cae 10957 ss << "unable to parse 'newmax' value '"
11fdf7f2 10958 << cmd_vartype_stringify(cmdmap.at("newmax")) << "'";
7c673cae
FG
10959 err = -EINVAL;
10960 goto reply;
10961 }
10962
11fdf7f2 10963 if (newmax > g_conf()->mon_max_osd) {
7c673cae
FG
10964 err = -ERANGE;
10965 ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
11fdf7f2 10966 << g_conf()->mon_max_osd << ")";
7c673cae
FG
10967 goto reply;
10968 }
10969
10970 // Don't allow shrinking OSD number as this will cause data loss
10971 // and may cause kernel crashes.
10972 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
10973 if (newmax < osdmap.get_max_osd()) {
10974 // Check if the OSDs exist between current max and new value.
10975 // If there are any OSDs exist, then don't allow shrinking number
10976 // of OSDs.
10977 for (int i = newmax; i < osdmap.get_max_osd(); i++) {
10978 if (osdmap.exists(i)) {
10979 err = -EBUSY;
10980 ss << "cannot shrink max_osd to " << newmax
10981 << " because osd." << i << " (and possibly others) still in use";
10982 goto reply;
10983 }
10984 }
10985 }
10986
10987 pending_inc.new_max_osd = newmax;
10988 ss << "set new max_osd = " << pending_inc.new_max_osd;
10989 getline(ss, rs);
10990 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10991 get_last_committed() + 1));
10992 return true;
10993
10994 } else if (prefix == "osd set-full-ratio" ||
10995 prefix == "osd set-backfillfull-ratio" ||
10996 prefix == "osd set-nearfull-ratio") {
7c673cae 10997 double n;
9f95a23c 10998 if (!cmd_getval(cmdmap, "ratio", n)) {
7c673cae 10999 ss << "unable to parse 'ratio' value '"
11fdf7f2 11000 << cmd_vartype_stringify(cmdmap.at("ratio")) << "'";
7c673cae
FG
11001 err = -EINVAL;
11002 goto reply;
11003 }
11004 if (prefix == "osd set-full-ratio")
11005 pending_inc.new_full_ratio = n;
11006 else if (prefix == "osd set-backfillfull-ratio")
11007 pending_inc.new_backfillfull_ratio = n;
11008 else if (prefix == "osd set-nearfull-ratio")
11009 pending_inc.new_nearfull_ratio = n;
11010 ss << prefix << " " << n;
11011 getline(ss, rs);
11012 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11013 get_last_committed() + 1));
11014 return true;
11015 } else if (prefix == "osd set-require-min-compat-client") {
7c673cae 11016 string v;
9f95a23c
TL
11017 cmd_getval(cmdmap, "version", v);
11018 ceph_release_t vno = ceph_release_from_name(v);
11019 if (!vno) {
7c673cae
FG
11020 ss << "version " << v << " is not recognized";
11021 err = -EINVAL;
11022 goto reply;
11023 }
11024 OSDMap newmap;
11025 newmap.deepish_copy_from(osdmap);
11026 newmap.apply_incremental(pending_inc);
31f18b77
FG
11027 newmap.require_min_compat_client = vno;
11028 auto mvno = newmap.get_min_compat_client();
11029 if (vno < mvno) {
9f95a23c
TL
11030 ss << "osdmap current utilizes features that require " << mvno
11031 << "; cannot set require_min_compat_client below that to " << vno;
7c673cae
FG
11032 err = -EPERM;
11033 goto reply;
11034 }
11fdf7f2 11035 bool sure = false;
9f95a23c 11036 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11fdf7f2 11037 if (!sure) {
31f18b77
FG
11038 FeatureMap m;
11039 mon->get_combined_feature_map(&m);
9f95a23c 11040 uint64_t features = ceph_release_features(ceph::to_integer<int>(vno));
31f18b77
FG
11041 bool first = true;
11042 bool ok = true;
11043 for (int type : {
11044 CEPH_ENTITY_TYPE_CLIENT,
11045 CEPH_ENTITY_TYPE_MDS,
11046 CEPH_ENTITY_TYPE_MGR }) {
11047 auto p = m.m.find(type);
11048 if (p == m.m.end()) {
11049 continue;
11050 }
11051 for (auto& q : p->second) {
11052 uint64_t missing = ~q.first & features;
11053 if (missing) {
11054 if (first) {
11055 ss << "cannot set require_min_compat_client to " << v << ": ";
11056 } else {
11057 ss << "; ";
11058 }
11059 first = false;
11060 ss << q.second << " connected " << ceph_entity_type_name(type)
11061 << "(s) look like " << ceph_release_name(
11062 ceph_release_from_features(q.first))
11063 << " (missing 0x" << std::hex << missing << std::dec << ")";
11064 ok = false;
11065 }
11066 }
11067 }
11068 if (!ok) {
11069 ss << "; add --yes-i-really-mean-it to do it anyway";
11070 err = -EPERM;
11071 goto reply;
11072 }
11073 }
9f95a23c 11074 ss << "set require_min_compat_client to " << vno;
31f18b77 11075 pending_inc.new_require_min_compat_client = vno;
7c673cae
FG
11076 getline(ss, rs);
11077 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11078 get_last_committed() + 1));
11079 return true;
11080 } else if (prefix == "osd pause") {
11081 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11082
11083 } else if (prefix == "osd unpause") {
11084 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11085
11086 } else if (prefix == "osd set") {
11fdf7f2 11087 bool sure = false;
9f95a23c 11088 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11fdf7f2 11089
7c673cae 11090 string key;
9f95a23c
TL
11091 cmd_getval(cmdmap, "key", key);
11092 if (key == "pause")
7c673cae
FG
11093 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11094 else if (key == "noup")
11095 return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
11096 else if (key == "nodown")
11097 return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
11098 else if (key == "noout")
11099 return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
11100 else if (key == "noin")
11101 return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
11102 else if (key == "nobackfill")
11103 return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
11104 else if (key == "norebalance")
11105 return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
11106 else if (key == "norecover")
11107 return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
11108 else if (key == "noscrub")
11109 return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
11110 else if (key == "nodeep-scrub")
11111 return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11112 else if (key == "notieragent")
11113 return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11fdf7f2
TL
11114 else if (key == "nosnaptrim")
11115 return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
11116 else if (key == "pglog_hardlimit") {
11117 if (!osdmap.get_num_up_osds() && !sure) {
f64942e4
AA
11118 ss << "Not advisable to continue since no OSDs are up. Pass "
11119 << "--yes-i-really-mean-it if you really wish to continue.";
11120 err = -EPERM;
11121 goto reply;
11122 }
11123 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
11124 // we are reusing a jewel feature bit that was retired in luminous.
9f95a23c 11125 if (osdmap.require_osd_release >= ceph_release_t::luminous &&
f64942e4 11126 (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
11fdf7f2 11127 || sure)) {
f64942e4
AA
11128 return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
11129 } else {
11130 ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
11131 err = -EPERM;
11132 goto reply;
11133 }
7c673cae
FG
11134 } else {
11135 ss << "unrecognized flag '" << key << "'";
11136 err = -EINVAL;
11137 }
11138
11139 } else if (prefix == "osd unset") {
11140 string key;
9f95a23c
TL
11141 cmd_getval(cmdmap, "key", key);
11142 if (key == "pause")
7c673cae
FG
11143 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11144 else if (key == "noup")
11145 return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
11146 else if (key == "nodown")
11147 return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
11148 else if (key == "noout")
11149 return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
11150 else if (key == "noin")
11151 return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
11152 else if (key == "nobackfill")
11153 return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
11154 else if (key == "norebalance")
11155 return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
11156 else if (key == "norecover")
11157 return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
11158 else if (key == "noscrub")
11159 return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
11160 else if (key == "nodeep-scrub")
11161 return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11162 else if (key == "notieragent")
11163 return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11fdf7f2
TL
11164 else if (key == "nosnaptrim")
11165 return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
224ce89b 11166 else {
7c673cae
FG
11167 ss << "unrecognized flag '" << key << "'";
11168 err = -EINVAL;
11169 }
11170
31f18b77
FG
11171 } else if (prefix == "osd require-osd-release") {
11172 string release;
9f95a23c 11173 cmd_getval(cmdmap, "release", release);
11fdf7f2 11174 bool sure = false;
9f95a23c
TL
11175 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11176 ceph_release_t rel = ceph_release_from_name(release.c_str());
11177 if (!rel) {
31f18b77
FG
11178 ss << "unrecognized release " << release;
11179 err = -EINVAL;
11180 goto reply;
11181 }
d2e6a577
FG
11182 if (rel == osdmap.require_osd_release) {
11183 // idempotent
11184 err = 0;
11185 goto reply;
11186 }
9f95a23c 11187 ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);
11fdf7f2
TL
11188 if (!osdmap.get_num_up_osds() && !sure) {
11189 ss << "Not advisable to continue since no OSDs are up. Pass "
11190 << "--yes-i-really-mean-it if you really wish to continue.";
11191 err = -EPERM;
11192 goto reply;
11193 }
9f95a23c 11194 if (rel == ceph_release_t::mimic) {
11fdf7f2
TL
11195 if (!mon->monmap->get_required_features().contains_all(
11196 ceph::features::mon::FEATURE_MIMIC)) {
11197 ss << "not all mons are mimic";
11198 err = -EPERM;
11199 goto reply;
11200 }
11201 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_MIMIC))
11202 && !sure) {
11203 ss << "not all up OSDs have CEPH_FEATURE_SERVER_MIMIC feature";
11204 err = -EPERM;
11205 goto reply;
11206 }
9f95a23c 11207 } else if (rel == ceph_release_t::nautilus) {
11fdf7f2
TL
11208 if (!mon->monmap->get_required_features().contains_all(
11209 ceph::features::mon::FEATURE_NAUTILUS)) {
11210 ss << "not all mons are nautilus";
11211 err = -EPERM;
11212 goto reply;
11213 }
11214 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_NAUTILUS))
11215 && !sure) {
11216 ss << "not all up OSDs have CEPH_FEATURE_SERVER_NAUTILUS feature";
31f18b77
FG
11217 err = -EPERM;
11218 goto reply;
11219 }
9f95a23c
TL
11220 } else if (rel == ceph_release_t::octopus) {
11221 if (!mon->monmap->get_required_features().contains_all(
11222 ceph::features::mon::FEATURE_OCTOPUS)) {
11223 ss << "not all mons are octopus";
11224 err = -EPERM;
11225 goto reply;
11226 }
11227 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_OCTOPUS))
11228 && !sure) {
11229 ss << "not all up OSDs have CEPH_FEATURE_SERVER_OCTOPUS feature";
11230 err = -EPERM;
11231 goto reply;
11232 }
31f18b77
FG
11233 } else {
11234 ss << "not supported for this release yet";
11235 err = -EPERM;
11236 goto reply;
11237 }
11238 if (rel < osdmap.require_osd_release) {
11239 ss << "require_osd_release cannot be lowered once it has been set";
11240 err = -EPERM;
11241 goto reply;
11242 }
11243 pending_inc.new_require_osd_release = rel;
11244 goto update;
7c673cae 11245 } else if (prefix == "osd down" ||
9f95a23c
TL
11246 prefix == "osd out" ||
11247 prefix == "osd in" ||
11248 prefix == "osd rm" ||
11249 prefix == "osd stop") {
7c673cae
FG
11250
11251 bool any = false;
31f18b77
FG
11252 bool stop = false;
11253 bool verbose = true;
9f95a23c 11254 bool definitely_dead = false;
7c673cae
FG
11255
11256 vector<string> idvec;
9f95a23c
TL
11257 cmd_getval(cmdmap, "ids", idvec);
11258 cmd_getval(cmdmap, "definitely_dead", definitely_dead);
11259 derr << "definitely_dead " << (int)definitely_dead << dendl;
31f18b77
FG
11260 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
11261 set<int> osds;
11262
11263 // wildcard?
11264 if (j == 0 &&
11265 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
11266 if (prefix == "osd in") {
11267 // touch out osds only
81eedcae 11268 osdmap.get_out_existing_osds(osds);
31f18b77
FG
11269 } else {
11270 osdmap.get_all_osds(osds);
11271 }
11272 stop = true;
11273 verbose = false; // so the output is less noisy.
11274 } else {
11275 long osd = parse_osd_id(idvec[j].c_str(), &ss);
11276 if (osd < 0) {
11277 ss << "invalid osd id" << osd;
11278 err = -EINVAL;
11279 continue;
11280 } else if (!osdmap.exists(osd)) {
11281 ss << "osd." << osd << " does not exist. ";
11282 continue;
11283 }
11284
11285 osds.insert(osd);
7c673cae 11286 }
31f18b77
FG
11287
11288 for (auto &osd : osds) {
11289 if (prefix == "osd down") {
11290 if (osdmap.is_down(osd)) {
11291 if (verbose)
11292 ss << "osd." << osd << " is already down. ";
11293 } else {
11294 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
11295 ss << "marked down osd." << osd << ". ";
11296 any = true;
11297 }
9f95a23c
TL
11298 if (definitely_dead) {
11299 if (!pending_inc.new_xinfo.count(osd)) {
11300 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11301 }
11302 if (pending_inc.new_xinfo[osd].dead_epoch < pending_inc.epoch) {
11303 any = true;
11304 }
11305 pending_inc.new_xinfo[osd].dead_epoch = pending_inc.epoch;
11306 }
31f18b77
FG
11307 } else if (prefix == "osd out") {
11308 if (osdmap.is_out(osd)) {
11309 if (verbose)
11310 ss << "osd." << osd << " is already out. ";
11311 } else {
11312 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
11313 if (osdmap.osd_weight[osd]) {
11314 if (pending_inc.new_xinfo.count(osd) == 0) {
11315 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11316 }
11317 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
7c673cae 11318 }
31f18b77 11319 ss << "marked out osd." << osd << ". ";
224ce89b
WB
11320 std::ostringstream msg;
11321 msg << "Client " << op->get_session()->entity_name
11322 << " marked osd." << osd << " out";
11323 if (osdmap.is_up(osd)) {
11324 msg << ", while it was still marked up";
11325 } else {
3efd9988
FG
11326 auto period = ceph_clock_now() - down_pending_out[osd];
11327 msg << ", after it was down for " << int(period.sec())
224ce89b
WB
11328 << " seconds";
11329 }
11330
11331 mon->clog->info() << msg.str();
31f18b77 11332 any = true;
7c673cae 11333 }
31f18b77
FG
11334 } else if (prefix == "osd in") {
11335 if (osdmap.is_in(osd)) {
11336 if (verbose)
11337 ss << "osd." << osd << " is already in. ";
11338 } else {
11339 if (osdmap.osd_xinfo[osd].old_weight > 0) {
11340 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
11341 if (pending_inc.new_xinfo.count(osd) == 0) {
11342 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11343 }
11344 pending_inc.new_xinfo[osd].old_weight = 0;
11345 } else {
11346 pending_inc.new_weight[osd] = CEPH_OSD_IN;
7c673cae 11347 }
31f18b77
FG
11348 ss << "marked in osd." << osd << ". ";
11349 any = true;
11350 }
11351 } else if (prefix == "osd rm") {
11352 err = prepare_command_osd_remove(osd);
11353
11354 if (err == -EBUSY) {
11355 if (any)
11356 ss << ", ";
11357 ss << "osd." << osd << " is still up; must be down before removal. ";
7c673cae 11358 } else {
11fdf7f2 11359 ceph_assert(err == 0);
31f18b77
FG
11360 if (any) {
11361 ss << ", osd." << osd;
11362 } else {
11363 ss << "removed osd." << osd;
11364 }
11365 any = true;
7c673cae 11366 }
9f95a23c
TL
11367 } else if (prefix == "osd stop") {
11368 if (osdmap.is_stop(osd)) {
11369 if (verbose)
11370 ss << "osd." << osd << " is already stopped. ";
11371 } else if (osdmap.is_down(osd)) {
11372 pending_inc.pending_osd_state_set(osd, CEPH_OSD_STOP);
11373 ss << "stop down osd." << osd << ". ";
11374 any = true;
11375 } else {
11376 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP | CEPH_OSD_STOP);
11377 ss << "stop osd." << osd << ". ";
11378 any = true;
11379 }
31f18b77
FG
11380 }
11381 }
11382 }
11383 if (any) {
11384 getline(ss, rs);
11385 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11386 get_last_committed() + 1));
11387 return true;
11388 }
81eedcae
TL
11389 } else if (prefix == "osd set-group" ||
11390 prefix == "osd unset-group" ||
11391 prefix == "osd add-noup" ||
31f18b77
FG
11392 prefix == "osd add-nodown" ||
11393 prefix == "osd add-noin" ||
81eedcae
TL
11394 prefix == "osd add-noout" ||
11395 prefix == "osd rm-noup" ||
11396 prefix == "osd rm-nodown" ||
11397 prefix == "osd rm-noin" ||
11398 prefix == "osd rm-noout") {
11399 bool do_set = prefix == "osd set-group" ||
11400 prefix.find("add") != string::npos;
11401 string flag_str;
11402 unsigned flags = 0;
11403 vector<string> who;
11404 if (prefix == "osd set-group" || prefix == "osd unset-group") {
9f95a23c
TL
11405 cmd_getval(cmdmap, "flags", flag_str);
11406 cmd_getval(cmdmap, "who", who);
81eedcae
TL
11407 vector<string> raw_flags;
11408 boost::split(raw_flags, flag_str, boost::is_any_of(","));
11409 for (auto& f : raw_flags) {
11410 if (f == "noup")
11411 flags |= CEPH_OSD_NOUP;
11412 else if (f == "nodown")
11413 flags |= CEPH_OSD_NODOWN;
11414 else if (f == "noin")
11415 flags |= CEPH_OSD_NOIN;
11416 else if (f == "noout")
11417 flags |= CEPH_OSD_NOOUT;
11418 else {
11419 ss << "unrecognized flag '" << f << "', must be one of "
11420 << "{noup,nodown,noin,noout}";
11421 err = -EINVAL;
11422 goto reply;
11423 }
11424 }
31f18b77 11425 } else {
9f95a23c 11426 cmd_getval(cmdmap, "ids", who);
81eedcae
TL
11427 if (prefix.find("noup") != string::npos)
11428 flags = CEPH_OSD_NOUP;
11429 else if (prefix.find("nodown") != string::npos)
11430 flags = CEPH_OSD_NODOWN;
11431 else if (prefix.find("noin") != string::npos)
11432 flags = CEPH_OSD_NOIN;
11433 else if (prefix.find("noout") != string::npos)
11434 flags = CEPH_OSD_NOOUT;
11435 else
11436 ceph_assert(0 == "Unreachable!");
31f18b77 11437 }
81eedcae
TL
11438 if (flags == 0) {
11439 ss << "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11440 err = -EINVAL;
11441 goto reply;
11442 }
11443 if (who.empty()) {
11444 ss << "must specify at least one or more targets to set/unset";
11445 err = -EINVAL;
11446 goto reply;
11447 }
11448 set<int> osds;
11449 set<int> crush_nodes;
11450 set<int> device_classes;
11451 for (auto& w : who) {
11452 if (w == "any" || w == "all" || w == "*") {
31f18b77 11453 osdmap.get_all_osds(osds);
81eedcae 11454 break;
31f18b77 11455 }
81eedcae
TL
11456 std::stringstream ts;
11457 if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) {
11458 osds.insert(osd);
11459 } else if (osdmap.crush->name_exists(w)) {
11460 crush_nodes.insert(osdmap.crush->get_item_id(w));
11461 } else if (osdmap.crush->class_exists(w)) {
11462 device_classes.insert(osdmap.crush->get_class_id(w));
11463 } else {
11464 ss << "unable to parse osd id or crush node or device class: "
11465 << "\"" << w << "\". ";
7c673cae
FG
11466 }
11467 }
81eedcae
TL
11468 if (osds.empty() && crush_nodes.empty() && device_classes.empty()) {
11469 // ss has reason for failure
11470 err = -EINVAL;
11471 goto reply;
31f18b77 11472 }
31f18b77 11473 bool any = false;
81eedcae
TL
11474 for (auto osd : osds) {
11475 if (!osdmap.exists(osd)) {
11476 ss << "osd." << osd << " does not exist. ";
11477 continue;
11478 }
11479 if (do_set) {
11480 if (flags & CEPH_OSD_NOUP) {
11481 any |= osdmap.is_noup_by_osd(osd) ?
11482 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) :
11483 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
31f18b77 11484 }
81eedcae
TL
11485 if (flags & CEPH_OSD_NODOWN) {
11486 any |= osdmap.is_nodown_by_osd(osd) ?
11487 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) :
11488 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
11489 }
11490 if (flags & CEPH_OSD_NOIN) {
11491 any |= osdmap.is_noin_by_osd(osd) ?
11492 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) :
11493 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
11494 }
11495 if (flags & CEPH_OSD_NOOUT) {
11496 any |= osdmap.is_noout_by_osd(osd) ?
11497 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) :
11498 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
31f18b77 11499 }
31f18b77 11500 } else {
81eedcae
TL
11501 if (flags & CEPH_OSD_NOUP) {
11502 any |= osdmap.is_noup_by_osd(osd) ?
11503 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) :
11504 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP);
31f18b77 11505 }
81eedcae
TL
11506 if (flags & CEPH_OSD_NODOWN) {
11507 any |= osdmap.is_nodown_by_osd(osd) ?
11508 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) :
11509 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
31f18b77 11510 }
81eedcae
TL
11511 if (flags & CEPH_OSD_NOIN) {
11512 any |= osdmap.is_noin_by_osd(osd) ?
11513 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) :
11514 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN);
11515 }
11516 if (flags & CEPH_OSD_NOOUT) {
11517 any |= osdmap.is_noout_by_osd(osd) ?
11518 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) :
11519 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
31f18b77
FG
11520 }
11521 }
11522 }
81eedcae
TL
11523 for (auto& id : crush_nodes) {
11524 auto old_flags = osdmap.get_crush_node_flags(id);
11525 auto& pending_flags = pending_inc.new_crush_node_flags[id];
11526 pending_flags |= old_flags; // adopt existing flags first!
11527 if (do_set) {
11528 pending_flags |= flags;
11529 } else {
11530 pending_flags &= ~flags;
11531 }
11532 any = true;
11533 }
11534 for (auto& id : device_classes) {
11535 auto old_flags = osdmap.get_device_class_flags(id);
11536 auto& pending_flags = pending_inc.new_device_class_flags[id];
11537 pending_flags |= old_flags;
11538 if (do_set) {
11539 pending_flags |= flags;
11540 } else {
11541 pending_flags &= ~flags;
11542 }
11543 any = true;
11544 }
31f18b77
FG
11545 if (any) {
11546 getline(ss, rs);
11547 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11548 get_last_committed() + 1));
7c673cae
FG
11549 return true;
11550 }
11551 } else if (prefix == "osd pg-temp") {
11552 string pgidstr;
9f95a23c 11553 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
7c673cae 11554 ss << "unable to parse 'pgid' value '"
11fdf7f2 11555 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
7c673cae
FG
11556 err = -EINVAL;
11557 goto reply;
11558 }
11559 pg_t pgid;
11560 if (!pgid.parse(pgidstr.c_str())) {
11561 ss << "invalid pgid '" << pgidstr << "'";
11562 err = -EINVAL;
11563 goto reply;
11564 }
11565 if (!osdmap.pg_exists(pgid)) {
11566 ss << "pg " << pgid << " does not exist";
11567 err = -ENOENT;
11568 goto reply;
11569 }
11570 if (pending_inc.new_pg_temp.count(pgid)) {
11571 dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
11572 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11573 return true;
11574 }
11575
11576 vector<int64_t> id_vec;
11577 vector<int32_t> new_pg_temp;
9f95a23c 11578 cmd_getval(cmdmap, "id", id_vec);
11fdf7f2
TL
11579 if (id_vec.empty()) {
11580 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>();
11581 ss << "done cleaning up pg_temp of " << pgid;
11582 goto update;
7c673cae
FG
11583 }
11584 for (auto osd : id_vec) {
11585 if (!osdmap.exists(osd)) {
11586 ss << "osd." << osd << " does not exist";
11587 err = -ENOENT;
11588 goto reply;
11589 }
11590 new_pg_temp.push_back(osd);
11591 }
11592
224ce89b
WB
11593 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
11594 if ((int)new_pg_temp.size() < pool_min_size) {
11595 ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
11596 << pool_min_size << ")";
11597 err = -EINVAL;
11598 goto reply;
11599 }
11600
11601 int pool_size = osdmap.get_pg_pool_size(pgid);
11602 if ((int)new_pg_temp.size() > pool_size) {
11603 ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
11604 << pool_size << ")";
11605 err = -EINVAL;
11606 goto reply;
11607 }
11608
7c673cae
FG
11609 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
11610 new_pg_temp.begin(), new_pg_temp.end());
11611 ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
11612 goto update;
11613 } else if (prefix == "osd primary-temp") {
11614 string pgidstr;
9f95a23c 11615 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
7c673cae 11616 ss << "unable to parse 'pgid' value '"
11fdf7f2 11617 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
7c673cae
FG
11618 err = -EINVAL;
11619 goto reply;
11620 }
11621 pg_t pgid;
11622 if (!pgid.parse(pgidstr.c_str())) {
11623 ss << "invalid pgid '" << pgidstr << "'";
11624 err = -EINVAL;
11625 goto reply;
11626 }
11627 if (!osdmap.pg_exists(pgid)) {
11628 ss << "pg " << pgid << " does not exist";
11629 err = -ENOENT;
11630 goto reply;
11631 }
11632
11633 int64_t osd;
9f95a23c 11634 if (!cmd_getval(cmdmap, "id", osd)) {
7c673cae 11635 ss << "unable to parse 'id' value '"
11fdf7f2 11636 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
7c673cae
FG
11637 err = -EINVAL;
11638 goto reply;
11639 }
11640 if (osd != -1 && !osdmap.exists(osd)) {
11641 ss << "osd." << osd << " does not exist";
11642 err = -ENOENT;
11643 goto reply;
11644 }
11645
9f95a23c
TL
11646 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
11647 osdmap.require_min_compat_client < ceph_release_t::firefly) {
31f18b77 11648 ss << "require_min_compat_client "
9f95a23c 11649 << osdmap.require_min_compat_client
7c673cae
FG
11650 << " < firefly, which is required for primary-temp";
11651 err = -EPERM;
11652 goto reply;
7c673cae
FG
11653 }
11654
11655 pending_inc.new_primary_temp[pgid] = osd;
11656 ss << "set " << pgid << " primary_temp mapping to " << osd;
11657 goto update;
11fdf7f2
TL
11658 } else if (prefix == "pg repeer") {
11659 pg_t pgid;
11660 string pgidstr;
9f95a23c 11661 cmd_getval(cmdmap, "pgid", pgidstr);
11fdf7f2
TL
11662 if (!pgid.parse(pgidstr.c_str())) {
11663 ss << "invalid pgid '" << pgidstr << "'";
11664 err = -EINVAL;
11665 goto reply;
11666 }
11667 if (!osdmap.pg_exists(pgid)) {
11668 ss << "pg '" << pgidstr << "' does not exist";
11669 err = -ENOENT;
11670 goto reply;
11671 }
11672 vector<int> acting;
11673 int primary;
11674 osdmap.pg_to_acting_osds(pgid, &acting, &primary);
11675 if (primary < 0) {
11676 err = -EAGAIN;
11677 ss << "pg currently has no primary";
11678 goto reply;
11679 }
11680 if (acting.size() > 1) {
11681 // map to just primary; it will map back to what it wants
11682 pending_inc.new_pg_temp[pgid] = { primary };
11683 } else {
11684 // hmm, pick another arbitrary osd to induce a change. Note
11685 // that this won't work if there is only one suitable OSD in the cluster.
11686 int i;
11687 bool done = false;
11688 for (i = 0; i < osdmap.get_max_osd(); ++i) {
11689 if (i == primary || !osdmap.is_up(i) || !osdmap.exists(i)) {
11690 continue;
11691 }
11692 pending_inc.new_pg_temp[pgid] = { primary, i };
11693 done = true;
11694 break;
11695 }
11696 if (!done) {
11697 err = -EAGAIN;
11698 ss << "not enough up OSDs in the cluster to force repeer";
11699 goto reply;
11700 }
11701 }
11702 goto update;
224ce89b
WB
11703 } else if (prefix == "osd pg-upmap" ||
11704 prefix == "osd rm-pg-upmap" ||
11705 prefix == "osd pg-upmap-items" ||
11706 prefix == "osd rm-pg-upmap-items") {
9f95a23c 11707 if (osdmap.require_min_compat_client < ceph_release_t::luminous) {
31f18b77 11708 ss << "min_compat_client "
9f95a23c 11709 << osdmap.require_min_compat_client
224ce89b
WB
11710 << " < luminous, which is required for pg-upmap. "
11711 << "Try 'ceph osd set-require-min-compat-client luminous' "
11712 << "before using the new interface";
7c673cae
FG
11713 err = -EPERM;
11714 goto reply;
11715 }
11716 err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
11717 if (err == -EAGAIN)
11718 goto wait;
11719 if (err < 0)
11720 goto reply;
11721 string pgidstr;
9f95a23c 11722 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
7c673cae 11723 ss << "unable to parse 'pgid' value '"
11fdf7f2 11724 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
7c673cae
FG
11725 err = -EINVAL;
11726 goto reply;
11727 }
11728 pg_t pgid;
11729 if (!pgid.parse(pgidstr.c_str())) {
11730 ss << "invalid pgid '" << pgidstr << "'";
11731 err = -EINVAL;
11732 goto reply;
11733 }
11734 if (!osdmap.pg_exists(pgid)) {
11735 ss << "pg " << pgid << " does not exist";
11736 err = -ENOENT;
11737 goto reply;
11738 }
94b18763
FG
11739 if (pending_inc.old_pools.count(pgid.pool())) {
11740 ss << "pool of " << pgid << " is pending removal";
11741 err = -ENOENT;
11742 getline(ss, rs);
11743 wait_for_finished_proposal(op,
11744 new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
11745 return true;
11746 }
224ce89b
WB
11747
11748 enum {
11749 OP_PG_UPMAP,
11750 OP_RM_PG_UPMAP,
11751 OP_PG_UPMAP_ITEMS,
11752 OP_RM_PG_UPMAP_ITEMS,
11753 } option;
11754
11755 if (prefix == "osd pg-upmap") {
11756 option = OP_PG_UPMAP;
11757 } else if (prefix == "osd rm-pg-upmap") {
11758 option = OP_RM_PG_UPMAP;
11759 } else if (prefix == "osd pg-upmap-items") {
11760 option = OP_PG_UPMAP_ITEMS;
11761 } else {
11762 option = OP_RM_PG_UPMAP_ITEMS;
7c673cae 11763 }
224ce89b
WB
11764
11765 // check pending upmap changes
11766 switch (option) {
11767 case OP_PG_UPMAP: // fall through
11768 case OP_RM_PG_UPMAP:
11769 if (pending_inc.new_pg_upmap.count(pgid) ||
11770 pending_inc.old_pg_upmap.count(pgid)) {
11771 dout(10) << __func__ << " waiting for pending update on "
11772 << pgid << dendl;
11773 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11774 return true;
7c673cae 11775 }
224ce89b 11776 break;
7c673cae 11777
224ce89b
WB
11778 case OP_PG_UPMAP_ITEMS: // fall through
11779 case OP_RM_PG_UPMAP_ITEMS:
11780 if (pending_inc.new_pg_upmap_items.count(pgid) ||
11781 pending_inc.old_pg_upmap_items.count(pgid)) {
11782 dout(10) << __func__ << " waiting for pending update on "
11783 << pgid << dendl;
11784 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11785 return true;
11786 }
11787 break;
7c673cae 11788
224ce89b 11789 default:
11fdf7f2 11790 ceph_abort_msg("invalid option");
7c673cae 11791 }
224ce89b
WB
11792
11793 switch (option) {
11794 case OP_PG_UPMAP:
11795 {
11796 vector<int64_t> id_vec;
9f95a23c 11797 if (!cmd_getval(cmdmap, "id", id_vec)) {
224ce89b 11798 ss << "unable to parse 'id' value(s) '"
11fdf7f2 11799 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
224ce89b
WB
11800 err = -EINVAL;
11801 goto reply;
11802 }
11803
11804 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
11805 if ((int)id_vec.size() < pool_min_size) {
11806 ss << "num of osds (" << id_vec.size() <<") < pool min size ("
11807 << pool_min_size << ")";
11808 err = -EINVAL;
11809 goto reply;
11810 }
11811
11812 int pool_size = osdmap.get_pg_pool_size(pgid);
11813 if ((int)id_vec.size() > pool_size) {
11814 ss << "num of osds (" << id_vec.size() <<") > pool size ("
11815 << pool_size << ")";
11816 err = -EINVAL;
11817 goto reply;
11818 }
11819
11820 vector<int32_t> new_pg_upmap;
11821 for (auto osd : id_vec) {
11822 if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
11823 ss << "osd." << osd << " does not exist";
11824 err = -ENOENT;
11825 goto reply;
11826 }
11827 auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
11828 if (it != new_pg_upmap.end()) {
11829 ss << "osd." << osd << " already exists, ";
11830 continue;
11831 }
11832 new_pg_upmap.push_back(osd);
11833 }
11834
11835 if (new_pg_upmap.empty()) {
11836 ss << "no valid upmap items(pairs) is specified";
11837 err = -EINVAL;
11838 goto reply;
11839 }
11840
11841 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
11842 new_pg_upmap.begin(), new_pg_upmap.end());
11843 ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
7c673cae 11844 }
224ce89b
WB
11845 break;
11846
11847 case OP_RM_PG_UPMAP:
11848 {
11849 pending_inc.old_pg_upmap.insert(pgid);
11850 ss << "clear " << pgid << " pg_upmap mapping";
7c673cae 11851 }
224ce89b 11852 break;
7c673cae 11853
224ce89b
WB
11854 case OP_PG_UPMAP_ITEMS:
11855 {
11856 vector<int64_t> id_vec;
9f95a23c 11857 if (!cmd_getval(cmdmap, "id", id_vec)) {
224ce89b 11858 ss << "unable to parse 'id' value(s) '"
11fdf7f2 11859 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
224ce89b
WB
11860 err = -EINVAL;
11861 goto reply;
11862 }
11863
11864 if (id_vec.size() % 2) {
11865 ss << "you must specify pairs of osd ids to be remapped";
11866 err = -EINVAL;
11867 goto reply;
11868 }
11869
11870 int pool_size = osdmap.get_pg_pool_size(pgid);
11871 if ((int)(id_vec.size() / 2) > pool_size) {
11872 ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
11873 << pool_size << ")";
11874 err = -EINVAL;
11875 goto reply;
11876 }
11877
11878 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
11879 ostringstream items;
11880 items << "[";
11881 for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
11882 int from = *p++;
11883 int to = *p;
11884 if (from == to) {
11885 ss << "from osd." << from << " == to osd." << to << ", ";
11886 continue;
11887 }
11888 if (!osdmap.exists(from)) {
11889 ss << "osd." << from << " does not exist";
11890 err = -ENOENT;
11891 goto reply;
11892 }
11893 if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
11894 ss << "osd." << to << " does not exist";
11895 err = -ENOENT;
11896 goto reply;
11897 }
c07f9fc5
FG
11898 pair<int32_t,int32_t> entry = make_pair(from, to);
11899 auto it = std::find(new_pg_upmap_items.begin(),
11900 new_pg_upmap_items.end(), entry);
11901 if (it != new_pg_upmap_items.end()) {
11902 ss << "osd." << from << " -> osd." << to << " already exists, ";
11903 continue;
11904 }
11905 new_pg_upmap_items.push_back(entry);
224ce89b
WB
11906 items << from << "->" << to << ",";
11907 }
11908 string out(items.str());
11909 out.resize(out.size() - 1); // drop last ','
11910 out += "]";
11911
11912 if (new_pg_upmap_items.empty()) {
11913 ss << "no valid upmap items(pairs) is specified";
11914 err = -EINVAL;
11915 goto reply;
11916 }
11917
11918 pending_inc.new_pg_upmap_items[pgid] =
11919 mempool::osdmap::vector<pair<int32_t,int32_t>>(
11920 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
11921 ss << "set " << pgid << " pg_upmap_items mapping to " << out;
11922 }
11923 break;
11924
11925 case OP_RM_PG_UPMAP_ITEMS:
11926 {
11927 pending_inc.old_pg_upmap_items.insert(pgid);
11928 ss << "clear " << pgid << " pg_upmap_items mapping";
11929 }
11930 break;
11931
11932 default:
11fdf7f2 11933 ceph_abort_msg("invalid option");
7c673cae
FG
11934 }
11935
7c673cae
FG
11936 goto update;
11937 } else if (prefix == "osd primary-affinity") {
11938 int64_t id;
9f95a23c 11939 if (!cmd_getval(cmdmap, "id", id)) {
7c673cae 11940 ss << "invalid osd id value '"
11fdf7f2 11941 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
7c673cae
FG
11942 err = -EINVAL;
11943 goto reply;
11944 }
11945 double w;
9f95a23c 11946 if (!cmd_getval(cmdmap, "weight", w)) {
7c673cae 11947 ss << "unable to parse 'weight' value '"
11fdf7f2 11948 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
11949 err = -EINVAL;
11950 goto reply;
11951 }
11952 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
11953 if (ww < 0L) {
11954 ss << "weight must be >= 0";
11955 err = -EINVAL;
11956 goto reply;
11957 }
9f95a23c
TL
11958 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
11959 osdmap.require_min_compat_client < ceph_release_t::firefly) {
31f18b77 11960 ss << "require_min_compat_client "
9f95a23c 11961 << osdmap.require_min_compat_client
7c673cae
FG
11962 << " < firefly, which is required for primary-affinity";
11963 err = -EPERM;
11964 goto reply;
7c673cae 11965 }
7c673cae
FG
11966 if (osdmap.exists(id)) {
11967 pending_inc.new_primary_affinity[id] = ww;
11968 ss << "set osd." << id << " primary-affinity to " << w << " (" << ios::hex << ww << ios::dec << ")";
11969 getline(ss, rs);
11970 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11971 get_last_committed() + 1));
11972 return true;
11973 } else {
11974 ss << "osd." << id << " does not exist";
11975 err = -ENOENT;
11976 goto reply;
11977 }
11978 } else if (prefix == "osd reweight") {
11979 int64_t id;
9f95a23c 11980 if (!cmd_getval(cmdmap, "id", id)) {
7c673cae 11981 ss << "unable to parse osd id value '"
11fdf7f2 11982 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
7c673cae
FG
11983 err = -EINVAL;
11984 goto reply;
11985 }
11986 double w;
9f95a23c 11987 if (!cmd_getval(cmdmap, "weight", w)) {
7c673cae 11988 ss << "unable to parse weight value '"
11fdf7f2 11989 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
11990 err = -EINVAL;
11991 goto reply;
11992 }
11993 long ww = (int)((double)CEPH_OSD_IN*w);
11994 if (ww < 0L) {
11995 ss << "weight must be >= 0";
11996 err = -EINVAL;
11997 goto reply;
11998 }
11999 if (osdmap.exists(id)) {
12000 pending_inc.new_weight[id] = ww;
12001 ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
12002 getline(ss, rs);
12003 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12004 get_last_committed() + 1));
12005 return true;
12006 } else {
12007 ss << "osd." << id << " does not exist";
12008 err = -ENOENT;
12009 goto reply;
12010 }
12011 } else if (prefix == "osd reweightn") {
12012 map<int32_t, uint32_t> weights;
11fdf7f2 12013 err = parse_reweights(cct, cmdmap, osdmap, &weights);
7c673cae
FG
12014 if (err) {
12015 ss << "unable to parse 'weights' value '"
11fdf7f2 12016 << cmd_vartype_stringify(cmdmap.at("weights")) << "'";
7c673cae
FG
12017 goto reply;
12018 }
12019 pending_inc.new_weight.insert(weights.begin(), weights.end());
12020 wait_for_finished_proposal(
12021 op,
12022 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
224ce89b 12023 return true;
7c673cae
FG
12024 } else if (prefix == "osd lost") {
12025 int64_t id;
9f95a23c 12026 if (!cmd_getval(cmdmap, "id", id)) {
7c673cae 12027 ss << "unable to parse osd id value '"
11fdf7f2 12028 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
7c673cae
FG
12029 err = -EINVAL;
12030 goto reply;
12031 }
11fdf7f2 12032 bool sure = false;
9f95a23c 12033 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11fdf7f2 12034 if (!sure) {
7c673cae
FG
12035 ss << "are you SURE? this might mean real, permanent data loss. pass "
12036 "--yes-i-really-mean-it if you really do.";
12037 err = -EPERM;
12038 goto reply;
12039 } else if (!osdmap.exists(id)) {
12040 ss << "osd." << id << " does not exist";
12041 err = -ENOENT;
12042 goto reply;
12043 } else if (!osdmap.is_down(id)) {
12044 ss << "osd." << id << " is not down";
12045 err = -EBUSY;
12046 goto reply;
12047 } else {
12048 epoch_t e = osdmap.get_info(id).down_at;
12049 pending_inc.new_lost[id] = e;
12050 ss << "marked osd lost in epoch " << e;
12051 getline(ss, rs);
12052 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12053 get_last_committed() + 1));
12054 return true;
12055 }
12056
11fdf7f2
TL
12057 } else if (prefix == "osd destroy-actual" ||
12058 prefix == "osd purge-actual" ||
12059 prefix == "osd purge-new") {
31f18b77
FG
12060 /* Destroying an OSD means that we don't expect to further make use of
12061 * the OSDs data (which may even become unreadable after this operation),
12062 * and that we are okay with scrubbing all its cephx keys and config-key
12063 * data (which may include lockbox keys, thus rendering the osd's data
12064 * unreadable).
12065 *
12066 * The OSD will not be removed. Instead, we will mark it as destroyed,
12067 * such that a subsequent call to `create` will not reuse the osd id.
12068 * This will play into being able to recreate the OSD, at the same
12069 * crush location, with minimal data movement.
12070 */
12071
12072 // make sure authmon is writeable.
12073 if (!mon->authmon()->is_writeable()) {
12074 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12075 << "osd destroy" << dendl;
12076 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12077 return false;
12078 }
12079
12080 int64_t id;
9f95a23c 12081 if (!cmd_getval(cmdmap, "id", id)) {
11fdf7f2
TL
12082 auto p = cmdmap.find("id");
12083 if (p == cmdmap.end()) {
12084 ss << "no osd id specified";
12085 } else {
12086 ss << "unable to parse osd id value '"
12087 << cmd_vartype_stringify(cmdmap.at("id")) << "";
12088 }
31f18b77
FG
12089 err = -EINVAL;
12090 goto reply;
12091 }
12092
11fdf7f2 12093 bool is_destroy = (prefix == "osd destroy-actual");
31f18b77 12094 if (!is_destroy) {
11fdf7f2
TL
12095 ceph_assert("osd purge-actual" == prefix ||
12096 "osd purge-new" == prefix);
31f18b77
FG
12097 }
12098
11fdf7f2 12099 bool sure = false;
9f95a23c 12100 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11fdf7f2
TL
12101 if (!sure) {
12102 ss << "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
12103 << "This will mean real, permanent data loss, as well "
12104 << "as deletion of cephx and lockbox keys. "
12105 << "Pass --yes-i-really-mean-it if you really do.";
31f18b77
FG
12106 err = -EPERM;
12107 goto reply;
d2e6a577 12108 } else if (!osdmap.exists(id)) {
31f18b77 12109 ss << "osd." << id << " does not exist";
d2e6a577 12110 err = 0; // idempotent
31f18b77
FG
12111 goto reply;
12112 } else if (osdmap.is_up(id)) {
12113 ss << "osd." << id << " is not `down`.";
12114 err = -EBUSY;
12115 goto reply;
12116 } else if (is_destroy && osdmap.is_destroyed(id)) {
12117 ss << "destroyed osd." << id;
12118 err = 0;
12119 goto reply;
12120 }
12121
11fdf7f2
TL
12122 if (prefix == "osd purge-new" &&
12123 (osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
12124 ss << "osd." << id << " is not new";
12125 err = -EPERM;
12126 goto reply;
12127 }
12128
31f18b77
FG
12129 bool goto_reply = false;
12130
12131 paxos->plug();
12132 if (is_destroy) {
12133 err = prepare_command_osd_destroy(id, ss);
12134 // we checked above that it should exist.
11fdf7f2 12135 ceph_assert(err != -ENOENT);
31f18b77
FG
12136 } else {
12137 err = prepare_command_osd_purge(id, ss);
12138 if (err == -ENOENT) {
12139 err = 0;
12140 ss << "osd." << id << " does not exist.";
12141 goto_reply = true;
12142 }
12143 }
12144 paxos->unplug();
12145
12146 if (err < 0 || goto_reply) {
12147 goto reply;
12148 }
12149
12150 if (is_destroy) {
12151 ss << "destroyed osd." << id;
12152 } else {
12153 ss << "purged osd." << id;
12154 }
12155
12156 getline(ss, rs);
12157 wait_for_finished_proposal(op,
12158 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
12159 force_immediate_propose();
12160 return true;
12161
12162 } else if (prefix == "osd new") {
12163
12164 // make sure authmon is writeable.
12165 if (!mon->authmon()->is_writeable()) {
12166 dout(10) << __func__ << " waiting for auth mon to be writeable for "
224ce89b 12167 << "osd new" << dendl;
31f18b77
FG
12168 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12169 return false;
12170 }
12171
3a9019d9 12172 map<string,string> param_map;
31f18b77
FG
12173
12174 bufferlist bl = m->get_data();
3a9019d9
FG
12175 string param_json = bl.to_str();
12176 dout(20) << __func__ << " osd new json = " << param_json << dendl;
31f18b77 12177
3a9019d9 12178 err = get_json_str_map(param_json, ss, &param_map);
31f18b77
FG
12179 if (err < 0)
12180 goto reply;
12181
3a9019d9 12182 dout(20) << __func__ << " osd new params " << param_map << dendl;
31f18b77
FG
12183
12184 paxos->plug();
3a9019d9 12185 err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
31f18b77
FG
12186 paxos->unplug();
12187
12188 if (err < 0) {
12189 goto reply;
12190 }
12191
12192 if (f) {
12193 f->flush(rdata);
12194 } else {
12195 rdata.append(ss);
12196 }
12197
12198 if (err == EEXIST) {
12199 // idempotent operation
12200 err = 0;
12201 goto reply;
12202 }
12203
12204 wait_for_finished_proposal(op,
12205 new Monitor::C_Command(mon, op, 0, rs, rdata,
12206 get_last_committed() + 1));
12207 force_immediate_propose();
12208 return true;
12209
7c673cae 12210 } else if (prefix == "osd create") {
7c673cae
FG
12211
12212 // optional id provided?
31f18b77 12213 int64_t id = -1, cmd_id = -1;
9f95a23c 12214 if (cmd_getval(cmdmap, "id", cmd_id)) {
31f18b77
FG
12215 if (cmd_id < 0) {
12216 ss << "invalid osd id value '" << cmd_id << "'";
7c673cae
FG
12217 err = -EINVAL;
12218 goto reply;
12219 }
31f18b77 12220 dout(10) << " osd create got id " << cmd_id << dendl;
7c673cae
FG
12221 }
12222
7c673cae
FG
12223 uuid_d uuid;
12224 string uuidstr;
9f95a23c 12225 if (cmd_getval(cmdmap, "uuid", uuidstr)) {
7c673cae 12226 if (!uuid.parse(uuidstr.c_str())) {
31f18b77
FG
12227 ss << "invalid uuid value '" << uuidstr << "'";
12228 err = -EINVAL;
12229 goto reply;
7c673cae 12230 }
31f18b77
FG
12231 // we only care about the id if we also have the uuid, to
12232 // ensure the operation's idempotency.
12233 id = cmd_id;
7c673cae
FG
12234 }
12235
31f18b77
FG
12236 int32_t new_id = -1;
12237 err = prepare_command_osd_create(id, uuid, &new_id, ss);
12238 if (err < 0) {
12239 if (err == -EAGAIN) {
12240 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12241 return true;
12242 }
12243 // a check has failed; reply to the user.
12244 goto reply;
12245
12246 } else if (err == EEXIST) {
12247 // this is an idempotent operation; we can go ahead and reply.
12248 if (f) {
12249 f->open_object_section("created_osd");
12250 f->dump_int("osdid", new_id);
12251 f->close_section();
12252 f->flush(rdata);
12253 } else {
12254 ss << new_id;
12255 rdata.append(ss);
7c673cae 12256 }
31f18b77
FG
12257 err = 0;
12258 goto reply;
7c673cae
FG
12259 }
12260
3a9019d9
FG
12261 string empty_device_class;
12262 do_osd_create(id, uuid, empty_device_class, &new_id);
31f18b77 12263
7c673cae
FG
12264 if (f) {
12265 f->open_object_section("created_osd");
31f18b77 12266 f->dump_int("osdid", new_id);
7c673cae
FG
12267 f->close_section();
12268 f->flush(rdata);
12269 } else {
31f18b77 12270 ss << new_id;
7c673cae
FG
12271 rdata.append(ss);
12272 }
31f18b77
FG
12273 wait_for_finished_proposal(op,
12274 new Monitor::C_Command(mon, op, 0, rs, rdata,
12275 get_last_committed() + 1));
7c673cae
FG
12276 return true;
12277
12278 } else if (prefix == "osd blacklist clear") {
12279 pending_inc.new_blacklist.clear();
12280 std::list<std::pair<entity_addr_t,utime_t > > blacklist;
12281 osdmap.get_blacklist(&blacklist);
12282 for (const auto &entry : blacklist) {
12283 pending_inc.old_blacklist.push_back(entry.first);
12284 }
12285 ss << " removed all blacklist entries";
12286 getline(ss, rs);
12287 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12288 get_last_committed() + 1));
12289 return true;
12290 } else if (prefix == "osd blacklist") {
12291 string addrstr;
9f95a23c 12292 cmd_getval(cmdmap, "addr", addrstr);
7c673cae
FG
12293 entity_addr_t addr;
12294 if (!addr.parse(addrstr.c_str(), 0)) {
12295 ss << "unable to parse address " << addrstr;
12296 err = -EINVAL;
12297 goto reply;
12298 }
12299 else {
9f95a23c 12300 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
11fdf7f2
TL
12301 // always blacklist type ANY
12302 addr.set_type(entity_addr_t::TYPE_ANY);
12303 } else {
12304 addr.set_type(entity_addr_t::TYPE_LEGACY);
12305 }
12306
7c673cae 12307 string blacklistop;
9f95a23c 12308 cmd_getval(cmdmap, "blacklistop", blacklistop);
7c673cae
FG
12309 if (blacklistop == "add") {
12310 utime_t expires = ceph_clock_now();
12311 double d;
12312 // default one hour
9f95a23c 12313 cmd_getval(cmdmap, "expire", d,
11fdf7f2 12314 g_conf()->mon_osd_blacklist_default_expire);
7c673cae
FG
12315 expires += d;
12316
12317 pending_inc.new_blacklist[addr] = expires;
224ce89b
WB
12318
12319 {
12320 // cancel any pending un-blacklisting request too
12321 auto it = std::find(pending_inc.old_blacklist.begin(),
12322 pending_inc.old_blacklist.end(), addr);
12323 if (it != pending_inc.old_blacklist.end()) {
12324 pending_inc.old_blacklist.erase(it);
12325 }
12326 }
12327
7c673cae
FG
12328 ss << "blacklisting " << addr << " until " << expires << " (" << d << " sec)";
12329 getline(ss, rs);
12330 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12331 get_last_committed() + 1));
12332 return true;
12333 } else if (blacklistop == "rm") {
12334 if (osdmap.is_blacklisted(addr) ||
12335 pending_inc.new_blacklist.count(addr)) {
12336 if (osdmap.is_blacklisted(addr))
12337 pending_inc.old_blacklist.push_back(addr);
12338 else
12339 pending_inc.new_blacklist.erase(addr);
12340 ss << "un-blacklisting " << addr;
12341 getline(ss, rs);
12342 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12343 get_last_committed() + 1));
12344 return true;
12345 }
12346 ss << addr << " isn't blacklisted";
12347 err = 0;
12348 goto reply;
12349 }
12350 }
12351 } else if (prefix == "osd pool mksnap") {
12352 string poolstr;
9f95a23c 12353 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
12354 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12355 if (pool < 0) {
12356 ss << "unrecognized pool '" << poolstr << "'";
12357 err = -ENOENT;
12358 goto reply;
12359 }
12360 string snapname;
9f95a23c 12361 cmd_getval(cmdmap, "snap", snapname);
7c673cae
FG
12362 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12363 if (p->is_unmanaged_snaps_mode()) {
12364 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12365 err = -EINVAL;
12366 goto reply;
12367 } else if (p->snap_exists(snapname.c_str())) {
12368 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12369 err = 0;
12370 goto reply;
12371 } else if (p->is_tier()) {
12372 ss << "pool " << poolstr << " is a cache tier";
12373 err = -EINVAL;
12374 goto reply;
12375 }
12376 pg_pool_t *pp = 0;
12377 if (pending_inc.new_pools.count(pool))
12378 pp = &pending_inc.new_pools[pool];
12379 if (!pp) {
12380 pp = &pending_inc.new_pools[pool];
12381 *pp = *p;
12382 }
12383 if (pp->snap_exists(snapname.c_str())) {
12384 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12385 } else {
12386 pp->add_snap(snapname.c_str(), ceph_clock_now());
12387 pp->set_snap_epoch(pending_inc.epoch);
12388 ss << "created pool " << poolstr << " snap " << snapname;
12389 }
12390 getline(ss, rs);
12391 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12392 get_last_committed() + 1));
12393 return true;
12394 } else if (prefix == "osd pool rmsnap") {
12395 string poolstr;
9f95a23c 12396 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
12397 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12398 if (pool < 0) {
12399 ss << "unrecognized pool '" << poolstr << "'";
12400 err = -ENOENT;
12401 goto reply;
12402 }
12403 string snapname;
9f95a23c 12404 cmd_getval(cmdmap, "snap", snapname);
7c673cae
FG
12405 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12406 if (p->is_unmanaged_snaps_mode()) {
12407 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12408 err = -EINVAL;
12409 goto reply;
12410 } else if (!p->snap_exists(snapname.c_str())) {
12411 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
12412 err = 0;
12413 goto reply;
12414 }
12415 pg_pool_t *pp = 0;
12416 if (pending_inc.new_pools.count(pool))
12417 pp = &pending_inc.new_pools[pool];
12418 if (!pp) {
12419 pp = &pending_inc.new_pools[pool];
12420 *pp = *p;
12421 }
12422 snapid_t sn = pp->snap_exists(snapname.c_str());
12423 if (sn) {
12424 pp->remove_snap(sn);
12425 pp->set_snap_epoch(pending_inc.epoch);
12426 ss << "removed pool " << poolstr << " snap " << snapname;
12427 } else {
12428 ss << "already removed pool " << poolstr << " snap " << snapname;
12429 }
12430 getline(ss, rs);
12431 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12432 get_last_committed() + 1));
12433 return true;
12434 } else if (prefix == "osd pool create") {
11fdf7f2 12435 int64_t pg_num, pg_num_min;
7c673cae 12436 int64_t pgp_num;
9f95a23c
TL
12437 cmd_getval(cmdmap, "pg_num", pg_num, int64_t(0));
12438 cmd_getval(cmdmap, "pgp_num", pgp_num, pg_num);
12439 cmd_getval(cmdmap, "pg_num_min", pg_num_min, int64_t(0));
7c673cae
FG
12440
12441 string pool_type_str;
9f95a23c 12442 cmd_getval(cmdmap, "pool_type", pool_type_str);
7c673cae 12443 if (pool_type_str.empty())
11fdf7f2 12444 pool_type_str = g_conf().get_val<string>("osd_pool_default_type");
7c673cae
FG
12445
12446 string poolstr;
9f95a23c 12447 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
12448 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12449 if (pool_id >= 0) {
12450 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12451 if (pool_type_str != p->get_type_name()) {
12452 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
12453 err = -EINVAL;
12454 } else {
12455 ss << "pool '" << poolstr << "' already exists";
12456 err = 0;
12457 }
12458 goto reply;
12459 }
12460
12461 int pool_type;
12462 if (pool_type_str == "replicated") {
12463 pool_type = pg_pool_t::TYPE_REPLICATED;
12464 } else if (pool_type_str == "erasure") {
7c673cae
FG
12465 pool_type = pg_pool_t::TYPE_ERASURE;
12466 } else {
12467 ss << "unknown pool type '" << pool_type_str << "'";
12468 err = -EINVAL;
12469 goto reply;
12470 }
12471
31f18b77 12472 bool implicit_rule_creation = false;
94b18763 12473 int64_t expected_num_objects = 0;
31f18b77 12474 string rule_name;
9f95a23c 12475 cmd_getval(cmdmap, "rule", rule_name);
7c673cae 12476 string erasure_code_profile;
9f95a23c 12477 cmd_getval(cmdmap, "erasure_code_profile", erasure_code_profile);
7c673cae
FG
12478
12479 if (pool_type == pg_pool_t::TYPE_ERASURE) {
12480 if (erasure_code_profile == "")
12481 erasure_code_profile = "default";
12482 //handle the erasure code profile
12483 if (erasure_code_profile == "default") {
12484 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
12485 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
12486 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
12487 goto wait;
12488 }
12489
12490 map<string,string> profile_map;
11fdf7f2 12491 err = osdmap.get_erasure_code_profile_default(cct,
7c673cae
FG
12492 profile_map,
12493 &ss);
12494 if (err)
12495 goto reply;
12496 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
12497 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
12498 goto wait;
12499 }
12500 }
31f18b77
FG
12501 if (rule_name == "") {
12502 implicit_rule_creation = true;
7c673cae 12503 if (erasure_code_profile == "default") {
31f18b77 12504 rule_name = "erasure-code";
7c673cae 12505 } else {
31f18b77 12506 dout(1) << "implicitly use rule named after the pool: "
7c673cae 12507 << poolstr << dendl;
31f18b77 12508 rule_name = poolstr;
7c673cae
FG
12509 }
12510 }
9f95a23c 12511 cmd_getval(cmdmap, "expected_num_objects",
94b18763 12512 expected_num_objects, int64_t(0));
7c673cae 12513 } else {
31f18b77 12514 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
94b18763
FG
12515 // and put expected_num_objects to rule field
12516 if (erasure_code_profile != "") { // cmd is from CLI
12517 if (rule_name != "") {
12518 string interr;
12519 expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
12520 if (interr.length()) {
12521 ss << "error parsing integer value '" << rule_name << "': " << interr;
12522 err = -EINVAL;
12523 goto reply;
12524 }
12525 }
12526 rule_name = erasure_code_profile;
12527 } else { // cmd is well-formed
9f95a23c 12528 cmd_getval(cmdmap, "expected_num_objects",
94b18763
FG
12529 expected_num_objects, int64_t(0));
12530 }
7c673cae
FG
12531 }
12532
31f18b77
FG
12533 if (!implicit_rule_creation && rule_name != "") {
12534 int rule;
12535 err = get_crush_rule(rule_name, &rule, &ss);
7c673cae
FG
12536 if (err == -EAGAIN) {
12537 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12538 return true;
12539 }
12540 if (err)
12541 goto reply;
12542 }
12543
7c673cae
FG
12544 if (expected_num_objects < 0) {
12545 ss << "'expected_num_objects' must be non-negative";
12546 err = -EINVAL;
12547 goto reply;
12548 }
12549
91327a77
AA
12550 if (expected_num_objects > 0 &&
12551 cct->_conf->osd_objectstore == "filestore" &&
12552 cct->_conf->filestore_merge_threshold > 0) {
12553 ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
12554 err = -EINVAL;
12555 goto reply;
12556 }
12557
12558 if (expected_num_objects == 0 &&
12559 cct->_conf->osd_objectstore == "filestore" &&
12560 cct->_conf->filestore_merge_threshold < 0) {
12561 int osds = osdmap.get_num_osds();
12562 if (osds && (pg_num >= 1024 || pg_num / osds >= 100)) {
12563 ss << "For better initial performance on pools expected to store a "
12564 << "large number of objects, consider supplying the "
12565 << "expected_num_objects parameter when creating the pool.\n";
12566 }
12567 }
12568
7c673cae 12569 int64_t fast_read_param;
9f95a23c 12570 cmd_getval(cmdmap, "fast_read", fast_read_param, int64_t(-1));
7c673cae
FG
12571 FastReadType fast_read = FAST_READ_DEFAULT;
12572 if (fast_read_param == 0)
12573 fast_read = FAST_READ_OFF;
12574 else if (fast_read_param > 0)
12575 fast_read = FAST_READ_ON;
11fdf7f2
TL
12576
12577 int64_t repl_size = 0;
9f95a23c 12578 cmd_getval(cmdmap, "size", repl_size);
11fdf7f2
TL
12579 int64_t target_size_bytes = 0;
12580 double target_size_ratio = 0.0;
9f95a23c
TL
12581 cmd_getval(cmdmap, "target_size_bytes", target_size_bytes);
12582 cmd_getval(cmdmap, "target_size_ratio", target_size_ratio);
12583
12584 string pg_autoscale_mode;
12585 cmd_getval(cmdmap, "autoscale_mode", pg_autoscale_mode);
11fdf7f2
TL
12586
12587 err = prepare_new_pool(poolstr,
7c673cae 12588 -1, // default crush rule
31f18b77 12589 rule_name,
11fdf7f2
TL
12590 pg_num, pgp_num, pg_num_min,
12591 repl_size, target_size_bytes, target_size_ratio,
7c673cae
FG
12592 erasure_code_profile, pool_type,
12593 (uint64_t)expected_num_objects,
12594 fast_read,
9f95a23c 12595 pg_autoscale_mode,
7c673cae
FG
12596 &ss);
12597 if (err < 0) {
12598 switch(err) {
12599 case -EEXIST:
12600 ss << "pool '" << poolstr << "' already exists";
12601 break;
12602 case -EAGAIN:
12603 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12604 return true;
12605 case -ERANGE:
12606 goto reply;
12607 default:
12608 goto reply;
12609 break;
12610 }
12611 } else {
12612 ss << "pool '" << poolstr << "' created";
12613 }
12614 getline(ss, rs);
12615 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12616 get_last_committed() + 1));
12617 return true;
12618
12619 } else if (prefix == "osd pool delete" ||
12620 prefix == "osd pool rm") {
12621 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
12622 string poolstr, poolstr2, sure;
9f95a23c
TL
12623 cmd_getval(cmdmap, "pool", poolstr);
12624 cmd_getval(cmdmap, "pool2", poolstr2);
7c673cae
FG
12625 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12626 if (pool < 0) {
12627 ss << "pool '" << poolstr << "' does not exist";
12628 err = 0;
12629 goto reply;
12630 }
12631
11fdf7f2 12632 bool force_no_fake = false;
9f95a23c 12633 cmd_getval(cmdmap, "yes_i_really_really_mean_it", force_no_fake);
11fdf7f2 12634 bool force = false;
9f95a23c 12635 cmd_getval(cmdmap, "yes_i_really_really_mean_it_not_faking", force);
7c673cae 12636 if (poolstr2 != poolstr ||
11fdf7f2 12637 (!force && !force_no_fake)) {
7c673cae
FG
12638 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
12639 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
12640 << "followed by --yes-i-really-really-mean-it.";
12641 err = -EPERM;
12642 goto reply;
12643 }
12644 err = _prepare_remove_pool(pool, &ss, force_no_fake);
12645 if (err == -EAGAIN) {
12646 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12647 return true;
12648 }
12649 if (err < 0)
12650 goto reply;
12651 goto update;
12652 } else if (prefix == "osd pool rename") {
12653 string srcpoolstr, destpoolstr;
9f95a23c
TL
12654 cmd_getval(cmdmap, "srcpool", srcpoolstr);
12655 cmd_getval(cmdmap, "destpool", destpoolstr);
7c673cae
FG
12656 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
12657 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
12658
12659 if (pool_src < 0) {
12660 if (pool_dst >= 0) {
12661 // src pool doesn't exist, dst pool does exist: to ensure idempotency
12662 // of operations, assume this rename succeeded, as it is not changing
12663 // the current state. Make sure we output something understandable
12664 // for whoever is issuing the command, if they are paying attention,
12665 // in case it was not intentional; or to avoid a "wtf?" and a bug
12666 // report in case it was intentional, while expecting a failure.
12667 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
12668 << destpoolstr << "' does -- assuming successful rename";
12669 err = 0;
12670 } else {
12671 ss << "unrecognized pool '" << srcpoolstr << "'";
12672 err = -ENOENT;
12673 }
12674 goto reply;
12675 } else if (pool_dst >= 0) {
12676 // source pool exists and so does the destination pool
12677 ss << "pool '" << destpoolstr << "' already exists";
12678 err = -EEXIST;
12679 goto reply;
12680 }
12681
12682 int ret = _prepare_rename_pool(pool_src, destpoolstr);
12683 if (ret == 0) {
12684 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
12685 } else {
12686 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
12687 << cpp_strerror(ret);
12688 }
12689 getline(ss, rs);
12690 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
12691 get_last_committed() + 1));
12692 return true;
12693
12694 } else if (prefix == "osd pool set") {
12695 err = prepare_command_pool_set(cmdmap, ss);
12696 if (err == -EAGAIN)
12697 goto wait;
12698 if (err < 0)
12699 goto reply;
12700
12701 getline(ss, rs);
12702 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12703 get_last_committed() + 1));
12704 return true;
12705 } else if (prefix == "osd tier add") {
12706 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12707 if (err == -EAGAIN)
12708 goto wait;
12709 if (err)
12710 goto reply;
12711 string poolstr;
9f95a23c 12712 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
12713 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12714 if (pool_id < 0) {
12715 ss << "unrecognized pool '" << poolstr << "'";
12716 err = -ENOENT;
12717 goto reply;
12718 }
12719 string tierpoolstr;
9f95a23c 12720 cmd_getval(cmdmap, "tierpool", tierpoolstr);
7c673cae
FG
12721 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
12722 if (tierpool_id < 0) {
12723 ss << "unrecognized pool '" << tierpoolstr << "'";
12724 err = -ENOENT;
12725 goto reply;
12726 }
12727 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 12728 ceph_assert(p);
7c673cae 12729 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11fdf7f2 12730 ceph_assert(tp);
7c673cae
FG
12731
12732 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
12733 goto reply;
12734 }
12735
12736 // make sure new tier is empty
12737 string force_nonempty;
9f95a23c 12738 cmd_getval(cmdmap, "force_nonempty", force_nonempty);
11fdf7f2 12739 const pool_stat_t *pstats = mon->mgrstatmon()->get_pool_stat(tierpool_id);
31f18b77 12740 if (pstats && pstats->stats.sum.num_objects != 0 &&
7c673cae
FG
12741 force_nonempty != "--force-nonempty") {
12742 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
12743 err = -ENOTEMPTY;
12744 goto reply;
12745 }
11fdf7f2 12746 if (tp->is_erasure()) {
7c673cae
FG
12747 ss << "tier pool '" << tierpoolstr
12748 << "' is an ec pool, which cannot be a tier";
12749 err = -ENOTSUP;
12750 goto reply;
12751 }
12752 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
12753 ((force_nonempty != "--force-nonempty") ||
11fdf7f2 12754 (!g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps))) {
7c673cae
FG
12755 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
12756 err = -ENOTEMPTY;
12757 goto reply;
12758 }
12759 // go
12760 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12761 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
12762 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
12763 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12764 return true;
12765 }
12766 np->tiers.insert(tierpool_id);
12767 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
12768 ntp->tier_of = pool_id;
12769 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
12770 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12771 get_last_committed() + 1));
12772 return true;
12773 } else if (prefix == "osd tier remove" ||
12774 prefix == "osd tier rm") {
12775 string poolstr;
9f95a23c 12776 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
12777 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12778 if (pool_id < 0) {
12779 ss << "unrecognized pool '" << poolstr << "'";
12780 err = -ENOENT;
12781 goto reply;
12782 }
12783 string tierpoolstr;
9f95a23c 12784 cmd_getval(cmdmap, "tierpool", tierpoolstr);
7c673cae
FG
12785 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
12786 if (tierpool_id < 0) {
12787 ss << "unrecognized pool '" << tierpoolstr << "'";
12788 err = -ENOENT;
12789 goto reply;
12790 }
12791 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 12792 ceph_assert(p);
7c673cae 12793 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11fdf7f2 12794 ceph_assert(tp);
7c673cae
FG
12795
12796 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
12797 goto reply;
12798 }
12799
12800 if (p->tiers.count(tierpool_id) == 0) {
12801 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
12802 err = 0;
12803 goto reply;
12804 }
12805 if (tp->tier_of != pool_id) {
12806 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
12807 << osdmap.get_pool_name(tp->tier_of) << "': "
12808 // be scary about it; this is an inconsistency and bells must go off
12809 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
12810 err = -EINVAL;
12811 goto reply;
12812 }
12813 if (p->read_tier == tierpool_id) {
12814 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
12815 err = -EBUSY;
12816 goto reply;
12817 }
12818 // go
12819 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12820 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
12821 if (np->tiers.count(tierpool_id) == 0 ||
12822 ntp->tier_of != pool_id ||
12823 np->read_tier == tierpool_id) {
12824 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12825 return true;
12826 }
12827 np->tiers.erase(tierpool_id);
12828 ntp->clear_tier();
12829 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
12830 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12831 get_last_committed() + 1));
12832 return true;
12833 } else if (prefix == "osd tier set-overlay") {
12834 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12835 if (err == -EAGAIN)
12836 goto wait;
12837 if (err)
12838 goto reply;
12839 string poolstr;
9f95a23c 12840 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
12841 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12842 if (pool_id < 0) {
12843 ss << "unrecognized pool '" << poolstr << "'";
12844 err = -ENOENT;
12845 goto reply;
12846 }
12847 string overlaypoolstr;
9f95a23c 12848 cmd_getval(cmdmap, "overlaypool", overlaypoolstr);
7c673cae
FG
12849 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
12850 if (overlaypool_id < 0) {
12851 ss << "unrecognized pool '" << overlaypoolstr << "'";
12852 err = -ENOENT;
12853 goto reply;
12854 }
12855 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 12856 ceph_assert(p);
7c673cae 12857 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
11fdf7f2 12858 ceph_assert(overlay_p);
7c673cae
FG
12859 if (p->tiers.count(overlaypool_id) == 0) {
12860 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
12861 err = -EINVAL;
12862 goto reply;
12863 }
12864 if (p->read_tier == overlaypool_id) {
12865 err = 0;
12866 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
12867 goto reply;
12868 }
12869 if (p->has_read_tier()) {
12870 ss << "pool '" << poolstr << "' has overlay '"
12871 << osdmap.get_pool_name(p->read_tier)
12872 << "'; please remove-overlay first";
12873 err = -EINVAL;
12874 goto reply;
12875 }
12876
12877 // go
12878 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12879 np->read_tier = overlaypool_id;
12880 np->write_tier = overlaypool_id;
12881 np->set_last_force_op_resend(pending_inc.epoch);
12882 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
12883 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
12884 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
12885 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
12886 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
12887 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12888 get_last_committed() + 1));
12889 return true;
12890 } else if (prefix == "osd tier remove-overlay" ||
12891 prefix == "osd tier rm-overlay") {
12892 string poolstr;
9f95a23c 12893 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
12894 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12895 if (pool_id < 0) {
12896 ss << "unrecognized pool '" << poolstr << "'";
12897 err = -ENOENT;
12898 goto reply;
12899 }
12900 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 12901 ceph_assert(p);
7c673cae
FG
12902 if (!p->has_read_tier()) {
12903 err = 0;
12904 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
12905 goto reply;
12906 }
12907
12908 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
12909 goto reply;
12910 }
12911
12912 // go
12913 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12914 if (np->has_read_tier()) {
12915 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
12916 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
12917 nop->set_last_force_op_resend(pending_inc.epoch);
12918 }
12919 if (np->has_write_tier()) {
12920 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
12921 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
12922 nop->set_last_force_op_resend(pending_inc.epoch);
12923 }
12924 np->clear_read_tier();
12925 np->clear_write_tier();
12926 np->set_last_force_op_resend(pending_inc.epoch);
12927 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
12928 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12929 get_last_committed() + 1));
12930 return true;
12931 } else if (prefix == "osd tier cache-mode") {
12932 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12933 if (err == -EAGAIN)
12934 goto wait;
12935 if (err)
12936 goto reply;
12937 string poolstr;
9f95a23c 12938 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
12939 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12940 if (pool_id < 0) {
12941 ss << "unrecognized pool '" << poolstr << "'";
12942 err = -ENOENT;
12943 goto reply;
12944 }
12945 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 12946 ceph_assert(p);
7c673cae
FG
12947 if (!p->is_tier()) {
12948 ss << "pool '" << poolstr << "' is not a tier";
12949 err = -EINVAL;
12950 goto reply;
12951 }
12952 string modestr;
9f95a23c 12953 cmd_getval(cmdmap, "mode", modestr);
7c673cae 12954 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
9f95a23c 12955 if (int(mode) < 0) {
7c673cae
FG
12956 ss << "'" << modestr << "' is not a valid cache mode";
12957 err = -EINVAL;
12958 goto reply;
12959 }
12960
11fdf7f2 12961 bool sure = false;
9f95a23c 12962 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11fdf7f2 12963
9f95a23c
TL
12964 if (mode == pg_pool_t::CACHEMODE_FORWARD ||
12965 mode == pg_pool_t::CACHEMODE_READFORWARD) {
12966 ss << "'" << modestr << "' is no longer a supported cache mode";
12967 err = -EPERM;
12968 goto reply;
12969 }
7c673cae
FG
12970 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12971 mode != pg_pool_t::CACHEMODE_NONE &&
12972 mode != pg_pool_t::CACHEMODE_PROXY &&
12973 mode != pg_pool_t::CACHEMODE_READPROXY) &&
11fdf7f2 12974 !sure) {
7c673cae
FG
12975 ss << "'" << modestr << "' is not a well-supported cache mode and may "
12976 << "corrupt your data. pass --yes-i-really-mean-it to force.";
12977 err = -EPERM;
12978 goto reply;
12979 }
12980
12981 // pool already has this cache-mode set and there are no pending changes
12982 if (p->cache_mode == mode &&
12983 (pending_inc.new_pools.count(pool_id) == 0 ||
12984 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
12985 ss << "set cache-mode for pool '" << poolstr << "'"
12986 << " to " << pg_pool_t::get_cache_mode_name(mode);
12987 err = 0;
12988 goto reply;
12989 }
12990
12991 /* Mode description:
12992 *
12993 * none: No cache-mode defined
9f95a23c 12994 * forward: Forward all reads and writes to base pool [removed]
7c673cae
FG
12995 * writeback: Cache writes, promote reads from base pool
12996 * readonly: Forward writes to base pool
9f95a23c 12997 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
7c673cae
FG
12998 * proxy: Proxy all reads and writes to base pool
12999 * readproxy: Writes are in writeback mode, Reads are in proxy mode
13000 *
13001 * Hence, these are the allowed transitions:
13002 *
13003 * none -> any
13004 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
9f95a23c 13005 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
7c673cae 13006 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
9f95a23c
TL
13007 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
13008 * writeback -> readproxy || proxy
7c673cae
FG
13009 * readonly -> any
13010 */
13011
13012 // We check if the transition is valid against the current pool mode, as
13013 // it is the only committed state thus far. We will blantly squash
13014 // whatever mode is on the pending state.
13015
13016 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
9f95a23c 13017 (mode != pg_pool_t::CACHEMODE_PROXY &&
7c673cae
FG
13018 mode != pg_pool_t::CACHEMODE_READPROXY)) {
13019 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
13020 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
13021 << "' pool; only '"
7c673cae
FG
13022 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
13023 << "' allowed.";
13024 err = -EINVAL;
13025 goto reply;
13026 }
13027 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
13028 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
7c673cae
FG
13029 mode != pg_pool_t::CACHEMODE_PROXY &&
13030 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13031
13032 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
13033 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
7c673cae
FG
13034 mode != pg_pool_t::CACHEMODE_PROXY)) ||
13035
13036 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
13037 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
7c673cae
FG
13038 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13039
13040 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
13041 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
7c673cae
FG
13042 mode != pg_pool_t::CACHEMODE_PROXY &&
13043 mode != pg_pool_t::CACHEMODE_READPROXY))) {
13044
31f18b77 13045 const pool_stat_t* pstats =
11fdf7f2 13046 mon->mgrstatmon()->get_pool_stat(pool_id);
7c673cae 13047
31f18b77 13048 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
7c673cae
FG
13049 ss << "unable to set cache-mode '"
13050 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
13051 << "': dirty objects found";
13052 err = -EBUSY;
13053 goto reply;
13054 }
13055 }
13056 // go
13057 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13058 np->cache_mode = mode;
13059 // set this both when moving to and from cache_mode NONE. this is to
13060 // capture legacy pools that were set up before this flag existed.
13061 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
13062 ss << "set cache-mode for pool '" << poolstr
13063 << "' to " << pg_pool_t::get_cache_mode_name(mode);
13064 if (mode == pg_pool_t::CACHEMODE_NONE) {
13065 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
11fdf7f2 13066 ceph_assert(base_pool);
7c673cae
FG
13067 if (base_pool->read_tier == pool_id ||
13068 base_pool->write_tier == pool_id)
13069 ss <<" (WARNING: pool is still configured as read or write tier)";
13070 }
13071 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13072 get_last_committed() + 1));
13073 return true;
13074 } else if (prefix == "osd tier add-cache") {
13075 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13076 if (err == -EAGAIN)
13077 goto wait;
13078 if (err)
13079 goto reply;
13080 string poolstr;
9f95a23c 13081 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
13082 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13083 if (pool_id < 0) {
13084 ss << "unrecognized pool '" << poolstr << "'";
13085 err = -ENOENT;
13086 goto reply;
13087 }
13088 string tierpoolstr;
9f95a23c 13089 cmd_getval(cmdmap, "tierpool", tierpoolstr);
7c673cae
FG
13090 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13091 if (tierpool_id < 0) {
13092 ss << "unrecognized pool '" << tierpoolstr << "'";
13093 err = -ENOENT;
13094 goto reply;
13095 }
13096 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 13097 ceph_assert(p);
7c673cae 13098 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11fdf7f2 13099 ceph_assert(tp);
7c673cae
FG
13100
13101 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13102 goto reply;
13103 }
13104
13105 int64_t size = 0;
9f95a23c 13106 if (!cmd_getval(cmdmap, "size", size)) {
7c673cae 13107 ss << "unable to parse 'size' value '"
11fdf7f2 13108 << cmd_vartype_stringify(cmdmap.at("size")) << "'";
7c673cae
FG
13109 err = -EINVAL;
13110 goto reply;
13111 }
13112 // make sure new tier is empty
31f18b77 13113 const pool_stat_t *pstats =
11fdf7f2 13114 mon->mgrstatmon()->get_pool_stat(tierpool_id);
31f18b77 13115 if (pstats && pstats->stats.sum.num_objects != 0) {
7c673cae
FG
13116 ss << "tier pool '" << tierpoolstr << "' is not empty";
13117 err = -ENOTEMPTY;
13118 goto reply;
13119 }
11fdf7f2 13120 auto& modestr = g_conf().get_val<string>("osd_tier_default_cache_mode");
7c673cae 13121 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
9f95a23c 13122 if (int(mode) < 0) {
7c673cae
FG
13123 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
13124 err = -EINVAL;
13125 goto reply;
13126 }
13127 HitSet::Params hsp;
11fdf7f2
TL
13128 auto& cache_hit_set_type =
13129 g_conf().get_val<string>("osd_tier_default_cache_hit_set_type");
13130 if (cache_hit_set_type == "bloom") {
7c673cae 13131 BloomHitSet::Params *bsp = new BloomHitSet::Params;
11fdf7f2 13132 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
7c673cae 13133 hsp = HitSet::Params(bsp);
11fdf7f2 13134 } else if (cache_hit_set_type == "explicit_hash") {
7c673cae 13135 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
11fdf7f2 13136 } else if (cache_hit_set_type == "explicit_object") {
7c673cae
FG
13137 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
13138 } else {
11fdf7f2
TL
13139 ss << "osd tier cache default hit set type '"
13140 << cache_hit_set_type << "' is not a known type";
7c673cae
FG
13141 err = -EINVAL;
13142 goto reply;
13143 }
13144 // go
13145 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13146 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13147 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13148 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13149 return true;
13150 }
13151 np->tiers.insert(tierpool_id);
13152 np->read_tier = np->write_tier = tierpool_id;
13153 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13154 np->set_last_force_op_resend(pending_inc.epoch);
13155 ntp->set_last_force_op_resend(pending_inc.epoch);
13156 ntp->tier_of = pool_id;
13157 ntp->cache_mode = mode;
11fdf7f2
TL
13158 ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count");
13159 ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period");
13160 ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13161 ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13162 ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13163 ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
7c673cae
FG
13164 ntp->hit_set_params = hsp;
13165 ntp->target_max_bytes = size;
13166 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
13167 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13168 get_last_committed() + 1));
13169 return true;
13170 } else if (prefix == "osd pool set-quota") {
13171 string poolstr;
9f95a23c 13172 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
13173 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13174 if (pool_id < 0) {
13175 ss << "unrecognized pool '" << poolstr << "'";
13176 err = -ENOENT;
13177 goto reply;
13178 }
13179
13180 string field;
9f95a23c 13181 cmd_getval(cmdmap, "field", field);
7c673cae
FG
13182 if (field != "max_objects" && field != "max_bytes") {
13183 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
13184 err = -EINVAL;
13185 goto reply;
13186 }
13187
13188 // val could contain unit designations, so we treat as a string
13189 string val;
9f95a23c 13190 cmd_getval(cmdmap, "val", val);
1adf2230
AA
13191 string tss;
13192 int64_t value;
13193 if (field == "max_objects") {
13194 value = strict_sistrtoll(val.c_str(), &tss);
13195 } else if (field == "max_bytes") {
13196 value = strict_iecstrtoll(val.c_str(), &tss);
13197 } else {
11fdf7f2 13198 ceph_abort_msg("unrecognized option");
1adf2230
AA
13199 }
13200 if (!tss.empty()) {
13201 ss << "error parsing value '" << val << "': " << tss;
13202 err = -EINVAL;
7c673cae
FG
13203 goto reply;
13204 }
13205
13206 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
13207 if (field == "max_objects") {
13208 pi->quota_max_objects = value;
13209 } else if (field == "max_bytes") {
13210 pi->quota_max_bytes = value;
13211 } else {
11fdf7f2 13212 ceph_abort_msg("unrecognized option");
7c673cae
FG
13213 }
13214 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
13215 rs = ss.str();
13216 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13217 get_last_committed() + 1));
13218 return true;
c07f9fc5
FG
13219 } else if (prefix == "osd pool application enable" ||
13220 prefix == "osd pool application disable" ||
13221 prefix == "osd pool application set" ||
13222 prefix == "osd pool application rm") {
13223 err = prepare_command_pool_application(prefix, cmdmap, ss);
11fdf7f2 13224 if (err == -EAGAIN) {
c07f9fc5 13225 goto wait;
11fdf7f2 13226 } else if (err < 0) {
7c673cae 13227 goto reply;
7c673cae 13228 } else {
11fdf7f2 13229 goto update;
7c673cae 13230 }
c07f9fc5
FG
13231 } else if (prefix == "osd force-create-pg") {
13232 pg_t pgid;
13233 string pgidstr;
9f95a23c 13234 cmd_getval(cmdmap, "pgid", pgidstr);
c07f9fc5
FG
13235 if (!pgid.parse(pgidstr.c_str())) {
13236 ss << "invalid pgid '" << pgidstr << "'";
13237 err = -EINVAL;
13238 goto reply;
13239 }
94b18763
FG
13240 if (!osdmap.pg_exists(pgid)) {
13241 ss << "pg " << pgid << " should not exist";
13242 err = -ENOENT;
13243 goto reply;
13244 }
11fdf7f2 13245 bool sure = false;
9f95a23c 13246 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11fdf7f2
TL
13247 if (!sure) {
13248 ss << "This command will recreate a lost (as in data lost) PG with data in it, such "
13249 << "that the cluster will give up ever trying to recover the lost data. Do this "
13250 << "only if you are certain that all copies of the PG are in fact lost and you are "
13251 << "willing to accept that the data is permanently destroyed. Pass "
13252 << "--yes-i-really-mean-it to proceed.";
13253 err = -EPERM;
13254 goto reply;
13255 }
c07f9fc5
FG
13256 bool creating_now;
13257 {
13258 std::lock_guard<std::mutex> l(creating_pgs_lock);
9f95a23c
TL
13259 auto emplaced = creating_pgs.pgs.emplace(
13260 pgid,
13261 creating_pgs_t::pg_create_info(osdmap.get_epoch(),
13262 ceph_clock_now()));
c07f9fc5
FG
13263 creating_now = emplaced.second;
13264 }
13265 if (creating_now) {
13266 ss << "pg " << pgidstr << " now creating, ok";
11fdf7f2
TL
13267 // set the pool's CREATING flag so that (1) the osd won't ignore our
13268 // create message and (2) we won't propose any future pg_num changes
13269 // until after the PG has been instantiated.
13270 if (pending_inc.new_pools.count(pgid.pool()) == 0) {
13271 pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool());
13272 }
13273 pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING;
c07f9fc5
FG
13274 err = 0;
13275 goto update;
13276 } else {
13277 ss << "pg " << pgid << " already creating";
13278 err = 0;
13279 goto reply;
13280 }
7c673cae
FG
13281 } else {
13282 err = -EINVAL;
13283 }
13284
13285 reply:
13286 getline(ss, rs);
13287 if (err < 0 && rs.length() == 0)
13288 rs = cpp_strerror(err);
13289 mon->reply_command(op, err, rs, rdata, get_last_committed());
13290 return ret;
13291
13292 update:
13293 getline(ss, rs);
13294 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13295 get_last_committed() + 1));
13296 return true;
13297
13298 wait:
13299 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13300 return true;
13301}
13302
28e407b8 13303bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
7c673cae
FG
13304{
13305 op->mark_osdmon_event(__func__);
28e407b8 13306
9f95a23c 13307 auto m = op->get_req<MPoolOp>();
11fdf7f2 13308 MonSession *session = op->get_session();
28e407b8
AA
13309 if (!session) {
13310 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13311 return true;
13312 }
13313
13314 switch (m->op) {
13315 case POOL_OP_CREATE_UNMANAGED_SNAP:
13316 case POOL_OP_DELETE_UNMANAGED_SNAP:
13317 {
13318 const std::string* pool_name = nullptr;
13319 const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
13320 if (pg_pool != nullptr) {
13321 pool_name = &osdmap.get_pool_name(m->pool);
13322 }
13323
13324 if (!is_unmanaged_snap_op_permitted(cct, mon->key_server,
13325 session->entity_name, session->caps,
11fdf7f2 13326 session->get_peer_socket_addr(),
28e407b8
AA
13327 pool_name)) {
13328 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
13329 << "privileges. message: " << *m << std::endl
13330 << "caps: " << session->caps << dendl;
13331 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13332 return true;
13333 }
13334 }
13335 break;
13336 default:
13337 if (!session->is_capable("osd", MON_CAP_W)) {
13338 dout(0) << "got pool op from entity with insufficient privileges. "
13339 << "message: " << *m << std::endl
13340 << "caps: " << session->caps << dendl;
13341 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13342 return true;
13343 }
13344 break;
13345 }
13346
13347 return false;
13348}
13349
// Fast path for pool ops: answer anything that can be decided from the
// committed osdmap alone, without proposing a map change. Returns true if
// a reply was sent (op fully handled); false if the op must go through the
// prepare/update path.
bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();

  // cap check replies -EPERM itself when it rejects
  if (enforce_pool_op_caps(op)) {
    return true;
  }

  // drop messages addressed to a different cluster
  if (m->fsid != mon->monmap->fsid) {
    dout(0) << __func__ << " drop message on fsid " << m->fsid
            << " != " << mon->monmap->fsid << " for " << *m << dendl;
    _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
    return true;
  }

  if (m->op == POOL_OP_CREATE)
    return preprocess_pool_op_create(op);

  const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
  if (p == nullptr) {
    dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
    if (m->op == POOL_OP_DELETE) {
      // deleting an already-gone pool is a success (idempotent)
      _pool_op_reply(op, 0, osdmap.get_epoch());
    } else {
      _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    }
    return true;
  }

  // check if the snap and snapname exist
  bool snap_exists = false;
  if (p->snap_exists(m->name.c_str()))
    snap_exists = true;

  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // pool snaps are not allowed on unmanaged-snaps pools or on tiers
    if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (snap_exists) {
      // already exists: success without a map change (idempotent)
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // pool snaps and unmanaged snaps are mutually exclusive modes
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_SNAP:
    if (p->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (!snap_exists) {
      // already gone: success without a map change (idempotent)
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (_is_removed_snap(m->pool, m->snapid)) {
      // snap already removed or purged: success (idempotent)
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE:
    // NOTE(review): this acks with success while the pool name still
    // resolves in the osdmap — confirm the intended idempotency semantics
    // vs. the id-based existence check above before relying on it.
    if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_AUID_CHANGE:
    return false;
  default:
    ceph_abort();
    break;
  }

  return false;
}
13437
9f95a23c
TL
13438bool OSDMonitor::_is_removed_snap(int64_t pool, snapid_t snap)
13439{
13440 if (!osdmap.have_pg_pool(pool)) {
13441 dout(10) << __func__ << " pool " << pool << " snap " << snap
13442 << " - pool dne" << dendl;
13443 return true;
13444 }
13445 if (osdmap.in_removed_snaps_queue(pool, snap)) {
13446 dout(10) << __func__ << " pool " << pool << " snap " << snap
13447 << " - in osdmap removed_snaps_queue" << dendl;
13448 return true;
13449 }
13450 snapid_t begin, end;
13451 int r = lookup_purged_snap(pool, snap, &begin, &end);
13452 if (r == 0) {
13453 dout(10) << __func__ << " pool " << pool << " snap " << snap
13454 << " - purged, [" << begin << "," << end << ")" << dendl;
13455 return true;
13456 }
13457 return false;
13458}
13459
13460bool OSDMonitor::_is_pending_removed_snap(int64_t pool, snapid_t snap)
13461{
13462 if (pending_inc.old_pools.count(pool)) {
13463 dout(10) << __func__ << " pool " << pool << " snap " << snap
13464 << " - pool pending deletion" << dendl;
13465 return true;
13466 }
13467 if (pending_inc.in_new_removed_snaps(pool, snap)) {
13468 dout(10) << __func__ << " pool " << pool << " snap " << snap
13469 << " - in pending new_removed_snaps" << dendl;
13470 return true;
13471 }
13472 return false;
13473}
13474
7c673cae
FG
13475bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
13476{
13477 op->mark_osdmon_event(__func__);
9f95a23c 13478 auto m = op->get_req<MPoolOp>();
7c673cae
FG
13479 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
13480 if (pool >= 0) {
13481 _pool_op_reply(op, 0, osdmap.get_epoch());
13482 return true;
13483 }
13484
13485 return false;
13486}
13487
// Slow path for pool ops: validate the request against the committed pool
// state, stage the change in pending_inc, and defer the client reply until
// the proposal commits. Returns true when the op has been queued on the
// pending proposal; false when it was replied to immediately.
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();
  dout(10) << "prepare_pool_op " << *m << dendl;
  // pool create/delete have dedicated prepare paths
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  // first pass: validate against the *committed* pool state; several of
  // these branches reply immediately (idempotent or invalid requests)
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // pool snaps are not allowed on cache tiers
    if (pool->is_tier()) {
      ret = -EINVAL;
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;
    }  // else, fall through
  case POOL_OP_DELETE_SNAP:
    if (!pool->is_unmanaged_snaps_mode()) {
      bool snap_exists = pool->snap_exists(m->name.c_str());
      // create of an existing snap / delete of a missing snap: success, no-op
      if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
        || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
        ret = 0;
      } else {
        break;
      }
    } else {
      ret = -EINVAL;
    }
    _pool_op_reply(op, ret, osdmap.get_epoch());
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // we won't allow removal of an unmanaged snapshot from a pool
    // not in unmanaged snaps mode.
    if (!pool->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
      return false;
    }
    /* fall-thru */
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // but we will allow creating an unmanaged snapshot on any pool
    // as long as it is not in 'pool' snaps mode.
    if (pool->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return false;
    }
  }

  // projected pool info: start from the pending version if one exists so
  // we build on changes already staged in this proposal
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // pool snaps vs unmanaged snaps are mutually exclusive; re-check against
  // the projected state, which may differ from the committed one
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // second pass: apply the mutation to the projected pool info
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name
               << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
        pp.remove_snap(s);
        pending_inc.new_removed_snaps[m->pool].insert(s);
        changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      // pre-octopus clusters need the legacy removed_snaps encoding
      uint64_t snapid = pp.add_unmanaged_snap(
        osdmap.require_osd_release < ceph_release_t::octopus);
      // the newly assigned snapid is returned to the client
      encode(snapid, reply_data);
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (!_is_removed_snap(m->pool, m->snapid) &&
        !_is_pending_removed_snap(m->pool, m->snapid)) {
      // a snapid beyond the pool's current seq was never allocated
      if (m->snapid > pp.get_snap_seq()) {
        _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
        return false;
      }
      pp.remove_unmanaged_snap(
        m->snapid,
        osdmap.require_osd_release < ceph_release_t::octopus);
      pending_inc.new_removed_snaps[m->pool].insert(m->snapid);
      // also record the new seq as purged: this avoids a discontinuity
      // after all of the snaps have been purged, since the seq assigned
      // during removal lives in the same namespace as the actual snaps.
      pending_pseudo_purged_snaps[m->pool].insert(pp.get_snap_seq());
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    // auid support was removed; always reject
    _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
    return false;

  default:
    ceph_abort();
    break;
  }

  if (changed) {
    // bump the snap epoch and stage the projected pool in the incremental
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  // reply (with ret and any reply_data) once the proposal commits
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
13642
13643bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
13644{
13645 op->mark_osdmon_event(__func__);
13646 int err = prepare_new_pool(op);
13647 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
13648 return true;
13649}
13650
13651int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
13652 ostream *ss)
13653{
13654 const string& poolstr = osdmap.get_pool_name(pool_id);
13655
13656 // If the Pool is in use by CephFS, refuse to delete it
28e407b8 13657 FSMap const &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
7c673cae
FG
13658 if (pending_fsmap.pool_in_use(pool_id)) {
13659 *ss << "pool '" << poolstr << "' is in use by CephFS";
13660 return -EBUSY;
13661 }
13662
13663 if (pool.tier_of >= 0) {
13664 *ss << "pool '" << poolstr << "' is a tier of '"
13665 << osdmap.get_pool_name(pool.tier_of) << "'";
13666 return -EBUSY;
13667 }
13668 if (!pool.tiers.empty()) {
13669 *ss << "pool '" << poolstr << "' has tiers";
13670 for(auto tier : pool.tiers) {
13671 *ss << " " << osdmap.get_pool_name(tier);
13672 }
13673 return -EBUSY;
13674 }
13675
11fdf7f2 13676 if (!g_conf()->mon_allow_pool_delete) {
7c673cae
FG
13677 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
13678 return -EPERM;
13679 }
13680
13681 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
13682 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
13683 return -EPERM;
13684 }
13685
13686 *ss << "pool '" << poolstr << "' removed";
13687 return 0;
13688}
13689
13690/**
13691 * Check if it is safe to add a tier to a base pool
13692 *
13693 * @return
13694 * True if the operation should proceed, false if we should abort here
13695 * (abort doesn't necessarily mean error, could be idempotency)
13696 */
13697bool OSDMonitor::_check_become_tier(
13698 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
13699 const int64_t base_pool_id, const pg_pool_t *base_pool,
13700 int *err,
13701 ostream *ss) const
13702{
13703 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
13704 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
13705
28e407b8 13706 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
7c673cae
FG
13707 if (pending_fsmap.pool_in_use(tier_pool_id)) {
13708 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
13709 *err = -EBUSY;
13710 return false;
13711 }
13712
13713 if (base_pool->tiers.count(tier_pool_id)) {
11fdf7f2 13714 ceph_assert(tier_pool->tier_of == base_pool_id);
7c673cae
FG
13715 *err = 0;
13716 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
13717 << base_pool_name << "'";
13718 return false;
13719 }
13720
13721 if (base_pool->is_tier()) {
13722 *ss << "pool '" << base_pool_name << "' is already a tier of '"
13723 << osdmap.get_pool_name(base_pool->tier_of) << "', "
13724 << "multiple tiers are not yet supported.";
13725 *err = -EINVAL;
13726 return false;
13727 }
13728
13729 if (tier_pool->has_tiers()) {
13730 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
13731 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
13732 it != tier_pool->tiers.end(); ++it)
13733 *ss << "'" << osdmap.get_pool_name(*it) << "',";
13734 *ss << " multiple tiers are not yet supported.";
13735 *err = -EINVAL;
13736 return false;
13737 }
13738
13739 if (tier_pool->is_tier()) {
13740 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
13741 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
13742 *err = -EINVAL;
13743 return false;
13744 }
13745
13746 *err = 0;
13747 return true;
13748}
13749
13750
13751/**
13752 * Check if it is safe to remove a tier from this base pool
13753 *
13754 * @return
13755 * True if the operation should proceed, false if we should abort here
13756 * (abort doesn't necessarily mean error, could be idempotency)
13757 */
13758bool OSDMonitor::_check_remove_tier(
13759 const int64_t base_pool_id, const pg_pool_t *base_pool,
13760 const pg_pool_t *tier_pool,
13761 int *err, ostream *ss) const
13762{
13763 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
13764
13765 // Apply CephFS-specific checks
28e407b8 13766 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
7c673cae 13767 if (pending_fsmap.pool_in_use(base_pool_id)) {
94b18763
FG
13768 if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
13769 // If the underlying pool is erasure coded and does not allow EC
13770 // overwrites, we can't permit the removal of the replicated tier that
13771 // CephFS relies on to access it
13772 *ss << "pool '" << base_pool_name <<
13773 "' does not allow EC overwrites and is in use by CephFS"
13774 " via its tier";
7c673cae
FG
13775 *err = -EBUSY;
13776 return false;
13777 }
13778
13779 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
13780 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
13781 "tier is still in use as a writeback cache. Change the cache "
13782 "mode and flush the cache before removing it";
13783 *err = -EBUSY;
13784 return false;
13785 }
13786 }
13787
13788 *err = 0;
13789 return true;
13790}
13791
13792int OSDMonitor::_prepare_remove_pool(
13793 int64_t pool, ostream *ss, bool no_fake)
13794{
224ce89b 13795 dout(10) << __func__ << " " << pool << dendl;
7c673cae
FG
13796 const pg_pool_t *p = osdmap.get_pg_pool(pool);
13797 int r = _check_remove_pool(pool, *p, ss);
13798 if (r < 0)
13799 return r;
13800
13801 auto new_pool = pending_inc.new_pools.find(pool);
13802 if (new_pool != pending_inc.new_pools.end()) {
13803 // if there is a problem with the pending info, wait and retry
13804 // this op.
13805 const auto& p = new_pool->second;
13806 int r = _check_remove_pool(pool, p, ss);
13807 if (r < 0)
13808 return -EAGAIN;
13809 }
13810
13811 if (pending_inc.old_pools.count(pool)) {
224ce89b 13812 dout(10) << __func__ << " " << pool << " already pending removal"
7c673cae
FG
13813 << dendl;
13814 return 0;
13815 }
13816
11fdf7f2 13817 if (g_conf()->mon_fake_pool_delete && !no_fake) {
7c673cae
FG
13818 string old_name = osdmap.get_pool_name(pool);
13819 string new_name = old_name + "." + stringify(pool) + ".DELETED";
13820 dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
13821 << old_name << " -> " << new_name << dendl;
13822 pending_inc.new_pool_names[pool] = new_name;
13823 return 0;
13824 }
13825
13826 // remove
13827 pending_inc.old_pools.insert(pool);
13828
224ce89b 13829 // remove any pg_temp mappings for this pool
7c673cae
FG
13830 for (auto p = osdmap.pg_temp->begin();
13831 p != osdmap.pg_temp->end();
13832 ++p) {
11fdf7f2 13833 if (p->first.pool() == pool) {
224ce89b 13834 dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
7c673cae
FG
13835 << p->first << dendl;
13836 pending_inc.new_pg_temp[p->first].clear();
13837 }
13838 }
224ce89b 13839 // remove any primary_temp mappings for this pool
7c673cae
FG
13840 for (auto p = osdmap.primary_temp->begin();
13841 p != osdmap.primary_temp->end();
13842 ++p) {
11fdf7f2 13843 if (p->first.pool() == pool) {
224ce89b 13844 dout(10) << __func__ << " " << pool
7c673cae
FG
13845 << " removing obsolete primary_temp" << p->first << dendl;
13846 pending_inc.new_primary_temp[p->first] = -1;
13847 }
13848 }
224ce89b
WB
13849 // remove any pg_upmap mappings for this pool
13850 for (auto& p : osdmap.pg_upmap) {
11fdf7f2 13851 if (p.first.pool() == pool) {
224ce89b
WB
13852 dout(10) << __func__ << " " << pool
13853 << " removing obsolete pg_upmap "
13854 << p.first << dendl;
13855 pending_inc.old_pg_upmap.insert(p.first);
13856 }
13857 }
94b18763
FG
13858 // remove any pending pg_upmap mappings for this pool
13859 {
13860 auto it = pending_inc.new_pg_upmap.begin();
13861 while (it != pending_inc.new_pg_upmap.end()) {
11fdf7f2 13862 if (it->first.pool() == pool) {
94b18763
FG
13863 dout(10) << __func__ << " " << pool
13864 << " removing pending pg_upmap "
13865 << it->first << dendl;
13866 it = pending_inc.new_pg_upmap.erase(it);
13867 } else {
13868 it++;
13869 }
13870 }
13871 }
224ce89b
WB
13872 // remove any pg_upmap_items mappings for this pool
13873 for (auto& p : osdmap.pg_upmap_items) {
11fdf7f2 13874 if (p.first.pool() == pool) {
224ce89b
WB
13875 dout(10) << __func__ << " " << pool
13876 << " removing obsolete pg_upmap_items " << p.first
13877 << dendl;
13878 pending_inc.old_pg_upmap_items.insert(p.first);
13879 }
13880 }
94b18763
FG
13881 // remove any pending pg_upmap mappings for this pool
13882 {
13883 auto it = pending_inc.new_pg_upmap_items.begin();
13884 while (it != pending_inc.new_pg_upmap_items.end()) {
11fdf7f2 13885 if (it->first.pool() == pool) {
94b18763
FG
13886 dout(10) << __func__ << " " << pool
13887 << " removing pending pg_upmap_items "
13888 << it->first << dendl;
13889 it = pending_inc.new_pg_upmap_items.erase(it);
13890 } else {
13891 it++;
13892 }
13893 }
13894 }
35e4c445
FG
13895
13896 // remove any choose_args for this pool
13897 CrushWrapper newcrush;
13898 _get_pending_crush(newcrush);
13899 if (newcrush.have_choose_args(pool)) {
13900 dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
13901 newcrush.rm_choose_args(pool);
13902 pending_inc.crush.clear();
13903 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
13904 }
7c673cae
FG
13905 return 0;
13906}
13907
13908int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
13909{
13910 dout(10) << "_prepare_rename_pool " << pool << dendl;
13911 if (pending_inc.old_pools.count(pool)) {
13912 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
13913 return -ENOENT;
13914 }
13915 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
13916 p != pending_inc.new_pool_names.end();
13917 ++p) {
13918 if (p->second == newname && p->first != pool) {
13919 return -EEXIST;
13920 }
13921 }
13922
13923 pending_inc.new_pool_names[pool] = newname;
13924 return 0;
13925}
13926
13927bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
13928{
13929 op->mark_osdmon_event(__func__);
9f95a23c 13930 auto m = op->get_req<MPoolOp>();
7c673cae
FG
13931 ostringstream ss;
13932 int ret = _prepare_remove_pool(m->pool, &ss, false);
13933 if (ret == -EAGAIN) {
13934 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13935 return true;
13936 }
13937 if (ret < 0)
13938 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
13939 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
13940 pending_inc.epoch));
13941 return true;
13942}
13943
13944void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
13945 int ret, epoch_t epoch, bufferlist *blp)
13946{
13947 op->mark_osdmon_event(__func__);
9f95a23c 13948 auto m = op->get_req<MPoolOp>();
7c673cae
FG
13949 dout(20) << "_pool_op_reply " << ret << dendl;
13950 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
13951 ret, epoch, get_last_committed(), blp);
13952 mon->send_reply(op, reply);
13953}
81eedcae
TL
13954
13955void OSDMonitor::convert_pool_priorities(void)
13956{
13957 pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key;
13958 int64_t max_prio = 0;
13959 int64_t min_prio = 0;
13960 for (const auto &i : osdmap.get_pools()) {
13961 const auto &pool = i.second;
13962
13963 if (pool.opts.is_set(key)) {
9f95a23c 13964 int64_t prio = 0;
81eedcae
TL
13965 pool.opts.get(key, &prio);
13966 if (prio > max_prio)
13967 max_prio = prio;
13968 if (prio < min_prio)
13969 min_prio = prio;
13970 }
13971 }
13972 if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) {
13973 dout(20) << __func__ << " nothing to fix" << dendl;
13974 return;
13975 }
13976 // Current pool priorities exceeds new maximum
13977 for (const auto &i : osdmap.get_pools()) {
13978 const auto pool_id = i.first;
13979 pg_pool_t pool = i.second;
13980
13981 int64_t prio = 0;
13982 pool.opts.get(key, &prio);
13983 int64_t n;
13984
13985 if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario
13986 // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
13987 n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX;
13988 } else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) {
13989 // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
13990 n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN;
13991 } else {
13992 continue;
13993 }
13994 if (n == 0) {
13995 pool.opts.unset(key);
13996 } else {
13997 pool.opts.set(key, static_cast<int64_t>(n));
13998 }
13999 dout(10) << __func__ << " pool " << pool_id
14000 << " recovery_priority adjusted "
14001 << prio << " to " << n << dendl;
14002 pool.last_change = pending_inc.epoch;
14003 pending_inc.new_pools[pool_id] = pool;
14004 }
14005}