]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/OSDMonitor.cc
import 15.2.5
[ceph.git] / ceph / src / mon / OSDMonitor.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19#include <algorithm>
224ce89b 20#include <boost/algorithm/string.hpp>
11fdf7f2 21#include <experimental/iterator>
224ce89b 22#include <locale>
7c673cae
FG
23#include <sstream>
24
31f18b77
FG
25#include "mon/OSDMonitor.h"
26#include "mon/Monitor.h"
27#include "mon/MDSMonitor.h"
31f18b77
FG
28#include "mon/MgrStatMonitor.h"
29#include "mon/AuthMonitor.h"
30#include "mon/ConfigKeyService.h"
7c673cae 31
31f18b77
FG
32#include "mon/MonitorDBStore.h"
33#include "mon/Session.h"
7c673cae
FG
34
35#include "crush/CrushWrapper.h"
36#include "crush/CrushTester.h"
37#include "crush/CrushTreeDumper.h"
38
39#include "messages/MOSDBeacon.h"
40#include "messages/MOSDFailure.h"
41#include "messages/MOSDMarkMeDown.h"
9f95a23c 42#include "messages/MOSDMarkMeDead.h"
7c673cae
FG
43#include "messages/MOSDFull.h"
44#include "messages/MOSDMap.h"
45#include "messages/MMonGetOSDMap.h"
46#include "messages/MOSDBoot.h"
47#include "messages/MOSDAlive.h"
48#include "messages/MPoolOp.h"
49#include "messages/MPoolOpReply.h"
50#include "messages/MOSDPGCreate.h"
11fdf7f2 51#include "messages/MOSDPGCreate2.h"
7c673cae
FG
52#include "messages/MOSDPGCreated.h"
53#include "messages/MOSDPGTemp.h"
11fdf7f2 54#include "messages/MOSDPGReadyToMerge.h"
7c673cae
FG
55#include "messages/MMonCommand.h"
56#include "messages/MRemoveSnaps.h"
57#include "messages/MOSDScrub.h"
58#include "messages/MRoute.h"
9f95a23c
TL
59#include "messages/MMonGetPurgedSnaps.h"
60#include "messages/MMonGetPurgedSnapsReply.h"
7c673cae
FG
61
62#include "common/TextTable.h"
63#include "common/Timer.h"
64#include "common/ceph_argparse.h"
65#include "common/perf_counters.h"
eafe8130 66#include "common/PriorityCache.h"
7c673cae 67#include "common/strtol.h"
11fdf7f2 68#include "common/numa.h"
7c673cae
FG
69
70#include "common/config.h"
71#include "common/errno.h"
72
73#include "erasure-code/ErasureCodePlugin.h"
74#include "compressor/Compressor.h"
75#include "common/Checksummer.h"
76
77#include "include/compat.h"
11fdf7f2 78#include "include/ceph_assert.h"
7c673cae
FG
79#include "include/stringify.h"
80#include "include/util.h"
81#include "common/cmdparse.h"
82#include "include/str_list.h"
83#include "include/str_map.h"
224ce89b 84#include "include/scope_guard.h"
eafe8130 85#include "perfglue/heap_profiler.h"
7c673cae 86
28e407b8
AA
87#include "auth/cephx/CephxKeyServer.h"
88#include "osd/OSDCap.h"
89
7c673cae
FG
90#include "json_spirit/json_spirit_reader.h"
91
c07f9fc5
FG
92#include <boost/algorithm/string/predicate.hpp>
93
7c673cae 94#define dout_subsys ceph_subsys_mon
3efd9988
FG
// Key prefixes used by the OSDMonitor for data it keeps in the mon's
// kv store outside the per-epoch paxos versions (see
// OSDMonitor::get_store_prefixes()).
static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
static const string OSD_METADATA_PREFIX("osd_metadata");
static const string OSD_SNAP_PREFIX("osd_snap");
7c673cae 98
9f95a23c
TL
99/*
100
101 OSD snapshot metadata
102 ---------------------
103
104 -- starting with mimic, removed in octopus --
105
106 "removed_epoch_%llu_%08lx" % (pool, epoch)
107 -> interval_set<snapid_t>
108
109 "removed_snap_%llu_%016llx" % (pool, last_snap)
110 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
111
112
113 -- starting with mimic --
114
115 "purged_snap_%llu_%016llx" % (pool, last_snap)
116 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
117
118 - note that the {removed,purged}_snap put the last snap in they key so
119 that we can use forward iteration only to search for an epoch in an
120 interval. e.g., to test if epoch N is removed/purged, we'll find a key
121 >= N that either does or doesn't contain the given snap.
122
123
124 -- starting with octopus --
125
126 "purged_epoch_%08lx" % epoch
127 -> map<int64_t,interval_set<snapid_t>>
128
129 */
130using namespace TOPNSPC::common;
c07f9fc5
FG
131namespace {
132
eafe8130
TL
// Adapter exposing one of the OSDMonitor's osdmap LRU caches to the
// PriorityCache manager so its memory budget can be tuned alongside
// the rocksdb block cache.  Subclasses report the bytes actually used
// by the underlying cache.
struct OSDMemCache : public PriorityCache::PriCache {
  OSDMonitor *osdmon;
  // bytes assigned per priority level (only PRI1 is used today)
  int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
  int64_t committed_bytes = 0;  // last value produced by commit_cache_size()
  double cache_ratio = 0;

  OSDMemCache(OSDMonitor *m) : osdmon(m) {};

  // Bytes currently consumed by the underlying cache.
  virtual uint64_t _get_used_bytes() const = 0;

  // How many additional bytes we would like at priority 'pri' given
  // the overall budget; unsupported priorities yield -EOPNOTSUPP.
  virtual int64_t request_cache_bytes(
      PriorityCache::Priority pri, uint64_t total_cache) const {
    int64_t assigned = get_cache_bytes(pri);

    switch (pri) {
    // All cache items are currently set to have PRI1 priority
    case PriorityCache::Priority::PRI1:
      {
        int64_t request = _get_used_bytes();
        // only ask for the shortfall, never a negative amount
        return (request > assigned) ? request - assigned : 0;
      }
    default:
      break;
    }
    return -EOPNOTSUPP;
  }

  virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
    return cache_bytes[pri];
  }

  // Sum of the per-priority assignments.
  virtual int64_t get_cache_bytes() const {
    int64_t total = 0;

    for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
      PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
      total += get_cache_bytes(pri);
    }
    return total;
  }

  virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] = bytes;
  }
  virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] += bytes;
  }
  // Round the current assignment to a chunk of the total budget and
  // remember it as the committed size.
  virtual int64_t commit_cache_size(uint64_t total_cache) {
    committed_bytes = PriorityCache::get_chunk(
      get_cache_bytes(), total_cache);
    return committed_bytes;
  }
  virtual int64_t get_committed_size() const {
    return committed_bytes;
  }
  virtual double get_cache_ratio() const {
    return cache_ratio;
  }
  virtual void set_cache_ratio(double ratio) {
    cache_ratio = ratio;
  }
  virtual string get_cache_name() const = 0;
};
196
197struct IncCache : public OSDMemCache {
198 IncCache(OSDMonitor *m) : OSDMemCache(m) {};
199
200 virtual uint64_t _get_used_bytes() const {
201 return osdmon->inc_osd_cache.get_bytes();
202 }
203
204 virtual string get_cache_name() const {
205 return "OSDMap Inc Cache";
206 }
207
208 uint64_t _get_num_osdmaps() const {
209 return osdmon->inc_osd_cache.get_size();
210 }
211};
212
213struct FullCache : public OSDMemCache {
214 FullCache(OSDMonitor *m) : OSDMemCache(m) {};
215
216 virtual uint64_t _get_used_bytes() const {
217 return osdmon->full_osd_cache.get_bytes();
218 }
219
220 virtual string get_cache_name() const {
221 return "OSDMap Full Cache";
222 }
223
224 uint64_t _get_num_osdmaps() const {
225 return osdmon->full_osd_cache.get_size();
226 }
227};
228
// File-scope handles to the cache adapters above; created by the
// OSDMonitor constructor and registered with the priority cache
// manager in OSDMonitor::register_cache_with_pcm().
std::shared_ptr<IncCache> inc_cache;
std::shared_ptr<FullCache> full_cache;

// Limits on per-pool application metadata (presumably enforced by the
// pool-application command handlers — not visible in this chunk).
const uint32_t MAX_POOL_APPLICATIONS = 4;
const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
235
28e407b8
AA
236bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
237 // Note: this doesn't include support for the application tag match
238 if ((grant.spec.allow & OSD_CAP_W) != 0) {
239 auto& match = grant.match;
240 if (match.is_match_all()) {
241 return true;
11fdf7f2 242 } else if (pool_name != nullptr &&
28e407b8
AA
243 !match.pool_namespace.pool_name.empty() &&
244 match.pool_namespace.pool_name == *pool_name) {
245 return true;
246 }
247 }
248 return false;
249}
250
251bool is_unmanaged_snap_op_permitted(CephContext* cct,
252 const KeyServer& key_server,
253 const EntityName& entity_name,
254 const MonCap& mon_caps,
11fdf7f2 255 const entity_addr_t& peer_socket_addr,
28e407b8
AA
256 const std::string* pool_name)
257{
258 typedef std::map<std::string, std::string> CommandArgs;
259
11fdf7f2 260 if (mon_caps.is_capable(
92f5a8d4 261 cct, entity_name, "osd",
11fdf7f2
TL
262 "osd pool op unmanaged-snap",
263 (pool_name == nullptr ?
264 CommandArgs{} /* pool DNE, require unrestricted cap */ :
265 CommandArgs{{"poolname", *pool_name}}),
266 false, true, false,
267 peer_socket_addr)) {
28e407b8
AA
268 return true;
269 }
270
271 AuthCapsInfo caps_info;
272 if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
273 caps_info)) {
274 dout(10) << "unable to locate OSD cap data for " << entity_name
275 << " in auth db" << dendl;
276 return false;
277 }
278
279 string caps_str;
280 if (caps_info.caps.length() > 0) {
11fdf7f2 281 auto p = caps_info.caps.cbegin();
28e407b8
AA
282 try {
283 decode(caps_str, p);
284 } catch (const buffer::error &err) {
285 derr << "corrupt OSD cap data for " << entity_name << " in auth db"
286 << dendl;
287 return false;
288 }
289 }
290
291 OSDCap osd_cap;
292 if (!osd_cap.parse(caps_str, nullptr)) {
293 dout(10) << "unable to parse OSD cap data for " << entity_name
294 << " in auth db" << dendl;
295 return false;
296 }
297
298 // if the entity has write permissions in one or all pools, permit
299 // usage of unmanaged-snapshots
300 if (osd_cap.allow_all()) {
301 return true;
302 }
303
304 for (auto& grant : osd_cap.grants) {
305 if (grant.profile.is_valid()) {
306 for (auto& profile_grant : grant.profile_grants) {
307 if (is_osd_writable(profile_grant, pool_name)) {
308 return true;
309 }
310 }
311 } else if (is_osd_writable(grant, pool_name)) {
312 return true;
313 }
314 }
315
316 return false;
317}
318
c07f9fc5
FG
319} // anonymous namespace
320
7c673cae
FG
// Record that pg 'ps' of this pool reached 'last_epoch_clean'.
// Maintains three pieces of state:
//  - epoch_by_pg: last clean epoch per pg (0 = never reported)
//  - floor: minimum of the reported epochs
//  - next_missing: first pg index that has not reported yet
void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean)
{
  if (epoch_by_pg.size() <= ps) {
    // first report for this pg; grow the vector, new slots read 0
    epoch_by_pg.resize(ps + 1, 0);
  }
  const auto old_lec = epoch_by_pg[ps];
  if (old_lec >= last_epoch_clean) {
    // stale lec
    return;
  }
  epoch_by_pg[ps] = last_epoch_clean;
  if (last_epoch_clean < floor) {
    floor = last_epoch_clean;
  } else if (last_epoch_clean > floor) {
    if (old_lec == floor) {
      // probably should increase floor?
      // NOTE: min_element also sees 0 entries for unreported pgs, so
      // the recomputed floor stays pessimistic until all pgs report.
      auto new_floor = std::min_element(std::begin(epoch_by_pg),
                                        std::end(epoch_by_pg));
      floor = *new_floor;
    }
  }
  if (ps != next_missing) {
    return;
  }
  // advance next_missing past the contiguous run of reported pgs
  for (; next_missing < epoch_by_pg.size(); next_missing++) {
    if (epoch_by_pg[next_missing] == 0) {
      break;
    }
  }
}
351
// Forget all last-epoch-clean state for a (deleted) pool.
void LastEpochClean::remove_pool(uint64_t pool)
{
  report_by_pool.erase(pool);
}
356
357void LastEpochClean::report(const pg_t& pg, epoch_t last_epoch_clean)
358{
359 auto& lec = report_by_pool[pg.pool()];
360 return lec.report(pg.ps(), last_epoch_clean);
361}
362
363epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
364{
365 auto floor = latest.get_epoch();
366 for (auto& pool : latest.get_pools()) {
367 auto reported = report_by_pool.find(pool.first);
368 if (reported == report_by_pool.end()) {
369 return 0;
370 }
371 if (reported->second.next_missing < pool.second.get_pg_num()) {
372 return 0;
373 }
374 if (reported->second.floor < floor) {
375 floor = reported->second.floor;
376 }
377 }
378 return floor;
379}
380
1911f103
TL
381void LastEpochClean::dump(Formatter *f) const
382{
383 f->open_array_section("per_pool");
384
385 for (auto& it : report_by_pool) {
386 f->open_object_section("pool");
387 f->dump_unsigned("poolid", it.first);
388 f->dump_unsigned("floor", it.second.floor);
389 f->close_section();
390 }
391
392 f->close_section();
393}
7c673cae 394
11fdf7f2
TL
395class C_UpdateCreatingPGs : public Context {
396public:
7c673cae
FG
397 OSDMonitor *osdmon;
398 utime_t start;
399 epoch_t epoch;
400 C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
401 osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
402 void finish(int r) override {
403 if (r >= 0) {
404 utime_t end = ceph_clock_now();
405 dout(10) << "osdmap epoch " << epoch << " mapping took "
406 << (end - start) << " seconds" << dendl;
407 osdmon->update_creating_pgs();
408 osdmon->check_pg_creates_subs();
409 }
410 }
411};
412
#undef dout_prefix
#define dout_prefix _prefix(_dout, mon, osdmap)
// Debug-log prefix for this file:
//   "mon.<name>@<rank>(<state>).osd e<epoch> "
static ostream& _prefix(std::ostream *_dout, Monitor *mon, const OSDMap& osdmap) {
  return *_dout << "mon." << mon->name << "@" << mon->rank
		<< "(" << mon->get_state_name()
		<< ").osd e" << osdmap.get_epoch() << " ";
}
420
// Construct the OSD monitor paxos service.  Creates the inc/full
// osdmap LRU caches, publishes their pcm adapters through the
// file-scope inc_cache/full_cache handles, registers this object as a
// config observer, and applies the initial cache sizing.
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor *mn,
  Paxos *p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf()->mon_osd_cache_size),
   full_osd_cache(g_conf()->mon_osd_cache_size),
   has_osdmap_manifest(false),
   mapper(mn->cct, &mn->cpu_tp)
{
  // adapters used by the priority cache manager (see
  // register_cache_with_pcm())
  inc_cache = std::make_shared<IncCache>(this);
  full_cache = std::make_shared<FullCache>(this);
  // watch for mon_memory_* / rocksdb_cache_size runtime changes
  cct->_conf.add_observer(this);
  int r = _set_cache_sizes();
  if (r < 0) {
    // fall back to plain (non-autotuned) LRU cache sizing
    derr << __func__ << " using default osd cache size - mon_osd_cache_size ("
         << g_conf()->mon_osd_cache_size
         << ") without priority cache management"
         << dendl;
  }
}
444
445const char **OSDMonitor::get_tracked_conf_keys() const
446{
447 static const char* KEYS[] = {
448 "mon_memory_target",
449 "mon_memory_autotune",
450 "rocksdb_cache_size",
451 NULL
452 };
453 return KEYS;
454}
455
456void OSDMonitor::handle_conf_change(const ConfigProxy& conf,
457 const std::set<std::string> &changed)
458{
459 dout(10) << __func__ << " " << changed << dendl;
460
461 if (changed.count("mon_memory_autotune")) {
462 _set_cache_autotuning();
463 }
464 if (changed.count("mon_memory_target") ||
465 changed.count("rocksdb_cache_size")) {
466 int r = _update_mon_cache_settings();
467 if (r < 0) {
468 derr << __func__ << " mon_memory_target:"
469 << g_conf()->mon_memory_target
470 << " rocksdb_cache_size:"
471 << g_conf()->rocksdb_cache_size
92f5a8d4 472 << ". Unable to update cache size."
eafe8130
TL
473 << dendl;
474 }
475 }
476}
477
478void OSDMonitor::_set_cache_autotuning()
479{
480 if (!g_conf()->mon_memory_autotune && pcm != nullptr) {
481 // Disable cache autotuning
482 std::lock_guard l(balancer_lock);
483 pcm = nullptr;
484 }
485
486 if (g_conf()->mon_memory_autotune && pcm == nullptr) {
487 int r = register_cache_with_pcm();
488 if (r < 0) {
489 dout(10) << __func__
490 << " Error while registering osdmon caches with pcm."
491 << " Cache auto tuning not enabled."
492 << dendl;
493 mon_memory_autotune = false;
494 } else {
495 mon_memory_autotune = true;
496 }
497 }
498}
499
// Re-read mon_memory_target / rocksdb_cache_size and re-balance the
// priority cache manager accordingly.
// Returns 0 on success; -EINVAL when the new sizes are invalid or no
// cache infrastructure exists.  If the ratio update fails, the
// previous sizes are restored before returning.
int OSDMonitor::_update_mon_cache_settings()
{
  if (g_conf()->mon_memory_target <= 0 ||
      g_conf()->mon_memory_target < mon_memory_min ||
      g_conf()->rocksdb_cache_size <= 0) {
    return -EINVAL;
  }

  if (pcm == nullptr && rocksdb_binned_kv_cache == nullptr) {
    derr << __func__ << " not using pcm and rocksdb" << dendl;
    return -EINVAL;
  }

  // remember the old sizes so we can roll back if the ratios fail
  uint64_t old_mon_memory_target = mon_memory_target;
  uint64_t old_rocksdb_cache_size = rocksdb_cache_size;

  // Set the new pcm memory cache sizes
  mon_memory_target = g_conf()->mon_memory_target;
  rocksdb_cache_size = g_conf()->rocksdb_cache_size;

  // same sizing logic as register_cache_with_pcm(): subtract the base
  // allowance and expected fragmentation from the target to get max
  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    mon_memory_target = old_mon_memory_target;
    rocksdb_cache_size = old_rocksdb_cache_size;
    return -EINVAL;
  }

  if (mon_memory_autotune && pcm != nullptr) {
    std::lock_guard l(balancer_lock);
    // set pcm cache levels
    pcm->set_target_memory(target);
    pcm->set_min_memory(min);
    pcm->set_max_memory(max);
    // tune memory based on new values
    pcm->tune_memory();
    pcm->balance();
    _set_new_cache_sizes();
    dout(1) << __func__ << " Updated mon cache setting."
            << " target: " << target
            << " min: " << min
            << " max: " << max
            << dendl;
  }
  return 0;
}
559
560int OSDMonitor::_set_cache_sizes()
561{
562 if (g_conf()->mon_memory_autotune) {
563 // set the new osdmon cache targets to be managed by pcm
564 mon_osd_cache_size = g_conf()->mon_osd_cache_size;
565 rocksdb_cache_size = g_conf()->rocksdb_cache_size;
566 mon_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
567 mon_memory_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
568 mon_memory_target = g_conf()->mon_memory_target;
569 mon_memory_min = g_conf()->mon_osd_cache_size_min;
570 if (mon_memory_target <= 0 || mon_memory_min <= 0) {
571 derr << __func__ << " mon_memory_target:" << mon_memory_target
572 << " mon_memory_min:" << mon_memory_min
573 << ". Invalid size option(s) provided."
574 << dendl;
575 return -EINVAL;
576 }
577 // Set the initial inc and full LRU cache sizes
578 inc_osd_cache.set_bytes(mon_memory_min);
579 full_osd_cache.set_bytes(mon_memory_min);
580 mon_memory_autotune = g_conf()->mon_memory_autotune;
581 }
582 return 0;
583}
7c673cae
FG
584
// True if the pending incremental stages a new crush map.
bool OSDMonitor::_have_pending_crush()
{
  return pending_inc.crush.length() > 0;
}
589
// The crush map of the committed osdmap (ignores any pending change).
CrushWrapper &OSDMonitor::_get_stable_crush()
{
  return *osdmap.crush;
}
594
595void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush)
596{
597 bufferlist bl;
598 if (pending_inc.crush.length())
599 bl = pending_inc.crush;
600 else
601 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
602
11fdf7f2 603 auto p = bl.cbegin();
7c673cae
FG
604 newcrush.decode(p);
605}
606
// Build the first osdmap (epoch 1) for a brand new cluster and encode
// it into the pending incremental.  An osdmap stashed at mkfs time is
// honored; otherwise a minimal default map is built.
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon->monmap->fsid << dendl;

  OSDMap newmap;

  bufferlist bl;
  mon->store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    // use the osdmap provided at mkfs time, rebranded with our fsid
    newmap.decode(bl);
    newmap.set_fsid(mon->monmap->fsid);
  } else {
    newmap.build_simple(cct, 0, mon->monmap->fsid, 0);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  newmap.flags |=
    CEPH_OSDMAP_RECOVERY_DELETES |
    CEPH_OSDMAP_PURGED_SNAPDIRS |
    CEPH_OSDMAP_PGLOG_HARDLIMIT;
  // ratios may be configured as percentages (>1); normalize to [0,1]
  newmap.full_ratio = g_conf()->mon_osd_full_ratio;
  if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
  newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
  if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
  newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
  if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;

  // new cluster should require latest by default
  // (the mon_debug_* options exist to exercise upgrade paths in tests)
  if (g_conf().get_val<bool>("mon_debug_no_require_octopus")) {
    if (g_conf().get_val<bool>("mon_debug_no_require_nautilus")) {
      derr << __func__ << " mon_debug_no_require_octopus and nautilus=true" << dendl;
      newmap.require_osd_release = ceph_release_t::mimic;
    } else {
      derr << __func__ << " mon_debug_no_require_octopus=true" << dendl;
      newmap.require_osd_release = ceph_release_t::nautilus;
    }
  } else {
    newmap.require_osd_release = ceph_release_t::octopus;
    ceph_release_t r = ceph_release_from_name(
      g_conf()->mon_osd_initial_require_min_compat_client);
    if (!r) {
      ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
    }
    newmap.require_min_compat_client = r;
  }

  // encode into pending incremental
  uint64_t features = newmap.get_encoding_features();
  newmap.encode(pending_inc.fullmap,
		features | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
665
11fdf7f2 666void OSDMonitor::get_store_prefixes(std::set<string>& s) const
7c673cae
FG
667{
668 s.insert(service_name);
669 s.insert(OSD_PG_CREATING_PREFIX);
3efd9988 670 s.insert(OSD_METADATA_PREFIX);
11fdf7f2 671 s.insert(OSD_SNAP_PREFIX);
7c673cae
FG
672}
673
// Bring the in-memory osdmap up to the last paxos-committed epoch:
// load the newest stashed full map, then apply each remaining
// incremental, writing out a full map per epoch and maintaining the
// derived state (creating pgs, down->out timers, subscriptions).
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  // we really don't care if the version has been updated, because we may
  // have trimmed without having increased the last committed; yet, we may
  // need to update the in-memory manifest.
  load_osdmap_manifest();

  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  ceph_assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
	   << ", my e " << osdmap.epoch << dendl;

  // any in-flight mapping job was computed against the old epoch; drop it
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
	      << mapping_job.get() << " did not complete, "
	      << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  load_health();

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'.  This is only done when we are building the
   * full maps from the incremental versions.  But don't panic!  We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
	     << " [" << fc << ", " << lc << "]" << dendl;

    // scan newest-to-oldest for a stored full_<v> key
    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon->store->exists(get_service_name(), full_key)) {
	dout(10) << __func__ << " found latest full map v " << v << dendl;
	latest_full = v;
	break;
      }
    }

    ceph_assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon->store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
	     << latest_full << dendl;
  }

  // fast-forward by decoding the newest full map instead of replaying
  // every incremental from our current epoch
  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    ceph_assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap = OSDMap();
    osdmap.decode(latest_bl);
  }

  // store->get() returns 0 on success, so '!' means the key was found
  bufferlist bl;
  if (!mon->store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
    auto p = bl.cbegin();
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    creating_pgs.decode(p);
    dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
	    << creating_pgs.last_scan_epoch
	    << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
  } else {
    dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
	    << dendl;
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    ceph_assert(err == 0);
    ceph_assert(inc_bl.length());
    // set priority cache manager levels if the osdmap is
    // being populated for the first time.
    if (mon_memory_autotune && pcm == nullptr) {
      int r = register_cache_with_pcm();
      if (r < 0) {
	dout(10) << __func__
		 << " Error while registering osdmon caches with pcm."
		 << " Proceeding without cache auto tuning."
		 << dendl;
      }
    }

    dout(7) << "update_from_paxos  applying incremental " << osdmap.epoch+1
	    << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    ceph_assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs.  Encode the full
    // map with the same features as the incremental.  If we don't
    // know, use the quorum features.  If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon->get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      ceph_assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
	// This will happen if the mons were running mixed versions in
	// the past or some other circumstance made the full encoded
	// maps divergent.  Reloading here will bring us back into
	// sync with the primary for this and all future maps.  OSDs
	// will also be brought back into sync when they discover the
	// crc mismatch and request a full map from a mon.
	derr << __func__ << " full map CRC mismatch, resetting to canonical"
	     << dendl;

	// dump our divergent copy for post-mortem debugging
	// (the statements up to dendl live inside the dout scope)
	dout(20) << __func__ << " my (bad) full osdmap:\n";
	JSONFormatter jf(true);
	jf.dump_object("osdmap", osdmap);
	jf.flush(*_dout);
	*_dout << "\nhexdump:\n";
	full_bl.hexdump(*_dout);
	*_dout << dendl;

	osdmap = OSDMap();
	osdmap.decode(orig_full_bl);

	dout(20) << __func__ << " canonical full osdmap:\n";
	JSONFormatter jf(true);
	jf.dump_object("osdmap", osdmap);
	jf.flush(*_dout);
	*_dout << "\nhexdump:\n";
	orig_full_bl.hexdump(*_dout);
	*_dout << dendl;
      }
    } else {
      ceph_assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    if (osdmap.epoch == 1) {
      t->erase("mkfs", "osdmap");
    }

    // flush the accumulated transaction periodically so it does not
    // grow without bound while replaying many epochs
    if (tx_size > g_conf()->mon_sync_max_payload_size*2) {
      mon->store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    for (const auto &osd_state : inc.new_state) {
      if (osd_state.second & CEPH_OSD_UP) {
	// could be marked up *or* down, but we're too lazy to check which
	last_osd_report.erase(osd_state.first);
      }
      if (osd_state.second & CEPH_OSD_EXISTS) {
	// could be created *or* destroyed, but we can safely drop it
	osd_epochs.erase(osd_state.first);
      }
    }
  }

  if (t) {
    mon->store->apply_transaction(t);
  }

  // reconcile the down->out timer map with the new osdmap state
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
	dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
	down_pending_out[o] = ceph_clock_now();
      }
    } else {
      if (found != down_pending_out.end()) {
	dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
	down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();
  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon->is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
}
911
eafe8130
TL
// Create the PriorityCache manager and register the rocksdb cache and
// the inc/full osdmap caches with it, so all three are balanced under
// a common memory budget.  Returns 0 on success, -EINVAL when the
// configured sizes are unusable, rocksdb exposes no cache handle, or
// the ratios cannot be set.
int OSDMonitor::register_cache_with_pcm()
{
  if (mon_memory_target <= 0 || mon_memory_min <= 0) {
    derr << __func__ << " Invalid memory size specified for mon caches."
         << " Caches will not be auto-tuned."
         << dendl;
    return -EINVAL;
  }
  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  // For calculating total target memory, consider rocksdb cache size.
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // Apply the same logic as in bluestore to set the max amount
  // of memory to use for cache.  Assume base memory for OSDMaps
  // and then add in some overhead for fragmentation.
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  rocksdb_binned_kv_cache = mon->store->get_priority_cache();
  if (!rocksdb_binned_kv_cache) {
    derr << __func__ << " not using rocksdb" << dendl;
    return -EINVAL;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    return -EINVAL;
  }

  pcm = std::make_shared<PriorityCache::Manager>(
      cct, min, max, target, true);
  pcm->insert("kv", rocksdb_binned_kv_cache, true);
  pcm->insert("inc", inc_cache, true);
  pcm->insert("full", full_cache, true);
  dout(1) << __func__ << " pcm target: " << target
          << " pcm max: " << max
          << " pcm min: " << min
          << " inc_osd_cache size: " << inc_osd_cache.get_size()
          << dendl;
  return 0;
}
961
// Split the memory target between the caches: kv (rocksdb) gets
// rocksdb_cache_size / mon_memory_target, and the remainder is shared
// equally by the inc and full osdmap caches.  Returns -EINVAL —
// restoring the previous kv ratio — when the kv share would consume
// the whole target.
int OSDMonitor::_set_cache_ratios()
{
  double old_cache_kv_ratio = cache_kv_ratio;

  // Set the cache ratios for kv(rocksdb), inc and full caches
  cache_kv_ratio = (double)rocksdb_cache_size / (double)mon_memory_target;
  if (cache_kv_ratio >= 1.0) {
    derr << __func__ << " Cache kv ratio (" << cache_kv_ratio
         << ") must be in range [0,<1.0]."
         << dendl;
    cache_kv_ratio = old_cache_kv_ratio;
    return -EINVAL;
  }
  rocksdb_binned_kv_cache->set_cache_ratio(cache_kv_ratio);
  cache_inc_ratio = cache_full_ratio = (1.0 - cache_kv_ratio) / 2;
  inc_cache->set_cache_ratio(cache_inc_ratio);
  full_cache->set_cache_ratio(cache_full_ratio);

  dout(1) << __func__ << " kv ratio " << cache_kv_ratio
          << " inc ratio " << cache_inc_ratio
          << " full ratio " << cache_full_ratio
          << dendl;
  return 0;
}
986
7c673cae
FG
987void OSDMonitor::start_mapping()
988{
989 // initiate mapping job
990 if (mapping_job) {
991 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
992 << dendl;
993 mapping_job->abort();
994 }
224ce89b
WB
995 if (!osdmap.get_pools().empty()) {
996 auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
997 mapping_job = mapping.start_update(osdmap, mapper,
11fdf7f2 998 g_conf()->mon_osd_mapping_pgs_per_chunk);
224ce89b
WB
999 dout(10) << __func__ << " started mapping job " << mapping_job.get()
1000 << " at " << fin->start << dendl;
1001 mapping_job->set_finish_event(fin);
1002 } else {
1003 dout(10) << __func__ << " no pools, no mapping job" << dendl;
1004 mapping_job = nullptr;
1005 }
7c673cae
FG
1006}
1007
1008void OSDMonitor::update_msgr_features()
1009{
1010 set<int> types;
1011 types.insert((int)entity_name_t::TYPE_OSD);
1012 types.insert((int)entity_name_t::TYPE_CLIENT);
1013 types.insert((int)entity_name_t::TYPE_MDS);
1014 types.insert((int)entity_name_t::TYPE_MON);
1015 for (set<int>::iterator q = types.begin(); q != types.end(); ++q) {
1016 uint64_t mask;
1017 uint64_t features = osdmap.get_features(*q, &mask);
1018 if ((mon->messenger->get_policy(*q).features_required & mask) != features) {
1019 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
11fdf7f2 1020 ceph::net::Policy p = mon->messenger->get_policy(*q);
7c673cae
FG
1021 p.features_required = (p.features_required & ~mask) | features;
1022 mon->messenger->set_policy(*q, p);
1023 }
1024 }
1025}
1026
1027void OSDMonitor::on_active()
1028{
1029 update_logger();
1030
1031 if (mon->is_leader()) {
224ce89b 1032 mon->clog->debug() << "osdmap " << osdmap;
81eedcae
TL
1033 if (!priority_convert) {
1034 // Only do this once at start-up
1035 convert_pool_priorities();
1036 priority_convert = true;
1037 }
7c673cae
FG
1038 } else {
1039 list<MonOpRequestRef> ls;
1040 take_all_failures(ls);
1041 while (!ls.empty()) {
1042 MonOpRequestRef op = ls.front();
1043 op->mark_osdmon_event(__func__);
1044 dispatch(op);
1045 ls.pop_front();
1046 }
1047 }
1048 start_mapping();
1049}
1050
// Called on service restart: drop the per-OSD last-report timestamps so
// report tracking starts fresh.
void OSDMonitor::on_restart()
{
  last_osd_report.clear();
}
1055
1056void OSDMonitor::on_shutdown()
1057{
1058 dout(10) << __func__ << dendl;
1059 if (mapping_job) {
1060 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
1061 << dendl;
1062 mapping_job->abort();
1063 }
1064
1065 // discard failure info, waiters
1066 list<MonOpRequestRef> ls;
1067 take_all_failures(ls);
1068 ls.clear();
1069}
1070
1071void OSDMonitor::update_logger()
1072{
1073 dout(10) << "update_logger" << dendl;
1074
1075 mon->cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
1076 mon->cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
1077 mon->cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
1078 mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
1079}
1080
7c673cae
FG
// Begin a new pending incremental (epoch + 1) on top of the current
// osdmap and reset the per-round pending bookkeeping.  Also repairs two
// kinds of obviously-bad inherited state: unset (<= 0) full ratios, and
// legacy CRUSH "ruleset" rule ids.
void OSDMonitor::create_pending()
{
  pending_inc = OSDMap::Incremental(osdmap.epoch+1);
  pending_inc.fsid = mon->monmap->fsid;
  pending_metadata.clear();
  pending_metadata_rm.clear();
  pending_pseudo_purged_snaps.clear();

  dout(10) << "create_pending e " << pending_inc.epoch << dendl;

  // safety checks (this shouldn't really happen)
  {
    if (osdmap.backfillfull_ratio <= 0) {
      pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
      // values > 1.0 are treated as percentages
      if (pending_inc.new_backfillfull_ratio > 1.0)
        pending_inc.new_backfillfull_ratio /= 100;
      dout(1) << __func__ << " setting backfillfull_ratio = "
              << pending_inc.new_backfillfull_ratio << dendl;
    }
    if (osdmap.full_ratio <= 0) {
      pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio;
      // values > 1.0 are treated as percentages
      if (pending_inc.new_full_ratio > 1.0)
        pending_inc.new_full_ratio /= 100;
      dout(1) << __func__ << " setting full_ratio = "
              << pending_inc.new_full_ratio << dendl;
    }
    if (osdmap.nearfull_ratio <= 0) {
      pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
      // values > 1.0 are treated as percentages
      if (pending_inc.new_nearfull_ratio > 1.0)
        pending_inc.new_nearfull_ratio /= 100;
      dout(1) << __func__ << " setting nearfull_ratio = "
              << pending_inc.new_nearfull_ratio << dendl;
    }
  }

  // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
  // structure.
  if (osdmap.crush->has_legacy_rule_ids()) {
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    // First, for all pools, work out which rule they really used
    // by resolving ruleset to rule.
    for (const auto &i : osdmap.get_pools()) {
      const auto pool_id = i.first;
      const auto &pool = i.second;
      int new_rule_id = newcrush.find_rule(pool.crush_rule,
                                           pool.type, pool.size);

      dout(1) << __func__ << " rewriting pool "
              << osdmap.get_pool_name(pool_id) << " crush ruleset "
              << pool.crush_rule << " -> rule id " << new_rule_id << dendl;
      // Copy the existing pool into the incremental before touching it,
      // so unrelated fields are preserved.
      if (pending_inc.new_pools.count(pool_id) == 0) {
        pending_inc.new_pools[pool_id] = pool;
      }
      pending_inc.new_pools[pool_id].crush_rule = new_rule_id;
    }

    // Now, go ahead and renumber all the rules so that their
    // rule_id field corresponds to their position in the array
    auto old_to_new = newcrush.renumber_rules();
    dout(1) << __func__ << " Rewrote " << old_to_new << " crush IDs:" << dendl;
    for (const auto &i : old_to_new) {
      dout(1) << __func__ << " " << i.first << " -> " << i.second << dendl;
    }
    // Re-encode the rewritten crush map into the pending incremental.
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
  }
}
1150
// Produce the next creating_pgs_t snapshot for the pending incremental
// `inc` as applied in `nextmap`:
//  - scan for pools that newly need PG creation and drop deleted pools;
//  - retire PGs already reported created and PGs that no longer exist;
//  - move up to mon_osd_max_creating_pgs entries from the per-pool queue
//    into the active creating set;
//  - (octopus+) advance each creating PG's history/past_intervals to
//    match the new map, mirroring PG::start_peering_interval().
// Works on a copy of creating_pgs taken under creating_pgs_lock; the
// caller persists/installs the returned value.
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
                               const OSDMap& nextmap)
{
  dout(10) << __func__ << dendl;
  creating_pgs_t pending_creatings;
  {
    // Take a private copy so we can mutate without holding the lock.
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    unsigned queued = 0;
    // Scan both the existing pools and the pools added by this inc.
    queued += scan_for_creating_pgs(osdmap.get_pools(),
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    queued += scan_for_creating_pgs(inc.new_pools,
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
               << " pg removed because containing pool deleted: "
               << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
             << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // filter out any pgs that shouldn't exist.
  {
    auto i = pending_creatings.pgs.begin();
    while (i != pending_creatings.pgs.end()) {
      if (!nextmap.pg_exists(i->first)) {
        dout(10) << __func__ << " removing pg " << i->first
                 << " which should not exist" << dendl;
        i = pending_creatings.pgs.erase(i);
      } else {
        ++i;
      }
    }
  }

  // process queue
  // Cap the number of simultaneously-creating PGs; always allow at least 1.
  unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  while (pending_creatings.pgs.size() < max &&
         !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
             << " created " << p->second.created
             << " modified " << p->second.modified
             << " [" << p->second.start << "-" << p->second.end << ")"
             << dendl;
    // Take as many PGs from this pool's [start, end) range as the cap allows.
    int64_t n = std::min<int64_t>(max - pending_creatings.pgs.size(),
                                  p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(
        pgid,
        creating_pgs_t::pg_create_info(inc.epoch,
                                       p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
               << " now [" << p->second.start << "-" << p->second.end << ")"
               << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
           << " pools" << dendl;

  if (mon->monmap->min_mon_release >= ceph_release_t::octopus) {
    // walk creating pgs' history and past_intervals forward
    for (auto& i : pending_creatings.pgs) {
      // this mirrors PG::start_peering_interval()
      pg_t pgid = i.first;

      // this is a bit imprecise, but sufficient?
      struct min_size_predicate_t : public IsPGRecoverablePredicate {
        const pg_pool_t *pi;
        bool operator()(const set<pg_shard_t> &have) const {
          return have.size() >= pi->min_size;
        }
        explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
      } min_size_predicate(nextmap.get_pg_pool(pgid.pool()));

      vector<int> up, acting;
      int up_primary, acting_primary;
      nextmap.pg_to_up_acting_osds(
        pgid, &up, &up_primary, &acting, &acting_primary);
      if (i.second.history.epoch_created == 0) {
        // new pg entry, set it up
        i.second.up = up;
        i.second.acting = acting;
        i.second.up_primary = up_primary;
        i.second.acting_primary = acting_primary;
        i.second.history = pg_history_t(i.second.create_epoch,
                                        i.second.create_stamp);
        dout(10) << __func__ << " pg " << pgid << " just added, "
                 << " up " << i.second.up
                 << " p " << i.second.up_primary
                 << " acting " << i.second.acting
                 << " p " << i.second.acting_primary
                 << " history " << i.second.history
                 << " past_intervals " << i.second.past_intervals
                 << dendl;
      } else {
        std::stringstream debug;
        // check_new_interval() returns true when the up/acting change
        // closes the previous interval; it also extends past_intervals.
        if (PastIntervals::check_new_interval(
              i.second.acting_primary, acting_primary,
              i.second.acting, acting,
              i.second.up_primary, up_primary,
              i.second.up, up,
              i.second.history.same_interval_since,
              i.second.history.last_epoch_clean,
              &nextmap,
              &osdmap,
              pgid,
              min_size_predicate,
              &i.second.past_intervals,
              &debug)) {
          epoch_t e = inc.epoch;
          i.second.history.same_interval_since = e;
          if (i.second.up != up) {
            i.second.history.same_up_since = e;
          }
          if (i.second.acting_primary != acting_primary) {
            i.second.history.same_primary_since = e;
          }
          if (pgid.is_split(
                osdmap.get_pg_num(pgid.pool()),
                nextmap.get_pg_num(pgid.pool()),
                nullptr)) {
            i.second.history.last_epoch_split = e;
          }
          dout(10) << __func__ << " pg " << pgid << " new interval,"
                   << " up " << i.second.up << " -> " << up
                   << " p " << i.second.up_primary << " -> " << up_primary
                   << " acting " << i.second.acting << " -> " << acting
                   << " p " << i.second.acting_primary << " -> "
                   << acting_primary
                   << " history " << i.second.history
                   << " past_intervals " << i.second.past_intervals
                   << dendl;
          dout(20) << " debug: " << debug.str() << dendl;
          i.second.up = up;
          i.second.acting = acting;
          i.second.up_primary = up_primary;
          i.second.acting_primary = acting_primary;
        }
      }
    }
  }
  dout(10) << __func__
           << " " << (pending_creatings.pgs.size() - total)
           << "/" << pending_creatings.pgs.size()
           << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
1337
// Pre-populate pg_temp mappings in pending_inc so that clients keep a
// usable acting set across the upcoming map change.  Decides between
// priming *all* PGs (new crush map, newly-up OSDs, weight increases, or
// a large estimated blast radius) and priming only the PGs touched by a
// small set of "interesting" OSDs.  Work is bounded by
// mon_osd_prime_pg_temp_max_time either way.
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  if (pending_inc.crush.length()) {
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  // OSDs whose UP state bit is flipping while currently up (i.e. going
  // down in this increment) are interesting.
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    if ((p->second & CEPH_OSD_UP) &&
        osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  for (map<int32_t,uint32_t>::iterator p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
               << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;

  if (!all) {
    // Rough cost estimate: PGs on one interesting OSD times the number of
    // interesting OSDs.  If that exceeds the configured fraction of all
    // PGs, fall back to priming everything.
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
        g_conf()->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds >= "
               << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total "
               << mapping.get_num_pgs() << " pgs, all"
               << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds" << dendl;
    }
  }

  // Build the map as it will look after this increment is applied.
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    // Prime every PG via the parallel mapper, bounded by a wall-clock
    // deadline; abort the job if it does not finish in time.
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk, {});
    if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
               << g_conf()->mon_osd_prime_pg_temp_max_time
               << ", stopping" << dendl;
      job.abort();
    }
  } else {
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf()->mon_osd_prime_pg_temp_max_time;
    // Check the clock only every `chunk` PGs to keep overhead low.
    const int chunk = 1000;
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
        // Skip PGs already handled via another interesting OSD.
        if (!did_pgs.insert(pgid).second) {
          continue;
        }
        prime_pg_temp(next, pgid);
        if (--n <= 0) {
          n = chunk;
          if (ceph_clock_now() > stop) {
            dout(10) << __func__ << " consumed more than "
                     << g_conf()->mon_osd_prime_pg_temp_max_time
                     << " seconds, stopping"
                     << dendl;
            return;
          }
        }
      }
    }
  }
}
1440
// Consider adding a pg_temp entry for `pgid` into pending_inc so the PG
// keeps serving from its current acting set while it re-peers under the
// upcoming map `next`.  Bails out when the PG is still being created, no
// longer exists, or when priming cannot improve on the current state.
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  // TODO: remove this creating_pgs direct access?
  if (creating_pgs.pgs.count(pgid)) {
    return;
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  // Current mapping (from the background mapper, i.e. current osdmap).
  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  // Mapping under the post-increment map.
  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
                            &next_acting, &next_acting_primary);
  // Skip if the acting set is unchanged, unless an existing pg_temp
  // (up != acting) is about to become redundant (next_up == next_acting).
  if (acting == next_acting &&
      !(up != acting && next_up == next_acting))
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  if (next_up == next_acting) {
    // Priming an empty vector removes any existing pg_temp entry.
    acting.clear();
    dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
             << dendl;
  }

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
           << " -> " << next_up << "/" << next_acting
           << ", priming " << acting
           << dendl;
  {
    // May be called concurrently from mapper worker threads.
    std::lock_guard l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    // (map::emplace is a no-op when the key already exists)
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
1488
1489/**
1490 * @note receiving a transaction in this function gives a fair amount of
1491 * freedom to the service implementation if it does need it. It shouldn't.
1492 */
1493void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
1494{
1495 dout(10) << "encode_pending e " << pending_inc.epoch
1496 << dendl;
1497
11fdf7f2
TL
1498 if (do_prune(t)) {
1499 dout(1) << __func__ << " osdmap full prune encoded e"
1500 << pending_inc.epoch << dendl;
1501 }
1502
7c673cae
FG
1503 // finalize up pending_inc
1504 pending_inc.modified = ceph_clock_now();
1505
11fdf7f2
TL
1506 int r = pending_inc.propagate_snaps_to_tiers(cct, osdmap);
1507 ceph_assert(r == 0);
7c673cae
FG
1508
1509 if (mapping_job) {
1510 if (!mapping_job->is_done()) {
1511 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1512 << mapping_job.get() << " did not complete, "
1513 << mapping_job->shards << " left" << dendl;
1514 mapping_job->abort();
1515 } else if (mapping.get_epoch() < osdmap.get_epoch()) {
1516 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1517 << mapping_job.get() << " is prior epoch "
1518 << mapping.get_epoch() << dendl;
1519 } else {
11fdf7f2 1520 if (g_conf()->mon_osd_prime_pg_temp) {
7c673cae
FG
1521 maybe_prime_pg_temp();
1522 }
1523 }
11fdf7f2 1524 } else if (g_conf()->mon_osd_prime_pg_temp) {
7c673cae
FG
1525 dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
1526 << dendl;
1527 }
1528 mapping_job.reset();
1529
c07f9fc5
FG
1530 // ensure we don't have blank new_state updates. these are interrpeted as
1531 // CEPH_OSD_UP (and almost certainly not what we want!).
1532 auto p = pending_inc.new_state.begin();
1533 while (p != pending_inc.new_state.end()) {
1534 if (p->second == 0) {
1535 dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
1536 p = pending_inc.new_state.erase(p);
1537 } else {
11fdf7f2
TL
1538 if (p->second & CEPH_OSD_UP) {
1539 pending_inc.new_last_up_change = pending_inc.modified;
1540 }
c07f9fc5
FG
1541 ++p;
1542 }
1543 }
11fdf7f2
TL
1544 if (!pending_inc.new_up_client.empty()) {
1545 pending_inc.new_last_up_change = pending_inc.modified;
1546 }
1547 for (auto& i : pending_inc.new_weight) {
9f95a23c 1548 if (i.first >= osdmap.max_osd) {
11fdf7f2
TL
1549 if (i.second) {
1550 // new osd is already marked in
1551 pending_inc.new_last_in_change = pending_inc.modified;
9f95a23c 1552 break;
11fdf7f2
TL
1553 }
1554 } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
1555 // existing osd marked in or out
1556 pending_inc.new_last_in_change = pending_inc.modified;
9f95a23c 1557 break;
11fdf7f2
TL
1558 }
1559 }
7c673cae
FG
1560
1561 {
1562 OSDMap tmp;
1563 tmp.deepish_copy_from(osdmap);
1564 tmp.apply_incremental(pending_inc);
1565
11fdf7f2
TL
1566 // clean pg_temp mappings
1567 OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);
1568
1569 // clean inappropriate pg_upmap/pg_upmap_items (if any)
494da23a
TL
1570 {
1571 // check every upmapped pg for now
1572 // until we could reliably identify certain cases to ignore,
1573 // which is obviously the hard part TBD..
1574 vector<pg_t> pgs_to_check;
1575 tmp.get_upmap_pgs(&pgs_to_check);
9f95a23c
TL
1576 if (pgs_to_check.size() <
1577 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk * 2)) {
494da23a
TL
1578 // not enough pgs, do it inline
1579 tmp.clean_pg_upmaps(cct, &pending_inc);
1580 } else {
1581 CleanUpmapJob job(cct, tmp, pending_inc);
1582 mapper.queue(&job, g_conf()->mon_clean_pg_upmaps_per_chunk, pgs_to_check);
1583 job.wait();
1584 }
1585 }
11fdf7f2
TL
1586
1587 // update creating pgs first so that we can remove the created pgid and
1588 // process the pool flag removal below in the same osdmap epoch.
1589 auto pending_creatings = update_pending_pgs(pending_inc, tmp);
1590 bufferlist creatings_bl;
9f95a23c
TL
1591 uint64_t features = CEPH_FEATURES_ALL;
1592 if (mon->monmap->min_mon_release < ceph_release_t::octopus) {
1593 dout(20) << __func__ << " encoding pending pgs without octopus features"
1594 << dendl;
1595 features &= ~CEPH_FEATURE_SERVER_OCTOPUS;
1596 }
1597 encode(pending_creatings, creatings_bl, features);
11fdf7f2
TL
1598 t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
1599
1600 // remove any old (or incompat) POOL_CREATING flags
1601 for (auto& i : tmp.get_pools()) {
9f95a23c 1602 if (tmp.require_osd_release < ceph_release_t::nautilus) {
11fdf7f2
TL
1603 // pre-nautilus OSDMaps shouldn't get this flag.
1604 if (pending_inc.new_pools.count(i.first)) {
1605 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1606 }
1607 }
1608 if (i.second.has_flag(pg_pool_t::FLAG_CREATING) &&
1609 !pending_creatings.still_creating_pool(i.first)) {
1610 dout(10) << __func__ << " done creating pool " << i.first
1611 << ", clearing CREATING flag" << dendl;
1612 if (pending_inc.new_pools.count(i.first) == 0) {
1613 pending_inc.new_pools[i.first] = i.second;
1614 }
1615 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
3efd9988 1616 }
11fdf7f2
TL
1617 }
1618
11fdf7f2
TL
1619 // collect which pools are currently affected by
1620 // the near/backfill/full osd(s),
1621 // and set per-pool near/backfill/full flag instead
1622 set<int64_t> full_pool_ids;
1623 set<int64_t> backfillfull_pool_ids;
1624 set<int64_t> nearfull_pool_ids;
1625 tmp.get_full_pools(cct,
1626 &full_pool_ids,
1627 &backfillfull_pool_ids,
3efd9988 1628 &nearfull_pool_ids);
11fdf7f2
TL
1629 if (full_pool_ids.empty() ||
1630 backfillfull_pool_ids.empty() ||
1631 nearfull_pool_ids.empty()) {
1632 // normal case - no nearfull, backfillfull or full osds
3efd9988
FG
1633 // try cancel any improper nearfull/backfillfull/full pool
1634 // flags first
11fdf7f2
TL
1635 for (auto &pool: tmp.get_pools()) {
1636 auto p = pool.first;
1637 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
1638 nearfull_pool_ids.empty()) {
1639 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1640 << "'s nearfull flag" << dendl;
1641 if (pending_inc.new_pools.count(p) == 0) {
1642 // load original pool info first!
1643 pending_inc.new_pools[p] = pool.second;
1644 }
1645 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1646 }
1647 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
1648 backfillfull_pool_ids.empty()) {
1649 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1650 << "'s backfillfull flag" << dendl;
1651 if (pending_inc.new_pools.count(p) == 0) {
1652 pending_inc.new_pools[p] = pool.second;
1653 }
1654 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1655 }
1656 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
1657 full_pool_ids.empty()) {
1658 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1659 // set by EQUOTA, skipping
1660 continue;
1661 }
1662 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1663 << "'s full flag" << dendl;
1664 if (pending_inc.new_pools.count(p) == 0) {
1665 pending_inc.new_pools[p] = pool.second;
1666 }
1667 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1668 }
3efd9988 1669 }
11fdf7f2
TL
1670 }
1671 if (!full_pool_ids.empty()) {
1672 dout(10) << __func__ << " marking pool(s) " << full_pool_ids
1673 << " as full" << dendl;
1674 for (auto &p: full_pool_ids) {
1675 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
1676 continue;
1677 }
1678 if (pending_inc.new_pools.count(p) == 0) {
1679 pending_inc.new_pools[p] = tmp.pools[p];
1680 }
1681 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
1682 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1683 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1684 }
1685 // cancel FLAG_FULL for pools which are no longer full too
1686 for (auto &pool: tmp.get_pools()) {
1687 auto p = pool.first;
1688 if (full_pool_ids.count(p)) {
1689 // skip pools we have just marked as full above
1690 continue;
1691 }
1692 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
1693 tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1694 // don't touch if currently is not full
1695 // or is running out of quota (and hence considered as full)
1696 continue;
1697 }
1698 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1699 << "'s full flag" << dendl;
1700 if (pending_inc.new_pools.count(p) == 0) {
1701 pending_inc.new_pools[p] = pool.second;
1702 }
1703 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
3efd9988 1704 }
11fdf7f2
TL
1705 }
1706 if (!backfillfull_pool_ids.empty()) {
1707 for (auto &p: backfillfull_pool_ids) {
1708 if (full_pool_ids.count(p)) {
1709 // skip pools we have already considered as full above
1710 continue;
1711 }
1712 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1713 // make sure FLAG_FULL is truly set, so we are safe not
1714 // to set a extra (redundant) FLAG_BACKFILLFULL flag
1715 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1716 continue;
1717 }
1718 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1719 // don't bother if pool is already marked as backfillfull
1720 continue;
1721 }
1722 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1723 << "'s as backfillfull" << dendl;
1724 if (pending_inc.new_pools.count(p) == 0) {
1725 pending_inc.new_pools[p] = tmp.pools[p];
1726 }
1727 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
1728 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1729 }
1730 // cancel FLAG_BACKFILLFULL for pools
1731 // which are no longer backfillfull too
1732 for (auto &pool: tmp.get_pools()) {
1733 auto p = pool.first;
1734 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1735 // skip pools we have just marked as backfillfull/full above
1736 continue;
1737 }
1738 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1739 // and don't touch if currently is not backfillfull
1740 continue;
1741 }
1742 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1743 << "'s backfillfull flag" << dendl;
1744 if (pending_inc.new_pools.count(p) == 0) {
1745 pending_inc.new_pools[p] = pool.second;
1746 }
1747 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
3efd9988 1748 }
11fdf7f2
TL
1749 }
1750 if (!nearfull_pool_ids.empty()) {
1751 for (auto &p: nearfull_pool_ids) {
1752 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1753 continue;
1754 }
1755 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1756 // make sure FLAG_FULL is truly set, so we are safe not
1757 // to set a extra (redundant) FLAG_NEARFULL flag
1758 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1759 continue;
1760 }
1761 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1762 // don't bother if pool is already marked as nearfull
1763 continue;
1764 }
1765 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1766 << "'s as nearfull" << dendl;
1767 if (pending_inc.new_pools.count(p) == 0) {
1768 pending_inc.new_pools[p] = tmp.pools[p];
1769 }
1770 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
1771 }
1772 // cancel FLAG_NEARFULL for pools
1773 // which are no longer nearfull too
1774 for (auto &pool: tmp.get_pools()) {
1775 auto p = pool.first;
1776 if (full_pool_ids.count(p) ||
1777 backfillfull_pool_ids.count(p) ||
1778 nearfull_pool_ids.count(p)) {
1779 // skip pools we have just marked as
1780 // nearfull/backfillfull/full above
1781 continue;
1782 }
1783 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1784 // and don't touch if currently is not nearfull
1785 continue;
1786 }
1787 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1788 << "'s nearfull flag" << dendl;
1789 if (pending_inc.new_pools.count(p) == 0) {
1790 pending_inc.new_pools[p] = pool.second;
1791 }
1792 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
7c673cae 1793 }
11fdf7f2 1794 }
7c673cae 1795
11fdf7f2 1796 // min_compat_client?
9f95a23c 1797 if (!tmp.require_min_compat_client) {
11fdf7f2
TL
1798 auto mv = tmp.get_min_compat_client();
1799 dout(1) << __func__ << " setting require_min_compat_client to currently "
9f95a23c 1800 << "required " << mv << dendl;
11fdf7f2 1801 mon->clog->info() << "setting require_min_compat_client to currently "
9f95a23c 1802 << "required " << mv;
11fdf7f2
TL
1803 pending_inc.new_require_min_compat_client = mv;
1804 }
1805
9f95a23c
TL
1806 if (osdmap.require_osd_release < ceph_release_t::nautilus &&
1807 tmp.require_osd_release >= ceph_release_t::nautilus) {
11fdf7f2
TL
1808 dout(10) << __func__ << " first nautilus+ epoch" << dendl;
1809 // add creating flags?
1810 for (auto& i : tmp.get_pools()) {
1811 if (pending_creatings.still_creating_pool(i.first)) {
1812 dout(10) << __func__ << " adding CREATING flag to pool " << i.first
1813 << dendl;
1814 if (pending_inc.new_pools.count(i.first) == 0) {
1815 pending_inc.new_pools[i.first] = i.second;
224ce89b 1816 }
11fdf7f2 1817 pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING;
224ce89b 1818 }
11fdf7f2
TL
1819 }
1820 // adjust blacklist items to all be TYPE_ANY
1821 for (auto& i : tmp.blacklist) {
1822 auto a = i.first;
1823 a.set_type(entity_addr_t::TYPE_ANY);
1824 pending_inc.new_blacklist[a] = i.second;
1825 pending_inc.old_blacklist.push_back(i.first);
224ce89b 1826 }
7c673cae 1827 }
9f95a23c
TL
1828
1829 if (osdmap.require_osd_release < ceph_release_t::octopus &&
1830 tmp.require_osd_release >= ceph_release_t::octopus) {
1831 dout(10) << __func__ << " first octopus+ epoch" << dendl;
1832
1833 // adjust obsoleted cache modes
1834 for (auto& [poolid, pi] : tmp.pools) {
1835 if (pi.cache_mode == pg_pool_t::CACHEMODE_FORWARD) {
1836 if (pending_inc.new_pools.count(poolid) == 0) {
1837 pending_inc.new_pools[poolid] = pi;
1838 }
1839 dout(10) << __func__ << " switching pool " << poolid
1840 << " cachemode from forward -> proxy" << dendl;
1841 pending_inc.new_pools[poolid].cache_mode = pg_pool_t::CACHEMODE_PROXY;
1842 }
1843 if (pi.cache_mode == pg_pool_t::CACHEMODE_READFORWARD) {
1844 if (pending_inc.new_pools.count(poolid) == 0) {
1845 pending_inc.new_pools[poolid] = pi;
1846 }
1847 dout(10) << __func__ << " switching pool " << poolid
1848 << " cachemode from readforward -> readproxy" << dendl;
1849 pending_inc.new_pools[poolid].cache_mode =
1850 pg_pool_t::CACHEMODE_READPROXY;
1851 }
1852 }
1853
1854 // clear removed_snaps for every pool
1855 for (auto& [poolid, pi] : tmp.pools) {
1856 if (pi.removed_snaps.empty()) {
1857 continue;
1858 }
1859 if (pending_inc.new_pools.count(poolid) == 0) {
1860 pending_inc.new_pools[poolid] = pi;
1861 }
1862 dout(10) << __func__ << " clearing pool " << poolid << " removed_snaps"
1863 << dendl;
1864 pending_inc.new_pools[poolid].removed_snaps.clear();
1865 }
1866
1867 // create a combined purged snap epoch key for all purged snaps
1868 // prior to this epoch, and store it in the current epoch (i.e.,
1869 // the last pre-octopus epoch, just prior to the one we're
1870 // encoding now).
1871 auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
1872 it->lower_bound("purged_snap_");
1873 map<int64_t,snap_interval_set_t> combined;
1874 while (it->valid()) {
1875 if (it->key().find("purged_snap_") != 0) {
1876 break;
1877 }
1878 string k = it->key();
1879 long long unsigned pool;
1880 int n = sscanf(k.c_str(), "purged_snap_%llu_", &pool);
1881 if (n != 1) {
1882 derr << __func__ << " invalid purged_snaps key '" << k << "'" << dendl;
1883 } else {
1884 bufferlist v = it->value();
1885 auto p = v.cbegin();
1886 snapid_t begin, end;
1887 ceph::decode(begin, p);
1888 ceph::decode(end, p);
1889 combined[pool].insert(begin, end - begin);
1890 }
1891 it->next();
1892 }
1893 if (!combined.empty()) {
1894 string k = make_purged_snap_epoch_key(pending_inc.epoch - 1);
1895 bufferlist v;
1896 ceph::encode(combined, v);
1897 t->put(OSD_SNAP_PREFIX, k, v);
1898 dout(10) << __func__ << " recording pre-octopus purged_snaps in epoch "
1899 << (pending_inc.epoch - 1) << ", " << v.length() << " bytes"
1900 << dendl;
1901 } else {
1902 dout(10) << __func__ << " there were no pre-octopus purged snaps"
1903 << dendl;
1904 }
1905
1906 // clean out the old removed_snap_ and removed_epoch keys
1907 // ('`' is ASCII '_' + 1)
1908 t->erase_range(OSD_SNAP_PREFIX, "removed_snap_", "removed_snap`");
1909 t->erase_range(OSD_SNAP_PREFIX, "removed_epoch_", "removed_epoch`");
1910 }
7c673cae
FG
1911 }
1912
1913 // tell me about it
31f18b77 1914 for (auto i = pending_inc.new_state.begin();
7c673cae
FG
1915 i != pending_inc.new_state.end();
1916 ++i) {
1917 int s = i->second ? i->second : CEPH_OSD_UP;
f6b5b4d7 1918 if (s & CEPH_OSD_UP) {
7c673cae 1919 dout(2) << " osd." << i->first << " DOWN" << dendl;
f6b5b4d7
TL
1920 // Reset laggy parameters if failure interval exceeds a threshold.
1921 const osd_xinfo_t& xi = osdmap.get_xinfo(i->first);
1922 if ((xi.laggy_probability || xi.laggy_interval) && xi.down_stamp.sec()) {
1923 int last_failure_interval = pending_inc.modified.sec() - xi.down_stamp.sec();
1924 if (grace_interval_threshold_exceeded(last_failure_interval)) {
1925 set_default_laggy_params(i->first);
1926 }
1927 }
1928 }
7c673cae
FG
1929 if (s & CEPH_OSD_EXISTS)
1930 dout(2) << " osd." << i->first << " DNE" << dendl;
1931 }
11fdf7f2 1932 for (auto i = pending_inc.new_up_client.begin();
7c673cae
FG
1933 i != pending_inc.new_up_client.end();
1934 ++i) {
1935 //FIXME: insert cluster addresses too
1936 dout(2) << " osd." << i->first << " UP " << i->second << dendl;
1937 }
1938 for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
1939 i != pending_inc.new_weight.end();
1940 ++i) {
1941 if (i->second == CEPH_OSD_OUT) {
1942 dout(2) << " osd." << i->first << " OUT" << dendl;
1943 } else if (i->second == CEPH_OSD_IN) {
1944 dout(2) << " osd." << i->first << " IN" << dendl;
1945 } else {
1946 dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
1947 }
1948 }
1949
1950 // features for osdmap and its incremental
28e407b8 1951 uint64_t features;
7c673cae
FG
1952
1953 // encode full map and determine its crc
1954 OSDMap tmp;
1955 {
1956 tmp.deepish_copy_from(osdmap);
1957 tmp.apply_incremental(pending_inc);
1958
1959 // determine appropriate features
28e407b8
AA
1960 features = tmp.get_encoding_features();
1961 dout(10) << __func__ << " encoding full map with "
9f95a23c 1962 << tmp.require_osd_release
28e407b8
AA
1963 << " features " << features << dendl;
1964
1965 // the features should be a subset of the mon quorum's features!
11fdf7f2 1966 ceph_assert((features & ~mon->get_quorum_con_features()) == 0);
7c673cae
FG
1967
1968 bufferlist fullbl;
11fdf7f2 1969 encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
7c673cae
FG
1970 pending_inc.full_crc = tmp.get_crc();
1971
1972 // include full map in the txn. note that old monitors will
1973 // overwrite this. new ones will now skip the local full map
1974 // encode and reload from this.
1975 put_version_full(t, pending_inc.epoch, fullbl);
1976 }
1977
1978 // encode
11fdf7f2
TL
1979 ceph_assert(get_last_committed() + 1 == pending_inc.epoch);
1980 bufferlist bl;
1981 encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);
7c673cae
FG
1982
1983 dout(20) << " full_crc " << tmp.get_crc()
1984 << " inc_crc " << pending_inc.inc_crc << dendl;
1985
1986 /* put everything in the transaction */
1987 put_version(t, pending_inc.epoch, bl);
1988 put_last_committed(t, pending_inc.epoch);
1989
1990 // metadata, too!
1991 for (map<int,bufferlist>::iterator p = pending_metadata.begin();
1992 p != pending_metadata.end();
1993 ++p)
1994 t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
1995 for (set<int>::iterator p = pending_metadata_rm.begin();
1996 p != pending_metadata_rm.end();
1997 ++p)
1998 t->erase(OSD_METADATA_PREFIX, stringify(*p));
1999 pending_metadata.clear();
2000 pending_metadata_rm.clear();
2001
9f95a23c
TL
2002 // purged_snaps
2003 if (tmp.require_osd_release >= ceph_release_t::octopus &&
2004 !pending_inc.new_purged_snaps.empty()) {
2005 // all snaps purged this epoch (across all pools)
2006 string k = make_purged_snap_epoch_key(pending_inc.epoch);
2007 bufferlist v;
2008 encode(pending_inc.new_purged_snaps, v);
2009 t->put(OSD_SNAP_PREFIX, k, v);
2010 }
2011 for (auto& i : pending_inc.new_purged_snaps) {
2012 for (auto q = i.second.begin();
2013 q != i.second.end();
2014 ++q) {
2015 insert_purged_snap_update(i.first, q.get_start(), q.get_end(),
2016 pending_inc.epoch,
2017 t);
11fdf7f2 2018 }
9f95a23c
TL
2019 }
2020 for (auto& [pool, snaps] : pending_pseudo_purged_snaps) {
2021 for (auto snap : snaps) {
2022 insert_purged_snap_update(pool, snap, snap + 1,
2023 pending_inc.epoch,
2024 t);
7c673cae 2025 }
7c673cae 2026 }
224ce89b
WB
2027
2028 // health
2029 health_check_map_t next;
92f5a8d4 2030 tmp.check_health(cct, &next);
224ce89b 2031 encode_health(next, t);
7c673cae
FG
2032}
2033
7c673cae
FG
2034int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
2035{
2036 bufferlist bl;
2037 int r = mon->store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
2038 if (r < 0)
2039 return r;
2040 try {
11fdf7f2
TL
2041 auto p = bl.cbegin();
2042 decode(m, p);
7c673cae
FG
2043 }
2044 catch (buffer::error& e) {
2045 if (err)
2046 *err << "osd." << osd << " metadata is corrupt";
2047 return -EIO;
2048 }
2049 return 0;
2050}
2051
c07f9fc5 2052void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
31f18b77 2053{
31f18b77
FG
2054 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2055 if (osdmap.is_up(osd)) {
2056 map<string,string> meta;
2057 load_metadata(osd, meta, nullptr);
2058 auto p = meta.find(field);
2059 if (p == meta.end()) {
c07f9fc5 2060 (*out)["unknown"]++;
31f18b77 2061 } else {
c07f9fc5 2062 (*out)[p->second]++;
31f18b77
FG
2063 }
2064 }
2065 }
c07f9fc5
FG
2066}
2067
2068void OSDMonitor::count_metadata(const string& field, Formatter *f)
2069{
2070 map<string,int> by_val;
2071 count_metadata(field, &by_val);
31f18b77
FG
2072 f->open_object_section(field.c_str());
2073 for (auto& p : by_val) {
2074 f->dump_int(p.first.c_str(), p.second);
2075 }
2076 f->close_section();
2077}
2078
7c673cae
FG
2079int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
2080{
2081 map<string, string> metadata;
2082 int r = load_metadata(osd, metadata, nullptr);
2083 if (r < 0)
2084 return r;
2085
2086 auto it = metadata.find("osd_objectstore");
2087 if (it == metadata.end())
2088 return -ENOENT;
2089 *type = it->second;
2090 return 0;
2091}
2092
2093bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
2094 const pg_pool_t &pool,
2095 ostream *err)
2096{
2097 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2098 // since filestore osds could always join the pool later
2099 set<int> checked_osds;
11fdf7f2 2100 for (unsigned ps = 0; ps < std::min(8u, pool.get_pg_num()); ++ps) {
7c673cae 2101 vector<int> up, acting;
11fdf7f2 2102 pg_t pgid(ps, pool_id);
7c673cae
FG
2103 osdmap.pg_to_up_acting_osds(pgid, up, acting);
2104 for (int osd : up) {
2105 if (checked_osds.find(osd) != checked_osds.end())
2106 continue;
2107 string objectstore_type;
2108 int r = get_osd_objectstore_type(osd, &objectstore_type);
2109 // allow with missing metadata, e.g. due to an osd never booting yet
2110 if (r < 0 || objectstore_type == "bluestore") {
2111 checked_osds.insert(osd);
2112 continue;
2113 }
2114 *err << "osd." << osd << " uses " << objectstore_type;
2115 return false;
2116 }
2117 }
2118 return true;
2119}
2120
2121int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
2122{
2123 map<string,string> m;
2124 if (int r = load_metadata(osd, m, err))
2125 return r;
2126 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
2127 f->dump_string(p->first.c_str(), p->second);
2128 return 0;
2129}
2130
2131void OSDMonitor::print_nodes(Formatter *f)
2132{
2133 // group OSDs by their hosts
2134 map<string, list<int> > osds; // hostname => osd
2135 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
2136 map<string, string> m;
2137 if (load_metadata(osd, m, NULL)) {
2138 continue;
2139 }
2140 map<string, string>::iterator hostname = m.find("hostname");
2141 if (hostname == m.end()) {
2142 // not likely though
2143 continue;
2144 }
2145 osds[hostname->second].push_back(osd);
2146 }
2147
2148 dump_services(f, osds, "osd");
2149}
2150
// After a commit, push the newest incremental map to one randomly
// chosen up osd so the update can propagate through the cluster.
void OSDMonitor::share_map_with_random_osd()
{
  if (osdmap.get_num_up_osds() == 0) {
    dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
    return;
  }

  MonSession *s = mon->session_map.get_random_osd_session(&osdmap);
  if (!s) {
    dout(10) << __func__ << " no up osd on our session map" << dendl;
    return;
  }

  dout(10) << "committed, telling random " << s->name
	   << " all about it" << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = s->con_features ? s->con_features :
    mon->get_quorum_con_features();
  // whatev, they'll request more if they need it
  MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
  s->con->send_message(m);
  // NOTE: do *not* record osd has up to this epoch (as we do
  // elsewhere) as they may still need to request older values.
}
2177
// Compute the highest osdmap version that may safely be trimmed, or 0
// when trimming must be deferred (no quorum, pgs still being created,
// or trimming explicitly blocked via debug config).
version_t OSDMonitor::get_trim_to() const
{
  if (mon->get_quorum().empty()) {
    dout(10) << __func__ << ": quorum not formed" << dendl;
    return 0;
  }

  {
    // pgs still being created may require older maps; don't trim yet
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    if (!creating_pgs.pgs.empty()) {
      return 0;
    }
  }

  if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) {
    dout(0) << __func__
	    << " blocking osdmap trim"
	       " ('mon_debug_block_osdmap_trim' set to 'true')"
	    << dendl;
    return 0;
  }

  {
    // floor: oldest epoch any osd may still need (min last-epoch-clean)
    epoch_t floor = get_min_last_epoch_clean();
    dout(10) << " min_last_epoch_clean " << floor << dendl;
    if (g_conf()->mon_osd_force_trim_to > 0 &&
	g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) {
      // operator override (debug/repair): force the trim target
      floor = g_conf()->mon_osd_force_trim_to;
      dout(10) << " explicit mon_osd_force_trim_to = " << floor << dendl;
    }
    // always retain at least mon_min_osdmap_epochs committed maps
    unsigned min = g_conf()->mon_min_osdmap_epochs;
    if (floor + min > get_last_committed()) {
      if (min < get_last_committed())
	floor = get_last_committed() - min;
      else
	floor = 0;
    }
    if (floor > get_first_committed())
      return floor;
  }
  return 0;
}
2220
2221epoch_t OSDMonitor::get_min_last_epoch_clean() const
2222{
2223 auto floor = last_epoch_clean.get_lower_bound(osdmap);
2224 // also scan osd epochs
2225 // don't trim past the oldest reported osd epoch
2226 for (auto& osd_epoch : osd_epochs) {
1911f103 2227 if (osd_epoch.second < floor &&
f6b5b4d7 2228 osdmap.is_in(osd_epoch.first)) {
7c673cae
FG
2229 floor = osd_epoch.second;
2230 }
2231 }
2232 return floor;
2233}
2234
// Called while trimming up to version 'first': stash a full copy of
// that map so later incrementals still have a base, and keep the prune
// manifest consistent if pruning had pinned maps below 'first'.
void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
				   version_t first)
{
  dout(10) << __func__ << " including full map for e " << first << dendl;
  bufferlist bl;
  get_version_full(first, bl);
  put_version_full(tx, first, bl);

  // if the trim passes pinned maps, drop the now-stale pins from the
  // on-disk osdmap manifest as part of the same transaction
  if (has_osdmap_manifest &&
      first > osdmap_manifest.get_first_pinned()) {
    _prune_update_trimmed(tx, first);
  }
}
2248
11fdf7f2
TL
2249
2250/* full osdmap prune
2251 *
2252 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2253 */
2254
// Synchronize the in-memory osdmap manifest (the prune machinery's set
// of pinned full maps) with the store: reload it if present, or drop
// our cached copy if the store no longer has one.
void OSDMonitor::load_osdmap_manifest()
{
  bool store_has_manifest =
    mon->store->exists(get_service_name(), "osdmap_manifest");

  if (!store_has_manifest) {
    if (!has_osdmap_manifest) {
      return;
    }

    // the on-disk manifest went away (e.g. fully trimmed); forget ours
    dout(20) << __func__
	     << " dropping osdmap manifest from memory." << dendl;
    osdmap_manifest = osdmap_manifest_t();
    has_osdmap_manifest = false;
    return;
  }

  dout(20) << __func__
	   << " osdmap manifest detected in store; reload." << dendl;

  bufferlist manifest_bl;
  int r = get_value("osdmap_manifest", manifest_bl);
  if (r < 0) {
    // existence was just confirmed; a read failure here is fatal
    derr << __func__ << " unable to read osdmap version manifest" << dendl;
    ceph_abort_msg("error reading manifest");
  }
  osdmap_manifest.decode(manifest_bl);
  has_osdmap_manifest = true;

  dout(10) << __func__ << " store osdmap manifest pinned ("
	   << osdmap_manifest.get_first_pinned()
	   << " .. "
	   << osdmap_manifest.get_last_pinned()
	   << ")"
	   << dendl;
}
2291
// Decide whether a prune iteration should run now: we need enough
// committed epochs beyond mon_min_osdmap_epochs, at least
// mon_osdmap_full_prune_min prunable epochs, and at least one full
// prune interval between the last pinned map and the last prunable one.
bool OSDMonitor::should_prune() const
{
  version_t first = get_first_committed();
  version_t last = get_last_committed();
  version_t min_osdmap_epochs =
    g_conf().get_val<int64_t>("mon_min_osdmap_epochs");
  version_t prune_min =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
  version_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  version_t last_pinned = osdmap_manifest.get_last_pinned();
  version_t last_to_pin = last - min_osdmap_epochs;

  // Make it or break it constraints.
  //
  // If any of these conditions fails, we will not prune, regardless of
  // whether we have an on-disk manifest with an on-going pruning state.
  //
  if ((last - first) <= min_osdmap_epochs) {
    // between the first and last committed epochs, we don't have
    // enough epochs to trim, much less to prune.
    dout(10) << __func__
	     << " currently holding only " << (last - first)
	     << " epochs (min osdmap epochs: " << min_osdmap_epochs
	     << "); do not prune."
	     << dendl;
    return false;

  } else if ((last_to_pin - first) < prune_min) {
    // between the first committed epoch and the last epoch we would prune,
    // we simply don't have enough versions over the minimum to prune maps.
    dout(10) << __func__
	     << " could only prune " << (last_to_pin - first)
	     << " epochs (" << first << ".." << last_to_pin << "), which"
	        " is less than the required minimum (" << prune_min << ")"
	     << dendl;
    return false;

  } else if (has_osdmap_manifest && last_pinned >= last_to_pin) {
    dout(10) << __func__
	     << " we have pruned as far as we can; do not prune."
	     << dendl;
    return false;

  } else if (last_pinned + prune_interval > last_to_pin) {
    dout(10) << __func__
	     << " not enough epochs to form an interval (last pinned: "
	     << last_pinned << ", last to pin: "
	     << last_to_pin << ", interval: " << prune_interval << ")"
	     << dendl;
    return false;
  }

  dout(15) << __func__
	   << " should prune (" << last_pinned << ".." << last_to_pin << ")"
	   << " lc (" << first << ".." << last << ")"
	   << dendl;
  return true;
}
2351
2352void OSDMonitor::_prune_update_trimmed(
2353 MonitorDBStore::TransactionRef tx,
2354 version_t first)
2355{
2356 dout(10) << __func__
2357 << " first " << first
2358 << " last_pinned " << osdmap_manifest.get_last_pinned()
2359 << " last_pinned " << osdmap_manifest.get_last_pinned()
2360 << dendl;
2361
2362 osdmap_manifest_t manifest = osdmap_manifest;
2363
2364 if (!manifest.is_pinned(first)) {
2365 manifest.pin(first);
2366 }
2367
2368 set<version_t>::iterator p_end = manifest.pinned.find(first);
2369 set<version_t>::iterator p = manifest.pinned.begin();
2370 manifest.pinned.erase(p, p_end);
2371 ceph_assert(manifest.get_first_pinned() == first);
2372
2373 if (manifest.get_last_pinned() == first+1 ||
2374 manifest.pinned.size() == 1) {
2375 // we reached the end of the line, as pinned maps go; clean up our
2376 // manifest, and let `should_prune()` decide whether we should prune
2377 // again.
2378 tx->erase(get_service_name(), "osdmap_manifest");
2379 return;
2380 }
2381
2382 bufferlist bl;
2383 manifest.encode(bl);
2384 tx->put(get_service_name(), "osdmap_manifest", bl);
2385}
2386
// Seed 'manifest' with the first pin of a prune pass: the first
// committed map when no manifest exists, or the last previously pinned
// map when resuming an existing manifest.
void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
{
  dout(1) << __func__ << dendl;

  version_t pin_first;

  // verify constraints on stable in-memory state
  if (!has_osdmap_manifest) {
    // we must have never pruned, OR if we pruned the state must no longer
    // be relevant (i.e., the state must have been removed alongside with
    // the trim that *must* have removed past the last pinned map in a
    // previous prune).
    ceph_assert(osdmap_manifest.pinned.empty());
    ceph_assert(!mon->store->exists(get_service_name(), "osdmap_manifest"));
    pin_first = get_first_committed();

  } else {
    // we must have pruned in the past AND its state is still relevant
    // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
    // and thus we still hold a manifest in the store).
    ceph_assert(!osdmap_manifest.pinned.empty());
    ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
    ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());

    dout(10) << __func__
	     << " first_pinned " << osdmap_manifest.get_first_pinned()
	     << " last_pinned " << osdmap_manifest.get_last_pinned()
	     << dendl;

    pin_first = osdmap_manifest.get_last_pinned();
  }

  manifest.pin(pin_first);
}
2421
2422bool OSDMonitor::_prune_sanitize_options() const
2423{
2424 uint64_t prune_interval =
2425 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2426 uint64_t prune_min =
2427 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
2428 uint64_t txsize =
2429 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2430
2431 bool r = true;
2432
2433 if (prune_interval == 0) {
2434 derr << __func__
2435 << " prune is enabled BUT prune interval is zero; abort."
2436 << dendl;
2437 r = false;
2438 } else if (prune_interval == 1) {
2439 derr << __func__
2440 << " prune interval is equal to one, which essentially means"
2441 " no pruning; abort."
2442 << dendl;
2443 r = false;
2444 }
2445 if (prune_min == 0) {
2446 derr << __func__
2447 << " prune is enabled BUT prune min is zero; abort."
2448 << dendl;
2449 r = false;
2450 }
2451 if (prune_interval > prune_min) {
2452 derr << __func__
2453 << " impossible to ascertain proper prune interval because"
2454 << " it is greater than the minimum prune epochs"
2455 << " (min: " << prune_min << ", interval: " << prune_interval << ")"
2456 << dendl;
2457 r = false;
2458 }
2459
2460 if (txsize < prune_interval - 1) {
2461 derr << __func__
2462 << "'mon_osdmap_full_prune_txsize' (" << txsize
2463 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
2464 << "); abort." << dendl;
2465 r = false;
2466 }
2467 return r;
2468}
2469
// True when the admin enabled full-osdmap pruning via config.
bool OSDMonitor::is_prune_enabled() const {
  return g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
}
2473
// True when the required monitor feature set includes OSDMAP_PRUNE,
// i.e. all quorum members understand the osdmap manifest.
bool OSDMonitor::is_prune_supported() const {
  return mon->get_required_mon_features().contains_any(
      ceph::features::mon::FEATURE_OSDMAP_PRUNE);
}
2478
2479/** do_prune
2480 *
2481 * @returns true if has side-effects; false otherwise.
2482 */
2483bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
2484{
2485 bool enabled = is_prune_enabled();
2486
2487 dout(1) << __func__ << " osdmap full prune "
2488 << ( enabled ? "enabled" : "disabled")
2489 << dendl;
2490
2491 if (!enabled || !_prune_sanitize_options() || !should_prune()) {
2492 return false;
2493 }
2494
2495 // we are beyond the minimum prune versions, we need to remove maps because
2496 // otherwise the store will grow unbounded and we may end up having issues
2497 // with available disk space or store hangs.
2498
2499 // we will not pin all versions. We will leave a buffer number of versions.
2500 // this allows us the monitor to trim maps without caring too much about
2501 // pinned maps, and then allow us to use another ceph-mon without these
2502 // capabilities, without having to repair the store.
2503
2504 osdmap_manifest_t manifest = osdmap_manifest;
2505
2506 version_t first = get_first_committed();
2507 version_t last = get_last_committed();
2508
2509 version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
2510 version_t last_pinned = manifest.get_last_pinned();
2511 uint64_t prune_interval =
2512 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2513 uint64_t txsize =
2514 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2515
2516 prune_init(manifest);
2517
2518 // we need to get rid of some osdmaps
2519
2520 dout(5) << __func__
2521 << " lc (" << first << " .. " << last << ")"
2522 << " last_pinned " << last_pinned
2523 << " interval " << prune_interval
2524 << " last_to_pin " << last_to_pin
2525 << dendl;
2526
2527 // We will be erasing maps as we go.
2528 //
2529 // We will erase all maps between `last_pinned` and the `next_to_pin`.
2530 //
2531 // If `next_to_pin` happens to be greater than `last_to_pin`, then
2532 // we stop pruning. We could prune the maps between `next_to_pin` and
2533 // `last_to_pin`, but by not doing it we end up with neater pruned
2534 // intervals, aligned with `prune_interval`. Besides, this should not be a
2535 // problem as long as `prune_interval` is set to a sane value, instead of
2536 // hundreds or thousands of maps.
2537
2538 auto map_exists = [this](version_t v) {
2539 string k = mon->store->combine_strings("full", v);
2540 return mon->store->exists(get_service_name(), k);
2541 };
2542
2543 // 'interval' represents the number of maps from the last pinned
2544 // i.e., if we pinned version 1 and have an interval of 10, we're pinning
2545 // version 11 next; all intermediate versions will be removed.
2546 //
2547 // 'txsize' represents the maximum number of versions we'll be removing in
2548 // this iteration. If 'txsize' is large enough to perform multiple passes
2549 // pinning and removing maps, we will do so; if not, we'll do at least one
2550 // pass. We are quite relaxed about honouring 'txsize', but we'll always
2551 // ensure that we never go *over* the maximum.
2552
2553 // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
2554 uint64_t removal_interval = prune_interval - 1;
2555
2556 if (txsize < removal_interval) {
2557 dout(5) << __func__
2558 << " setting txsize to removal interval size ("
2559 << removal_interval << " versions"
2560 << dendl;
2561 txsize = removal_interval;
2562 }
2563 ceph_assert(removal_interval > 0);
2564
2565 uint64_t num_pruned = 0;
2566 while (num_pruned + removal_interval <= txsize) {
2567 last_pinned = manifest.get_last_pinned();
2568
2569 if (last_pinned + prune_interval > last_to_pin) {
2570 break;
2571 }
2572 ceph_assert(last_pinned < last_to_pin);
2573
2574 version_t next_pinned = last_pinned + prune_interval;
2575 ceph_assert(next_pinned <= last_to_pin);
2576 manifest.pin(next_pinned);
2577
2578 dout(20) << __func__
2579 << " last_pinned " << last_pinned
2580 << " next_pinned " << next_pinned
2581 << " num_pruned " << num_pruned
2582 << " removal interval (" << (last_pinned+1)
2583 << ".." << (next_pinned-1) << ")"
2584 << " txsize " << txsize << dendl;
2585
2586 ceph_assert(map_exists(last_pinned));
2587 ceph_assert(map_exists(next_pinned));
2588
2589 for (version_t v = last_pinned+1; v < next_pinned; ++v) {
2590 ceph_assert(!manifest.is_pinned(v));
2591
2592 dout(20) << __func__ << " pruning full osdmap e" << v << dendl;
2593 string full_key = mon->store->combine_strings("full", v);
2594 tx->erase(get_service_name(), full_key);
2595 ++num_pruned;
2596 }
2597 }
2598
2599 ceph_assert(num_pruned > 0);
2600
2601 bufferlist bl;
2602 manifest.encode(bl);
2603 tx->put(get_service_name(), "osdmap_manifest", bl);
2604
2605 return true;
2606}
2607
2608
7c673cae
FG
2609// -------------
2610
// Read-only / damping fast path for incoming messages.  Returns true
// when the message was fully handled here; false means it must go to
// prepare_update() to mutate the pending osdmap.
bool OSDMonitor::preprocess_query(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // READs
  case MSG_MON_COMMAND:
    try {
      return preprocess_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command payload: reply EINVAL instead of asserting
      bufferlist bl;
      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }
  case CEPH_MSG_MON_GET_OSDMAP:
    return preprocess_get_osdmap(op);

    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return preprocess_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return preprocess_mark_me_dead(op);
  case MSG_OSD_FULL:
    return preprocess_full(op);
  case MSG_OSD_FAILURE:
    return preprocess_failure(op);
  case MSG_OSD_BOOT:
    return preprocess_boot(op);
  case MSG_OSD_ALIVE:
    return preprocess_alive(op);
  case MSG_OSD_PG_CREATED:
    return preprocess_pg_created(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return preprocess_pg_ready_to_merge(op);
  case MSG_OSD_PGTEMP:
    return preprocess_pgtemp(op);
  case MSG_OSD_BEACON:
    return preprocess_beacon(op);

  case CEPH_MSG_POOLOP:
    return preprocess_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return preprocess_remove_snaps(op);

  case MSG_MON_GET_PURGED_SNAPS:
    return preprocess_get_purged_snaps(op);

  default:
    // every type routed to this service must be handled above
    ceph_abort();
    return true;
  }
}
2666
// Mutation path: apply a message's effect to the pending osdmap
// increment.  Returns true when pending_inc was updated and should be
// proposed; false when the message was deferred or dropped.
bool OSDMonitor::prepare_update(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return prepare_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return prepare_mark_me_dead(op);
  case MSG_OSD_FULL:
    return prepare_full(op);
  case MSG_OSD_FAILURE:
    return prepare_failure(op);
  case MSG_OSD_BOOT:
    return prepare_boot(op);
  case MSG_OSD_ALIVE:
    return prepare_alive(op);
  case MSG_OSD_PG_CREATED:
    return prepare_pg_created(op);
  case MSG_OSD_PGTEMP:
    return prepare_pgtemp(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return prepare_pg_ready_to_merge(op);
  case MSG_OSD_BEACON:
    return prepare_beacon(op);

  case MSG_MON_COMMAND:
    try {
      return prepare_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command payload: reply EINVAL instead of asserting
      bufferlist bl;
      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }

  case CEPH_MSG_POOLOP:
    return prepare_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return prepare_remove_snaps(op);


  default:
    // every type routed to this service must be handled above
    ceph_abort();
  }

  return false;
}
2718
2719bool OSDMonitor::should_propose(double& delay)
2720{
2721 dout(10) << "should_propose" << dendl;
2722
2723 // if full map, propose immediately! any subsequent changes will be clobbered.
2724 if (pending_inc.fullmap.length())
2725 return true;
2726
2727 // adjust osd weights?
2728 if (!osd_weight.empty() &&
2729 osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
2730 dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
2731 osdmap.adjust_osd_weights(osd_weight, pending_inc);
2732 delay = 0.0;
2733 osd_weight.clear();
2734 return true;
2735 }
2736
7c673cae
FG
2737 return PaxosService::should_propose(delay);
2738}
2739
2740
2741
2742// ---------------------------
2743// READs
2744
// Reply to MMonGetOSDMap with the requested range of full and
// incremental maps, encoded with the peer's feature set and capped by
// both a message-count and a byte budget.
bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MMonGetOSDMap>();

  // encode with the requester's features; fall back to the quorum's
  // feature set for anonymous connections
  uint64_t features = mon->get_quorum_con_features();
  if (op->get_session() && op->get_session()->con_features)
    features = op->get_session()->con_features;

  dout(10) << __func__ << " " << *m << dendl;
  MOSDMap *reply = new MOSDMap(mon->monmap->fsid, features);
  epoch_t first = get_first_committed();
  epoch_t last = osdmap.get_epoch();
  // cap the reply by both map count and total bytes
  int max = g_conf()->osd_map_message_max;
  ssize_t max_bytes = g_conf()->osd_map_message_max_bytes;
  for (epoch_t e = std::max(first, m->get_full_first());
       e <= std::min(last, m->get_full_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->maps[e];
    int r = get_version_full(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  for (epoch_t e = std::max(first, m->get_inc_first());
       e <= std::min(last, m->get_inc_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->incremental_maps[e];
    int r = get_version(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  reply->oldest_map = first;
  reply->newest_map = last;
  mon->send_reply(op, reply);
  return true;
}
2781
2782
2783// ---------------------------
2784// UPDATEs
2785
2786// failure --
2787
11fdf7f2 2788bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
7c673cae 2789 // check permissions
11fdf7f2 2790 MonSession *session = op->get_session();
7c673cae
FG
2791 if (!session)
2792 return true;
2793 if (!session->is_capable("osd", MON_CAP_X)) {
2794 dout(0) << "got MOSDFailure from entity with insufficient caps "
2795 << session->caps << dendl;
2796 return true;
2797 }
2798 if (fsid != mon->monmap->fsid) {
2799 dout(0) << "check_source: on fsid " << fsid
2800 << " != " << mon->monmap->fsid << dendl;
2801 return true;
2802 }
2803 return false;
2804}
2805
2806
// Read-only screening of an MOSDFailure report.  Returns true when the op
// has been fully handled here (ignored, deduplicated, or answered with an
// incremental map) and false when it must continue to prepare_failure()
// to mutate the pending osdmap.
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  // who is target_osd
  int badboy = m->get_target_osd();

  // check permissions
  if (check_source(op, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    // a reporter that is itself gone (or down, for a failure report) is
    // stale; nudge it with newer maps instead of acting on the report
    if (!osdmap.exists(from) ||
	!osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
	(osdmap.is_down(from) && m->if_osd_failed())) {
      dout(5) << "preprocess_failure from dead osd." << from
	      << ", ignoring" << dendl;
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?
  if (osdmap.is_down(badboy)) {
    dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  // report addressed to a different incarnation of this osd id?
  if (osdmap.get_addrs(badboy) != m->get_target_addrs()) {
    dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << " != map's " << osdmap.get_addrs(badboy)
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // nodown flag or min-up-ratio protection can veto the mark-down
  if (!can_mark_down(badboy)) {
    dout(5) << "preprocess_failure ignoring report of osd."
	    << m->get_target_osd() << " " << m->get_target_addrs()
	    << " from " << m->get_orig_source() << dendl;
    goto didit;
  }

  // genuinely new report; hand off to prepare_failure()
  dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
	   << " " << m->get_target_addrs()
	   << ", from " << m->get_orig_source() << dendl;
  return false;

 didit:
  mon->no_reply(op);
  return true;
}
2878
// Completion context for MOSDMarkMeDown: once the requested map change has
// been committed it echoes the request back to the osd as the ACK; on
// -EAGAIN (proposal preempted) the original op is redispatched.
class C_AckMarkedDown : public C_MonOp {
  OSDMonitor *osdmon;
public:
  C_AckMarkedDown(
    OSDMonitor *osdmon,
    MonOpRequestRef op)
    : C_MonOp(op), osdmon(osdmon) {}

  void _finish(int r) override {
    if (r == 0) {
      auto m = op->get_req<MOSDMarkMeDown>();
      // reply with a copy of the request; request_ack=false so the
      // reply does not in turn ask to be acknowledged
      osdmon->mon->send_reply(
	op,
	new MOSDMarkMeDown(
	  m->fsid,
	  m->target_osd,
	  m->target_addrs,
	  m->get_epoch(),
	  false)); // ACK itself does not request an ack
    } else if (r == -EAGAIN) {
      // proposal was interrupted; retry the whole op
      osdmon->dispatch(op);
    } else {
      ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r);
    }
  }
  ~C_AckMarkedDown() override {
  }
};
2907
// Read-only screening of an osd's request to mark itself down.  Returns
// true when handled here (invalid sender, stale instance, or nodown veto;
// an ACK is still generated if one was requested) and false to continue
// to prepare_mark_me_down().
bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDown>();
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid))
    goto reply;

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd())
    goto reply;

  // the request must come from the current, up instance of that osd id
  if (!osdmap.exists(from) ||
      osdmap.is_down(from) ||
      osdmap.get_addrs(from) != m->target_addrs) {
    dout(5) << "preprocess_mark_me_down from dead osd."
	    << from << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    goto reply;
  }

  // no down might be set
  if (!can_mark_down(from))
    goto reply;

  dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
	   << " " << m->target_addrs << dendl;
  return false;

 reply:
  // even when ignoring the request, honor the ack so the osd can shut
  // down without waiting for a timeout
  if (m->request_ack) {
    Context *c(new C_AckMarkedDown(this, op));
    c->complete(0);
  }
  return true;
}
2946
2947bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
2948{
2949 op->mark_osdmon_event(__func__);
9f95a23c 2950 auto m = op->get_req<MOSDMarkMeDown>();
11fdf7f2 2951 int target_osd = m->target_osd;
7c673cae 2952
11fdf7f2
TL
2953 ceph_assert(osdmap.is_up(target_osd));
2954 ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);
7c673cae
FG
2955
2956 mon->clog->info() << "osd." << target_osd << " marked itself down";
2957 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
2958 if (m->request_ack)
2959 wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
2960 return true;
2961}
2962
9f95a23c
TL
2963bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op)
2964{
2965 op->mark_osdmon_event(__func__);
2966 auto m = op->get_req<MOSDMarkMeDead>();
2967 int from = m->target_osd;
2968
2969 // check permissions
2970 if (check_source(op, m->fsid)) {
2971 mon->no_reply(op);
2972 return true;
2973 }
2974
2975 // first, verify the reporting host is valid
2976 if (!m->get_orig_source().is_osd()) {
2977 mon->no_reply(op);
2978 return true;
2979 }
2980
2981 if (!osdmap.exists(from) ||
2982 !osdmap.is_down(from)) {
2983 dout(5) << __func__ << " from nonexistent or up osd." << from
2984 << ", ignoring" << dendl;
2985 send_incremental(op, m->get_epoch()+1);
2986 mon->no_reply(op);
2987 return true;
2988 }
2989
2990 return false;
2991}
2992
// Record that a (already down) osd has declared itself dead at a given
// epoch, by stamping dead_epoch into its extended info in the pending
// incremental.  No reply is sent on success.
bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDead>();
  int target_osd = m->target_osd;

  // preprocess_mark_me_dead() guarantees the osd is down
  ceph_assert(osdmap.is_down(target_osd));

  mon->clog->info() << "osd." << target_osd << " marked itself dead as of e"
		    << m->get_epoch();
  // copy-on-write the xinfo into the pending incremental before mutating
  if (!pending_inc.new_xinfo.count(target_osd)) {
    pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
  }
  pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
  // after the proposal commits there is nothing to send back; drop the op
  wait_for_finished_proposal(
    op,
    new LambdaContext(
      [op, this] (int r) {
	if (r >= 0) {
	  mon->no_reply(op); // ignore on success
	}
      }
      ));
  return true;
}
3018
7c673cae
FG
3019bool OSDMonitor::can_mark_down(int i)
3020{
31f18b77
FG
3021 if (osdmap.is_nodown(i)) {
3022 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
3023 << "will not mark it down" << dendl;
7c673cae
FG
3024 return false;
3025 }
31f18b77 3026
7c673cae
FG
3027 int num_osds = osdmap.get_num_osds();
3028 if (num_osds == 0) {
31f18b77 3029 dout(5) << __func__ << " no osds" << dendl;
7c673cae
FG
3030 return false;
3031 }
3032 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
3033 float up_ratio = (float)up / (float)num_osds;
11fdf7f2 3034 if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
31f18b77 3035 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
11fdf7f2 3036 << g_conf()->mon_osd_min_up_ratio
7c673cae
FG
3037 << ", will not mark osd." << i << " down" << dendl;
3038 return false;
3039 }
3040 return true;
3041}
3042
3043bool OSDMonitor::can_mark_up(int i)
3044{
31f18b77
FG
3045 if (osdmap.is_noup(i)) {
3046 dout(5) << __func__ << " osd." << i << " is marked as noup, "
3047 << "will not mark it up" << dendl;
7c673cae
FG
3048 return false;
3049 }
31f18b77 3050
7c673cae
FG
3051 return true;
3052}
3053
3054/**
3055 * @note the parameter @p i apparently only exists here so we can output the
3056 * osd's id on messages.
3057 */
3058bool OSDMonitor::can_mark_out(int i)
3059{
31f18b77
FG
3060 if (osdmap.is_noout(i)) {
3061 dout(5) << __func__ << " osd." << i << " is marked as noout, "
3062 << "will not mark it out" << dendl;
3063 return false;
3064 }
3065
7c673cae
FG
3066 int num_osds = osdmap.get_num_osds();
3067 if (num_osds == 0) {
3068 dout(5) << __func__ << " no osds" << dendl;
3069 return false;
3070 }
3071 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
3072 float in_ratio = (float)in / (float)num_osds;
11fdf7f2 3073 if (in_ratio < g_conf()->mon_osd_min_in_ratio) {
7c673cae
FG
3074 if (i >= 0)
3075 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
11fdf7f2 3076 << g_conf()->mon_osd_min_in_ratio
7c673cae
FG
3077 << ", will not mark osd." << i << " out" << dendl;
3078 else
3079 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
11fdf7f2 3080 << g_conf()->mon_osd_min_in_ratio
7c673cae
FG
3081 << ", will not mark osds out" << dendl;
3082 return false;
3083 }
3084
3085 return true;
3086}
3087
3088bool OSDMonitor::can_mark_in(int i)
3089{
31f18b77
FG
3090 if (osdmap.is_noin(i)) {
3091 dout(5) << __func__ << " osd." << i << " is marked as noin, "
3092 << "will not mark it in" << dendl;
7c673cae
FG
3093 return false;
3094 }
31f18b77 3095
7c673cae
FG
3096 return true;
3097}
3098
3099bool OSDMonitor::check_failures(utime_t now)
3100{
3101 bool found_failure = false;
3102 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3103 p != failure_info.end();
3104 ++p) {
3105 if (can_mark_down(p->first)) {
3106 found_failure |= check_failure(now, p->first, p->second);
3107 }
3108 }
3109 return found_failure;
3110}
3111
3112bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
3113{
3114 // already pending failure?
3115 if (pending_inc.new_state.count(target_osd) &&
3116 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3117 dout(10) << " already pending failure" << dendl;
3118 return true;
3119 }
3120
3121 set<string> reporters_by_subtree;
11fdf7f2
TL
3122 auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
3123 utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
7c673cae
FG
3124 utime_t max_failed_since = fi.get_failed_since();
3125 utime_t failed_for = now - max_failed_since;
3126
3127 utime_t grace = orig_grace;
3128 double my_grace = 0, peer_grace = 0;
3129 double decay_k = 0;
11fdf7f2
TL
3130 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
3131 double halflife = (double)g_conf()->mon_osd_laggy_halflife;
7c673cae
FG
3132 decay_k = ::log(.5) / halflife;
3133
3134 // scale grace period based on historical probability of 'lagginess'
3135 // (false positive failures due to slowness).
3136 const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
3137 double decay = exp((double)failed_for * decay_k);
3138 dout(20) << " halflife " << halflife << " decay_k " << decay_k
3139 << " failed_for " << failed_for << " decay " << decay << dendl;
3140 my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
3141 grace += my_grace;
3142 }
3143
3144 // consider the peers reporting a failure a proxy for a potential
3145 // 'subcluster' over the overall cluster that is similarly
3146 // laggy. this is clearly not true in all cases, but will sometimes
3147 // help us localize the grace correction to a subset of the system
3148 // (say, a rack with a bad switch) that is unhappy.
11fdf7f2 3149 ceph_assert(fi.reporters.size());
eafe8130 3150 for (auto p = fi.reporters.begin(); p != fi.reporters.end();) {
7c673cae
FG
3151 // get the parent bucket whose type matches with "reporter_subtree_level".
3152 // fall back to OSD if the level doesn't exist.
eafe8130
TL
3153 if (osdmap.exists(p->first)) {
3154 auto reporter_loc = osdmap.crush->get_full_location(p->first);
3155 if (auto iter = reporter_loc.find(reporter_subtree_level);
3156 iter == reporter_loc.end()) {
3157 reporters_by_subtree.insert("osd." + to_string(p->first));
3158 } else {
3159 reporters_by_subtree.insert(iter->second);
3160 }
3161 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
3162 const osd_xinfo_t& xi = osdmap.get_xinfo(p->first);
3163 utime_t elapsed = now - xi.down_stamp;
3164 double decay = exp((double)elapsed * decay_k);
3165 peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
3166 }
3167 ++p;
7c673cae 3168 } else {
eafe8130
TL
3169 fi.cancel_report(p->first);;
3170 p = fi.reporters.erase(p);
7c673cae
FG
3171 }
3172 }
3173
11fdf7f2 3174 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
7c673cae
FG
3175 peer_grace /= (double)fi.reporters.size();
3176 grace += peer_grace;
3177 }
3178
3179 dout(10) << " osd." << target_osd << " has "
3180 << fi.reporters.size() << " reporters, "
3181 << grace << " grace (" << orig_grace << " + " << my_grace
3182 << " + " << peer_grace << "), max_failed_since " << max_failed_since
3183 << dendl;
3184
3185 if (failed_for >= grace &&
11fdf7f2 3186 reporters_by_subtree.size() >= g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
7c673cae
FG
3187 dout(1) << " we have enough reporters to mark osd." << target_osd
3188 << " down" << dendl;
3189 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3190
31f18b77
FG
3191 mon->clog->info() << "osd." << target_osd << " failed ("
3192 << osdmap.crush->get_full_location_ordered_string(
3193 target_osd)
3194 << ") ("
3195 << (int)reporters_by_subtree.size()
3196 << " reporters from different "
7c673cae
FG
3197 << reporter_subtree_level << " after "
3198 << failed_for << " >= grace " << grace << ")";
3199 return true;
3200 }
3201 return false;
3202}
3203
224ce89b 3204void OSDMonitor::force_failure(int target_osd, int by)
7c673cae
FG
3205{
3206 // already pending failure?
3207 if (pending_inc.new_state.count(target_osd) &&
3208 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3209 dout(10) << " already pending failure" << dendl;
3210 return;
3211 }
3212
3213 dout(1) << " we're forcing failure of osd." << target_osd << dendl;
3214 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
9f95a23c
TL
3215 if (!pending_inc.new_xinfo.count(target_osd)) {
3216 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3217 }
3218 pending_inc.new_xinfo[target_osd].dead_epoch = pending_inc.epoch;
7c673cae 3219
31f18b77
FG
3220 mon->clog->info() << "osd." << target_osd << " failed ("
3221 << osdmap.crush->get_full_location_ordered_string(target_osd)
3222 << ") (connection refused reported by osd." << by << ")";
7c673cae
FG
3223 return;
3224}
3225
// Update-phase handling of an MOSDFailure: either registers a new failure
// report against the target (possibly marking it down right away) or
// cancels a previously filed report.  Returns true when a map proposal is
// now warranted.
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  dout(1) << "prepare_failure osd." << m->get_target_osd()
	  << " " << m->get_target_addrs()
	  << " from " << m->get_orig_source()
	  << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target_osd();
  int reporter = m->get_orig_source().num();
  // preprocess_failure() already validated target identity and up state
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());

  mon->no_reply(op);

  if (m->if_osd_failed()) {
    // calculate failure time
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      // e.g. connection-refused reports skip the grace machinery entirely
      mon->clog->debug() << "osd." << m->get_target_osd()
			 << " reported immediately failed by "
			 << m->get_orig_source();
      force_failure(target_osd, reporter);
      return true;
    }
    mon->clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
		       << m->get_orig_source();

    failure_info_t& fi = failure_info[target_osd];
    // a newer report from the same reporter supersedes the old op
    MonOpRequestRef old_op = fi.add_report(reporter, failed_since, op);
    if (old_op) {
      mon->no_reply(old_op);
    }

    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon->clog->debug() << "osd." << m->get_target_osd()
		       << " failure report canceled by "
		       << m->get_orig_source();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      MonOpRequestRef report_op = fi.cancel_report(reporter);
      if (report_op) {
	mon->no_reply(report_op);
      }
      if (fi.reporters.empty()) {
	dout(10) << " removing last failure_info for osd." << target_osd
		 << dendl;
	failure_info.erase(target_osd);
      } else {
	dout(10) << " failure_info for osd." << target_osd << " now "
		 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
  }

  return false;
}
3292
3293void OSDMonitor::process_failures()
3294{
3295 map<int,failure_info_t>::iterator p = failure_info.begin();
3296 while (p != failure_info.end()) {
3297 if (osdmap.is_up(p->first)) {
3298 ++p;
3299 } else {
3300 dout(10) << "process_failures osd." << p->first << dendl;
3301 list<MonOpRequestRef> ls;
3302 p->second.take_report_messages(ls);
3303 failure_info.erase(p++);
3304
3305 while (!ls.empty()) {
3306 MonOpRequestRef o = ls.front();
3307 if (o) {
3308 o->mark_event(__func__);
3309 MOSDFailure *m = o->get_req<MOSDFailure>();
3310 send_latest(o, m->get_epoch());
28e407b8 3311 mon->no_reply(o);
7c673cae
FG
3312 }
3313 ls.pop_front();
3314 }
3315 }
3316 }
3317}
3318
3319void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
3320{
3321 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
3322
3323 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3324 p != failure_info.end();
3325 ++p) {
3326 p->second.take_report_messages(ls);
3327 }
3328 failure_info.clear();
3329}
3330
f6b5b4d7
TL
3331int OSDMonitor::get_grace_interval_threshold()
3332{
3333 int halflife = g_conf()->mon_osd_laggy_halflife;
3334 // Scale the halflife period (default: 1_hr) by
3335 // a factor (48) to calculate the threshold.
3336 int grace_threshold_factor = 48;
3337 return halflife * grace_threshold_factor;
3338}
3339
3340bool OSDMonitor::grace_interval_threshold_exceeded(int last_failed_interval)
3341{
3342 int grace_interval_threshold_secs = get_grace_interval_threshold();
3343 if (last_failed_interval > grace_interval_threshold_secs) {
3344 dout(1) << " last_failed_interval " << last_failed_interval
3345 << " > grace_interval_threshold_secs " << grace_interval_threshold_secs
3346 << dendl;
3347 return true;
3348 }
3349 return false;
3350}
3351
3352void OSDMonitor::set_default_laggy_params(int target_osd)
3353{
3354 if (pending_inc.new_xinfo.count(target_osd) == 0) {
3355 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3356 }
3357 osd_xinfo_t& xi = pending_inc.new_xinfo[target_osd];
3358 xi.down_stamp = pending_inc.modified;
3359 xi.laggy_probability = 0.0;
3360 xi.laggy_interval = 0;
3361 dout(20) << __func__ << " reset laggy, now xi " << xi << dendl;
3362}
3363
7c673cae
FG
3364
3365// boot --
3366
// Read-only screening of an MOSDBoot.  Filters out unauthorized, foreign,
// malformed, feature-incompatible, duplicate, or stale boot messages.
// Returns true when handled here; false to continue to prepare_boot().
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
	    << session->caps << dendl;
    goto ignore;
  }

  // boot must come from this cluster
  if (m->sb.cluster_fsid != mon->monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
	    << " != " << mon->monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  ceph_assert(m->get_orig_source_inst().name.is_osd());

  // force all osds to have gone through luminous prior to upgrade to nautilus
  {
    vector<string> missing;
    if (!HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) {
      missing.push_back("CEPH_FEATURE_SERVER_LUMINOUS");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_JEWEL)) {
      missing.push_back("CEPH_FEATURE_SERVER_JEWEL");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) {
      missing.push_back("CEPH_FEATURE_SERVER_KRAKEN");
    }
    if (!HAVE_FEATURE(m->osd_features, OSD_RECOVERY_DELETES)) {
      missing.push_back("CEPH_FEATURE_OSD_RECOVERY_DELETES");
    }

    if (!missing.empty()) {
      using std::experimental::make_ostream_joiner;

      // build a ";"-separated list of the missing feature names for the log
      stringstream ss;
      copy(begin(missing), end(missing), make_ostream_joiner(ss, ";"));

      mon->clog->info() << "disallowing boot of OSD "
			<< m->get_orig_source_inst()
			<< " because the osd lacks " << ss.str();
      goto ignore;
    }
  }

  // make sure osd versions do not span more than 3 releases
  if (HAVE_FEATURE(m->osd_features, SERVER_OCTOPUS) &&
      osdmap.require_osd_release < ceph_release_t::mimic) {
    mon->clog->info() << "disallowing boot of octopus+ OSD "
		      << m->get_orig_source_inst()
		      << " because require_osd_release < mimic";
    goto ignore;
  }

  // The release check here is required because for OSD_PGLOG_HARDLIMIT,
  // we are reusing a jewel feature bit that was retired in luminous.
  if (osdmap.require_osd_release >= ceph_release_t::luminous &&
      osdmap.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT) &&
      !(m->osd_features & CEPH_FEATURE_OSD_PGLOG_HARDLIMIT)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
    goto ignore;
  }

  // already booted?
  if (osdmap.is_up(from) &&
      osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
      osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
    // yup.  just resend the maps it wants; no map change needed.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source()
	    << " " << m->get_orig_source_addrs()
	    << " =~ " << osdmap.get_addrs(from) << dendl;
    _booted(op, false);
    return true;
  }

  // same id but a different osd_fsid: refuse to clobber the existing osd
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
	    << " clashes with existing osd: different fsid"
	    << " (ours: " << osdmap.get_uuid(from)
	    << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // boot message predates the osd's most recent up interval: stale
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_addrs(from).legacy_equals(
	m->get_orig_source_addrs())) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3489
// Update-phase handling of an MOSDBoot: queue the osd's up transition
// (addresses, uuid, metadata, clean intervals, lagginess stats, features,
// and optionally an automatic mark-in) into the pending incremental, or
// first mark the previous incarnation down and retry.
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  dout(7) << __func__ << " from " << m->get_source()
	  << " sb " << m->sb
	  << " client_addrs" << m->get_connection()->get_peer_addrs()
	  << " cluster_addrs " << m->cluster_addrs
	  << " hb_back_addrs " << m->hb_back_addrs
	  << " hb_front_addrs " << m->hb_front_addrs
	  << dendl;

  ceph_assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
	    << osdmap.get_max_osd() << dendl;
    return false;
  }

  // effective current state: committed state XORed with any pending change
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up? mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down osd." << from << " "
	    << osdmap.get_addrs(from) << dendl;
    // preprocess should have caught these; if not, assert.
    ceph_assert(!osdmap.get_addrs(from).legacy_equals(
		  m->get_orig_source_addrs()) ||
		!osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
    ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
	(pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    // retry the boot once the down transition has committed
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
	    << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addrs();
    pending_inc.new_up_cluster[from] = m->cluster_addrs;
    pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
    pending_inc.new_hb_front_up[from] = m->hb_front_addrs;

    down_pending_out.erase(from); // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
	     << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this; if not, assert.
      ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd?
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
	dout(10) << " fresh osd; marking lost_at too" << dendl;
	pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata: stash the osd's reported metadata map for later commit
    bufferlist osd_metadata;
    encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
	(m->sb.mounted == info.last_clean_begin &&
	 m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
	       << "[" << info.last_clean_begin << "," << info.last_clean_end
	       << ") -> [" << begin << "-" << end << ")"
	       << dendl;
      pending_inc.new_last_clean_interval[from] =
	pair<epoch_t,epoch_t>(begin, end);
    }

    // update lagginess statistics (copy-on-write into the pending inc)
    if (pending_inc.new_xinfo.count(from) == 0)
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    osd_xinfo_t& xi = pending_inc.new_xinfo[from];
    if (m->boot_epoch == 0) {
      // first boot of this daemon instance: decay the laggy stats
      xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      // rebooting after having been marked down: fold the down interval
      // into the exponentially-weighted laggy estimates
      if (xi.down_stamp.sec()) {
	int interval = ceph_clock_now().sec() -
	  xi.down_stamp.sec();
	if (g_conf()->mon_osd_laggy_max_interval &&
	    (interval > g_conf()->mon_osd_laggy_max_interval)) {
	  interval = g_conf()->mon_osd_laggy_max_interval;
	}
	xi.laggy_interval =
	  interval * g_conf()->mon_osd_laggy_weight +
	  xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
	g_conf()->mon_osd_laggy_weight +
	xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?
    if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
	 (oldstate & CEPH_OSD_AUTOOUT)) ||
	(g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
	(g_conf()->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
	// restore the weight it had before it was auto-marked out, if any
	if (xi.old_weight > 0) {
	  pending_inc.new_weight[from] = xi.old_weight;
	  xi.old_weight = 0;
	} else {
	  pending_inc.new_weight[from] = CEPH_OSD_IN;
	}
      } else {
	dout(7) << __func__ << " NOIN set, will not mark in "
		<< m->get_orig_source_addr() << dendl;
      }
    }

    // wait for the proposal to commit, then ack the boot
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
3644
3645void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
3646{
3647 op->mark_osdmon_event(__func__);
9f95a23c 3648 auto m = op->get_req<MOSDBoot>();
7c673cae
FG
3649 dout(7) << "_booted " << m->get_orig_source_inst()
3650 << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
3651
3652 if (logit) {
11fdf7f2
TL
3653 mon->clog->info() << m->get_source() << " " << m->get_orig_source_addrs()
3654 << " boot";
7c673cae
FG
3655 }
3656
3657 send_latest(op, m->sb.current_epoch+1);
3658}
3659
3660
3661// -------------
3662// full
3663
// Read-only screening of an MOSDFull (osd reporting its nearfull/
// backfillfull/full state).  Returns true when handled here (bad sender,
// stale instance, or state already as requested); false to continue to
// prepare_full().
bool OSDMonitor::preprocess_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  int from = m->get_orig_source().num();
  set<string> state;
  // only these per-osd fullness bits are controlled by MOSDFull
  unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;

  // check permissions, ignore if failed
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "MOSDFull from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  // ignore a full message from the osd instance that already went down
  if (!osdmap.exists(from)) {
    dout(7) << __func__ << " ignoring full message from nonexistent "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }
  // down osd matching its last known addrs, or up osd with mismatched
  // addrs: either way the message is from a stale incarnation
  if ((!osdmap.is_up(from) &&
       osdmap.get_most_recent_addrs(from).legacy_equals(
	 m->get_orig_source_addrs())) ||
      (osdmap.is_up(from) &&
       !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
    dout(7) << __func__ << " ignoring full message from down "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  OSDMap::calc_state_set(osdmap.get_state(from), state);

  // nothing to change?  just ack with the map version it asked about
  if ((osdmap.get_state(from) & mask) == m->state) {
    dout(7) << __func__ << " state already " << state << " for osd." << from
	    << " " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    goto ignore;
  }

  dout(10) << __func__ << " want state " << state << " for osd." << from
	   << " " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3714
// Update-phase handling of MOSDFull: fold the osd's desired fullness bits
// into pending_inc.new_state.  Note new_state entries are XOR masks
// against the committed map state, hence the bit gymnastics below.
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  const int from = m->get_orig_source().num();

  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask; // safety first

  // effective (committed XOR pending) fullness state, restricted to mask
  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    // clear any pending changes to the masked bits, then set an XOR that
    // turns the committed state into the wanted state
    if (p != pending_inc.new_state.end()) {
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " -> " << want_state_set << dendl;
  } else {
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  // ack with the requested map version once the proposal commits
  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
3752
3753// -------------
3754// alive
3755
3756bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
3757{
3758 op->mark_osdmon_event(__func__);
9f95a23c 3759 auto m = op->get_req<MOSDAlive>();
7c673cae
FG
3760 int from = m->get_orig_source().num();
3761
3762 // check permissions, ignore if failed
11fdf7f2 3763 MonSession *session = op->get_session();
7c673cae
FG
3764 if (!session)
3765 goto ignore;
3766 if (!session->is_capable("osd", MON_CAP_X)) {
3767 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
3768 << session->caps << dendl;
3769 goto ignore;
3770 }
3771
3772 if (!osdmap.is_up(from) ||
11fdf7f2
TL
3773 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
3774 dout(7) << "preprocess_alive ignoring alive message from down "
3775 << m->get_orig_source() << " " << m->get_orig_source_addrs()
3776 << dendl;
7c673cae
FG
3777 goto ignore;
3778 }
3779
3780 if (osdmap.get_up_thru(from) >= m->want) {
3781 // yup.
3782 dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
3783 _reply_map(op, m->version);
3784 return true;
3785 }
3786
3787 dout(10) << "preprocess_alive want up_thru " << m->want
3788 << " from " << m->get_orig_source_inst() << dendl;
3789 return false;
3790
3791 ignore:
3792 return true;
3793}
3794
3795bool OSDMonitor::prepare_alive(MonOpRequestRef op)
3796{
3797 op->mark_osdmon_event(__func__);
9f95a23c 3798 auto m = op->get_req<MOSDAlive>();
7c673cae
FG
3799 int from = m->get_orig_source().num();
3800
3801 if (0) { // we probably don't care much about these
3802 mon->clog->debug() << m->get_orig_source_inst() << " alive";
3803 }
3804
3805 dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
3806 << " from " << m->get_orig_source_inst() << dendl;
3807
3808 update_up_thru(from, m->version); // set to the latest map the OSD has
3809 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
3810 return true;
3811}
3812
3813void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
3814{
3815 op->mark_osdmon_event(__func__);
3816 dout(7) << "_reply_map " << e
3817 << " from " << op->get_req()->get_orig_source_inst()
3818 << dendl;
3819 send_latest(op, e);
3820}
3821
3822// pg_created
3823bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
3824{
3825 op->mark_osdmon_event(__func__);
9f95a23c 3826 auto m = op->get_req<MOSDPGCreated>();
7c673cae 3827 dout(10) << __func__ << " " << *m << dendl;
11fdf7f2 3828 auto session = op->get_session();
94b18763 3829 mon->no_reply(op);
7c673cae
FG
3830 if (!session) {
3831 dout(10) << __func__ << ": no monitor session!" << dendl;
3832 return true;
3833 }
3834 if (!session->is_capable("osd", MON_CAP_X)) {
3835 derr << __func__ << " received from entity "
3836 << "with insufficient privileges " << session->caps << dendl;
3837 return true;
3838 }
3839 // always forward the "created!" to the leader
3840 return false;
3841}
3842
3843bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
3844{
3845 op->mark_osdmon_event(__func__);
9f95a23c 3846 auto m = op->get_req<MOSDPGCreated>();
7c673cae
FG
3847 dout(10) << __func__ << " " << *m << dendl;
3848 auto src = m->get_orig_source();
3849 auto from = src.num();
3850 if (!src.is_osd() ||
3851 !mon->osdmon()->osdmap.is_up(from) ||
11fdf7f2
TL
3852 !mon->osdmon()->osdmap.get_addrs(from).legacy_equals(
3853 m->get_orig_source_addrs())) {
7c673cae
FG
3854 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
3855 return false;
3856 }
3857 pending_created_pgs.push_back(m->pgid);
3858 return true;
3859}
3860
11fdf7f2
TL
3861bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op)
3862{
3863 op->mark_osdmon_event(__func__);
9f95a23c 3864 auto m = op->get_req<MOSDPGReadyToMerge>();
11fdf7f2
TL
3865 dout(10) << __func__ << " " << *m << dendl;
3866 const pg_pool_t *pi;
3867 auto session = op->get_session();
3868 if (!session) {
3869 dout(10) << __func__ << ": no monitor session!" << dendl;
3870 goto ignore;
3871 }
3872 if (!session->is_capable("osd", MON_CAP_X)) {
3873 derr << __func__ << " received from entity "
3874 << "with insufficient privileges " << session->caps << dendl;
3875 goto ignore;
3876 }
3877 pi = osdmap.get_pg_pool(m->pgid.pool());
3878 if (!pi) {
3879 derr << __func__ << " pool for " << m->pgid << " dne" << dendl;
3880 goto ignore;
3881 }
3882 if (pi->get_pg_num() <= m->pgid.ps()) {
3883 dout(20) << " pg_num " << pi->get_pg_num() << " already < " << m->pgid << dendl;
3884 goto ignore;
3885 }
3886 if (pi->get_pg_num() != m->pgid.ps() + 1) {
3887 derr << " OSD trying to merge wrong pgid " << m->pgid << dendl;
3888 goto ignore;
3889 }
3890 if (pi->get_pg_num_pending() > m->pgid.ps()) {
3891 dout(20) << " pg_num_pending " << pi->get_pg_num_pending() << " > " << m->pgid << dendl;
3892 goto ignore;
3893 }
3894 return false;
3895
3896 ignore:
3897 mon->no_reply(op);
3898 return true;
3899}
3900
3901bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
3902{
3903 op->mark_osdmon_event(__func__);
9f95a23c 3904 auto m = op->get_req<MOSDPGReadyToMerge>();
11fdf7f2
TL
3905 dout(10) << __func__ << " " << *m << dendl;
3906 pg_pool_t p;
3907 if (pending_inc.new_pools.count(m->pgid.pool()))
3908 p = pending_inc.new_pools[m->pgid.pool()];
3909 else
3910 p = *osdmap.get_pg_pool(m->pgid.pool());
3911 if (p.get_pg_num() != m->pgid.ps() + 1 ||
3912 p.get_pg_num_pending() > m->pgid.ps()) {
3913 dout(10) << __func__
3914 << " race with concurrent pg_num[_pending] update, will retry"
3915 << dendl;
3916 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
3917 return true;
3918 }
3919
3920 if (m->ready) {
3921 p.dec_pg_num(m->pgid,
3922 pending_inc.epoch,
3923 m->source_version,
3924 m->target_version,
3925 m->last_epoch_started,
3926 m->last_epoch_clean);
3927 p.last_change = pending_inc.epoch;
3928 } else {
3929 // back off the merge attempt!
3930 p.set_pg_num_pending(p.get_pg_num());
3931 }
3932
3933 // force pre-nautilus clients to resend their ops, since they
3934 // don't understand pg_num_pending changes form a new interval
3935 p.last_force_op_resend_prenautilus = pending_inc.epoch;
3936
3937 pending_inc.new_pools[m->pgid.pool()] = p;
3938
3939 auto prob = g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability");
3940 if (m->ready &&
3941 prob > 0 &&
3942 prob > (double)(rand() % 1000)/1000.0) {
3943 derr << __func__ << " injecting pg merge pg_num bounce" << dendl;
3944 auto n = new MMonCommand(mon->monmap->get_fsid());
3945 n->set_connection(m->get_connection());
3946 n->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
3947 osdmap.get_pool_name(m->pgid.pool()) +
3948 "\", \"var\": \"pg_num_actual\", \"val\": \"" +
3949 stringify(m->pgid.ps() + 1) + "\"}" };
3950 MonOpRequestRef nop = mon->op_tracker.create_request<MonOpRequest>(n);
3951 nop->set_type_service();
3952 wait_for_finished_proposal(op, new C_RetryMessage(this, nop));
3953 } else {
3954 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
3955 }
3956 return true;
3957}
3958
3959
7c673cae
FG
3960// -------------
3961// pg_temp changes
3962
3963bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
3964{
9f95a23c 3965 auto m = op->get_req<MOSDPGTemp>();
7c673cae
FG
3966 dout(10) << "preprocess_pgtemp " << *m << dendl;
3967 mempool::osdmap::vector<int> empty;
3968 int from = m->get_orig_source().num();
3969 size_t ignore_cnt = 0;
3970
3971 // check caps
11fdf7f2 3972 MonSession *session = op->get_session();
7c673cae
FG
3973 if (!session)
3974 goto ignore;
3975 if (!session->is_capable("osd", MON_CAP_X)) {
3976 dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
3977 << session->caps << dendl;
3978 goto ignore;
3979 }
3980
3981 if (!osdmap.is_up(from) ||
11fdf7f2
TL
3982 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
3983 dout(7) << "ignoring pgtemp message from down "
3984 << m->get_orig_source() << " " << m->get_orig_source_addrs()
3985 << dendl;
7c673cae
FG
3986 goto ignore;
3987 }
3988
3efd9988
FG
3989 if (m->forced) {
3990 return false;
3991 }
3992
7c673cae
FG
3993 for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
3994 dout(20) << " " << p->first
31f18b77 3995 << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
7c673cae
FG
3996 << " -> " << p->second << dendl;
3997
3998 // does the pool exist?
3999 if (!osdmap.have_pg_pool(p->first.pool())) {
4000 /*
4001 * 1. If the osdmap does not have the pool, it means the pool has been
4002 * removed in-between the osd sending this message and us handling it.
4003 * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
4004 * not exist in the pending either, as the osds would not send a
4005 * message about a pool they know nothing about (yet).
4006 * 3. However, if the pool does exist in the pending, then it must be a
4007 * new pool, and not relevant to this message (see 1).
4008 */
4009 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4010 << ": pool has been removed" << dendl;
4011 ignore_cnt++;
4012 continue;
4013 }
4014
4015 int acting_primary = -1;
4016 osdmap.pg_to_up_acting_osds(
4017 p->first, nullptr, nullptr, nullptr, &acting_primary);
4018 if (acting_primary != from) {
4019 /* If the source isn't the primary based on the current osdmap, we know
4020 * that the interval changed and that we can discard this message.
4021 * Indeed, we must do so to avoid 16127 since we can't otherwise determine
4022 * which of two pg temp mappings on the same pg is more recent.
4023 */
4024 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4025 << ": primary has changed" << dendl;
4026 ignore_cnt++;
4027 continue;
4028 }
4029
4030 // removal?
4031 if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
4032 osdmap.primary_temp->count(p->first)))
4033 return false;
4034 // change?
4035 // NOTE: we assume that this will clear pg_primary, so consider
4036 // an existing pg_primary field to imply a change
4037 if (p->second.size() &&
4038 (osdmap.pg_temp->count(p->first) == 0 ||
11fdf7f2 4039 osdmap.pg_temp->get(p->first) != p->second ||
7c673cae
FG
4040 osdmap.primary_temp->count(p->first)))
4041 return false;
4042 }
4043
4044 // should we ignore all the pgs?
4045 if (ignore_cnt == m->pg_temp.size())
4046 goto ignore;
4047
4048 dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
4049 _reply_map(op, m->map_epoch);
4050 return true;
4051
4052 ignore:
4053 return true;
4054}
4055
4056void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
4057{
4058 epoch_t old_up_thru = osdmap.get_up_thru(from);
4059 auto ut = pending_inc.new_up_thru.find(from);
4060 if (ut != pending_inc.new_up_thru.end()) {
4061 old_up_thru = ut->second;
4062 }
4063 if (up_thru > old_up_thru) {
4064 // set up_thru too, so the osd doesn't have to ask again
4065 pending_inc.new_up_thru[from] = up_thru;
4066 }
4067}
4068
4069bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
4070{
4071 op->mark_osdmon_event(__func__);
9f95a23c 4072 auto m = op->get_req<MOSDPGTemp>();
7c673cae
FG
4073 int from = m->get_orig_source().num();
4074 dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
4075 for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
4076 uint64_t pool = p->first.pool();
4077 if (pending_inc.old_pools.count(pool)) {
4078 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4079 << ": pool pending removal" << dendl;
4080 continue;
4081 }
4082 if (!osdmap.have_pg_pool(pool)) {
4083 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4084 << ": pool has been removed" << dendl;
4085 continue;
4086 }
4087 pending_inc.new_pg_temp[p->first] =
4088 mempool::osdmap::vector<int>(p->second.begin(), p->second.end());
4089
4090 // unconditionally clear pg_primary (until this message can encode
4091 // a change for that, too.. at which point we need to also fix
4092 // preprocess_pg_temp)
4093 if (osdmap.primary_temp->count(p->first) ||
4094 pending_inc.new_primary_temp.count(p->first))
4095 pending_inc.new_primary_temp[p->first] = -1;
4096 }
4097
4098 // set up_thru too, so the osd doesn't have to ask again
4099 update_up_thru(from, m->map_epoch);
4100
4101 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
4102 return true;
4103}
4104
4105
4106// ---
4107
4108bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
4109{
4110 op->mark_osdmon_event(__func__);
9f95a23c 4111 auto m = op->get_req<MRemoveSnaps>();
7c673cae
FG
4112 dout(7) << "preprocess_remove_snaps " << *m << dendl;
4113
4114 // check privilege, ignore if failed
11fdf7f2 4115 MonSession *session = op->get_session();
f64942e4 4116 mon->no_reply(op);
7c673cae
FG
4117 if (!session)
4118 goto ignore;
4119 if (!session->caps.is_capable(
11fdf7f2 4120 cct,
7c673cae 4121 session->entity_name,
11fdf7f2
TL
4122 "osd", "osd pool rmsnap", {}, true, true, false,
4123 session->get_peer_socket_addr())) {
7c673cae
FG
4124 dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
4125 << session->caps << dendl;
4126 goto ignore;
4127 }
4128
4129 for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
4130 q != m->snaps.end();
4131 ++q) {
4132 if (!osdmap.have_pg_pool(q->first)) {
9f95a23c
TL
4133 dout(10) << " ignoring removed_snaps " << q->second
4134 << " on non-existent pool " << q->first << dendl;
7c673cae
FG
4135 continue;
4136 }
4137 const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
4138 for (vector<snapid_t>::iterator p = q->second.begin();
4139 p != q->second.end();
4140 ++p) {
4141 if (*p > pi->get_snap_seq() ||
9f95a23c 4142 !_is_removed_snap(q->first, *p)) {
7c673cae 4143 return false;
9f95a23c 4144 }
7c673cae
FG
4145 }
4146 }
4147
9f95a23c
TL
4148 if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
4149 auto reply = make_message<MRemoveSnaps>();
4150 reply->snaps = m->snaps;
4151 mon->send_reply(op, reply.detach());
4152 }
4153
7c673cae
FG
4154 ignore:
4155 return true;
4156}
4157
4158bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
4159{
4160 op->mark_osdmon_event(__func__);
9f95a23c 4161 auto m = op->get_req<MRemoveSnaps>();
7c673cae
FG
4162 dout(7) << "prepare_remove_snaps " << *m << dendl;
4163
9f95a23c
TL
4164 for (auto& [pool, snaps] : m->snaps) {
4165 if (!osdmap.have_pg_pool(pool)) {
4166 dout(10) << " ignoring removed_snaps " << snaps
4167 << " on non-existent pool " << pool << dendl;
7c673cae
FG
4168 continue;
4169 }
4170
9f95a23c
TL
4171 pg_pool_t& pi = osdmap.pools[pool];
4172 for (auto s : snaps) {
4173 if (!_is_removed_snap(pool, s) &&
4174 (!pending_inc.new_pools.count(pool) ||
4175 !pending_inc.new_pools[pool].removed_snaps.contains(s)) &&
4176 (!pending_inc.new_removed_snaps.count(pool) ||
4177 !pending_inc.new_removed_snaps[pool].contains(s))) {
4178 pg_pool_t *newpi = pending_inc.get_new_pool(pool, &pi);
4179 if (osdmap.require_osd_release < ceph_release_t::octopus) {
4180 newpi->removed_snaps.insert(s);
4181 dout(10) << " pool " << pool << " removed_snaps added " << s
4182 << " (now " << newpi->removed_snaps << ")" << dendl;
4183 }
11fdf7f2 4184 newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
9f95a23c
TL
4185 if (s > newpi->get_snap_seq()) {
4186 dout(10) << " pool " << pool << " snap_seq "
4187 << newpi->get_snap_seq() << " -> " << s << dendl;
4188 newpi->set_snap_seq(s);
7c673cae
FG
4189 }
4190 newpi->set_snap_epoch(pending_inc.epoch);
9f95a23c
TL
4191 dout(10) << " added pool " << pool << " snap " << s
4192 << " to removed_snaps queue" << dendl;
4193 pending_inc.new_removed_snaps[pool].insert(s);
7c673cae
FG
4194 }
4195 }
4196 }
9f95a23c
TL
4197
4198 if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
4199 auto reply = make_message<MRemoveSnaps>();
4200 reply->snaps = m->snaps;
4201 wait_for_finished_proposal(op, new C_ReplyOp(this, op, reply));
4202 }
4203
4204 return true;
4205}
4206
4207bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op)
4208{
4209 op->mark_osdmon_event(__func__);
4210 auto m = op->get_req<MMonGetPurgedSnaps>();
4211 dout(7) << __func__ << " " << *m << dendl;
4212
4213 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> r;
4214
4215 string k = make_purged_snap_epoch_key(m->start);
4216 auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
4217 it->upper_bound(k);
4218 unsigned long epoch = m->last;
4219 while (it->valid()) {
4220 if (it->key().find("purged_epoch_") != 0) {
4221 break;
4222 }
4223 string k = it->key();
4224 int n = sscanf(k.c_str(), "purged_epoch_%lx", &epoch);
4225 if (n != 1) {
4226 derr << __func__ << " unable to parse key '" << it->key() << "'" << dendl;
4227 } else if (epoch > m->last) {
4228 break;
4229 } else {
4230 bufferlist bl = it->value();
4231 auto p = bl.cbegin();
4232 auto &v = r[epoch];
4233 try {
4234 ceph::decode(v, p);
4235 } catch (buffer::error& e) {
4236 derr << __func__ << " unable to parse value for key '" << it->key()
4237 << "': \n";
4238 bl.hexdump(*_dout);
4239 *_dout << dendl;
4240 }
4241 n += 4 + v.size() * 16;
4242 }
4243 if (n > 1048576) {
4244 // impose a semi-arbitrary limit to message size
4245 break;
4246 }
4247 it->next();
4248 }
4249
4250 auto reply = make_message<MMonGetPurgedSnapsReply>(m->start, epoch);
4251 reply->purged_snaps.swap(r);
4252 mon->send_reply(op, reply.detach());
4253
7c673cae
FG
4254 return true;
4255}
4256
4257// osd beacon
4258bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
4259{
4260 op->mark_osdmon_event(__func__);
7c673cae 4261 // check caps
11fdf7f2 4262 auto session = op->get_session();
94b18763 4263 mon->no_reply(op);
7c673cae
FG
4264 if (!session) {
4265 dout(10) << __func__ << " no monitor session!" << dendl;
4266 return true;
4267 }
4268 if (!session->is_capable("osd", MON_CAP_X)) {
4269 derr << __func__ << " received from entity "
4270 << "with insufficient privileges " << session->caps << dendl;
4271 return true;
4272 }
4273 // Always forward the beacon to the leader, even if they are the same as
4274 // the old one. The leader will mark as down osds that haven't sent
4275 // beacon for a few minutes.
4276 return false;
4277}
4278
4279bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
4280{
4281 op->mark_osdmon_event(__func__);
9f95a23c 4282 const auto beacon = op->get_req<MOSDBeacon>();
7c673cae
FG
4283 const auto src = beacon->get_orig_source();
4284 dout(10) << __func__ << " " << *beacon
4285 << " from " << src << dendl;
4286 int from = src.num();
4287
4288 if (!src.is_osd() ||
4289 !osdmap.is_up(from) ||
11fdf7f2
TL
4290 !osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs())) {
4291 if (src.is_osd() && !osdmap.is_up(from)) {
4292 // share some new maps with this guy in case it may not be
4293 // aware of its own deadness...
4294 send_latest(op, beacon->version+1);
4295 }
4296 dout(1) << " ignoring beacon from non-active osd." << from << dendl;
7c673cae
FG
4297 return false;
4298 }
4299
4300 last_osd_report[from] = ceph_clock_now();
4301 osd_epochs[from] = beacon->version;
4302
4303 for (const auto& pg : beacon->pgs) {
4304 last_epoch_clean.report(pg, beacon->min_last_epoch_clean);
4305 }
9f95a23c
TL
4306
4307 if (osdmap.osd_xinfo[from].last_purged_snaps_scrub <
4308 beacon->last_purged_snaps_scrub) {
4309 if (pending_inc.new_xinfo.count(from) == 0) {
4310 pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
4311 }
4312 pending_inc.new_xinfo[from].last_purged_snaps_scrub =
4313 beacon->last_purged_snaps_scrub;
4314 return true;
4315 } else {
4316 return false;
4317 }
7c673cae
FG
4318}
4319
4320// ---------------
4321// map helpers
4322
4323void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
4324{
4325 op->mark_osdmon_event(__func__);
4326 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
4327 << " start " << start << dendl;
4328 if (start == 0)
4329 send_full(op);
4330 else
4331 send_incremental(op, start);
4332}
4333
4334
28e407b8 4335MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
7c673cae 4336{
28e407b8
AA
4337 MOSDMap *r = new MOSDMap(mon->monmap->fsid, features);
4338 get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
7c673cae
FG
4339 r->oldest_map = get_first_committed();
4340 r->newest_map = osdmap.get_epoch();
4341 return r;
4342}
4343
28e407b8 4344MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
7c673cae 4345{
11fdf7f2
TL
4346 dout(10) << "build_incremental [" << from << ".." << to << "] with features "
4347 << std::hex << features << std::dec << dendl;
28e407b8 4348 MOSDMap *m = new MOSDMap(mon->monmap->fsid, features);
7c673cae
FG
4349 m->oldest_map = get_first_committed();
4350 m->newest_map = osdmap.get_epoch();
4351
4352 for (epoch_t e = to; e >= from && e > 0; e--) {
4353 bufferlist bl;
28e407b8 4354 int err = get_version(e, features, bl);
7c673cae 4355 if (err == 0) {
11fdf7f2 4356 ceph_assert(bl.length());
7c673cae
FG
4357 // if (get_version(e, bl) > 0) {
4358 dout(20) << "build_incremental inc " << e << " "
4359 << bl.length() << " bytes" << dendl;
4360 m->incremental_maps[e] = bl;
4361 } else {
11fdf7f2
TL
4362 ceph_assert(err == -ENOENT);
4363 ceph_assert(!bl.length());
28e407b8 4364 get_version_full(e, features, bl);
7c673cae
FG
4365 if (bl.length() > 0) {
4366 //else if (get_version("full", e, bl) > 0) {
4367 dout(20) << "build_incremental full " << e << " "
4368 << bl.length() << " bytes" << dendl;
4369 m->maps[e] = bl;
4370 } else {
4371 ceph_abort(); // we should have all maps.
4372 }
4373 }
4374 }
4375 return m;
4376}
4377
4378void OSDMonitor::send_full(MonOpRequestRef op)
4379{
4380 op->mark_osdmon_event(__func__);
4381 dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
28e407b8 4382 mon->send_reply(op, build_latest_full(op->get_session()->con_features));
7c673cae
FG
4383}
4384
4385void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
4386{
4387 op->mark_osdmon_event(__func__);
4388
4389 MonSession *s = op->get_session();
11fdf7f2 4390 ceph_assert(s);
7c673cae 4391
11fdf7f2 4392 if (s->proxy_con) {
7c673cae
FG
4393 // oh, we can tell the other mon to do it
4394 dout(10) << __func__ << " asking proxying mon to send_incremental from "
4395 << first << dendl;
4396 MRoute *r = new MRoute(s->proxy_tid, NULL);
4397 r->send_osdmap_first = first;
4398 s->proxy_con->send_message(r);
4399 op->mark_event("reply: send routed send_osdmap_first reply");
4400 } else {
4401 // do it ourselves
4402 send_incremental(first, s, false, op);
4403 }
4404}
4405
4406void OSDMonitor::send_incremental(epoch_t first,
4407 MonSession *session,
4408 bool onetime,
4409 MonOpRequestRef req)
4410{
4411 dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
11fdf7f2 4412 << " to " << session->name << dendl;
7c673cae 4413
28e407b8
AA
4414 // get feature of the peer
4415 // use quorum_con_features, if it's an anonymous connection.
4416 uint64_t features = session->con_features ? session->con_features :
4417 mon->get_quorum_con_features();
4418
7c673cae 4419 if (first <= session->osd_epoch) {
11fdf7f2 4420 dout(10) << __func__ << " " << session->name << " should already have epoch "
7c673cae
FG
4421 << session->osd_epoch << dendl;
4422 first = session->osd_epoch + 1;
4423 }
4424
4425 if (first < get_first_committed()) {
11fdf7f2
TL
4426 MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
4427 m->oldest_map = get_first_committed();
4428 m->newest_map = osdmap.get_epoch();
4429
7c673cae
FG
4430 first = get_first_committed();
4431 bufferlist bl;
28e407b8 4432 int err = get_version_full(first, features, bl);
11fdf7f2
TL
4433 ceph_assert(err == 0);
4434 ceph_assert(bl.length());
7c673cae
FG
4435 dout(20) << "send_incremental starting with base full "
4436 << first << " " << bl.length() << " bytes" << dendl;
7c673cae
FG
4437 m->maps[first] = bl;
4438
4439 if (req) {
4440 mon->send_reply(req, m);
4441 session->osd_epoch = first;
4442 return;
4443 } else {
4444 session->con->send_message(m);
4445 session->osd_epoch = first;
4446 }
4447 first++;
4448 }
4449
4450 while (first <= osdmap.get_epoch()) {
11fdf7f2 4451 epoch_t last = std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1,
28e407b8
AA
4452 osdmap.get_epoch());
4453 MOSDMap *m = build_incremental(first, last, features);
7c673cae
FG
4454
4455 if (req) {
4456 // send some maps. it may not be all of them, but it will get them
4457 // started.
4458 mon->send_reply(req, m);
4459 } else {
4460 session->con->send_message(m);
4461 first = last + 1;
4462 }
4463 session->osd_epoch = last;
4464 if (onetime || req)
4465 break;
4466 }
4467}
4468
4469int OSDMonitor::get_version(version_t ver, bufferlist& bl)
4470{
28e407b8
AA
4471 return get_version(ver, mon->get_quorum_con_features(), bl);
4472}
4473
4474void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
4475{
4476 OSDMap::Incremental inc;
11fdf7f2 4477 auto q = bl.cbegin();
28e407b8
AA
4478 inc.decode(q);
4479 // always encode with subset of osdmap's canonical features
4480 uint64_t f = features & inc.encode_features;
4481 dout(20) << __func__ << " " << inc.epoch << " with features " << f
4482 << dendl;
4483 bl.clear();
4484 if (inc.fullmap.length()) {
4485 // embedded full map?
4486 OSDMap m;
4487 m.decode(inc.fullmap);
4488 inc.fullmap.clear();
4489 m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
4490 }
4491 if (inc.crush.length()) {
4492 // embedded crush map
4493 CrushWrapper c;
11fdf7f2 4494 auto p = inc.crush.cbegin();
28e407b8
AA
4495 c.decode(p);
4496 inc.crush.clear();
4497 c.encode(inc.crush, f);
4498 }
4499 inc.encode(bl, f | CEPH_FEATURE_RESERVED);
4500}
4501
4502void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
4503{
4504 OSDMap m;
11fdf7f2 4505 auto q = bl.cbegin();
28e407b8
AA
4506 m.decode(q);
4507 // always encode with subset of osdmap's canonical features
4508 uint64_t f = features & m.get_encoding_features();
4509 dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
4510 << dendl;
4511 bl.clear();
4512 m.encode(bl, f | CEPH_FEATURE_RESERVED);
4513}
4514
4515int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
4516{
4517 uint64_t significant_features = OSDMap::get_significant_features(features);
4518 if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
4519 return 0;
4520 }
4521 int ret = PaxosService::get_version(ver, bl);
4522 if (ret < 0) {
7c673cae 4523 return ret;
28e407b8
AA
4524 }
4525 // NOTE: this check is imprecise; the OSDMap encoding features may
4526 // be a subset of the latest mon quorum features, but worst case we
4527 // reencode once and then cache the (identical) result under both
4528 // feature masks.
4529 if (significant_features !=
4530 OSDMap::get_significant_features(mon->get_quorum_con_features())) {
4531 reencode_incremental_map(bl, features);
4532 }
eafe8130 4533 inc_osd_cache.add_bytes({ver, significant_features}, bl);
28e407b8 4534 return 0;
7c673cae
FG
4535}
4536
11fdf7f2
TL
4537int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc)
4538{
4539 bufferlist inc_bl;
4540 int err = get_version(ver, inc_bl);
4541 ceph_assert(err == 0);
4542 ceph_assert(inc_bl.length());
4543
4544 auto p = inc_bl.cbegin();
4545 inc.decode(p);
4546 dout(10) << __func__ << " "
4547 << " epoch " << inc.epoch
4548 << " inc_crc " << inc.inc_crc
4549 << " full_crc " << inc.full_crc
4550 << " encode_features " << inc.encode_features << dendl;
4551 return 0;
4552}
4553
4554int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl)
4555{
4556 dout(10) << __func__ << " ver " << ver << dendl;
4557
4558 version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver);
4559 if (closest_pinned == 0) {
4560 return -ENOENT;
4561 }
4562 if (closest_pinned > ver) {
4563 dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl;
4564 }
4565 ceph_assert(closest_pinned <= ver);
4566
4567 dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl;
4568
4569 // get osdmap incremental maps and apply on top of this one.
4570 bufferlist osdm_bl;
4571 bool has_cached_osdmap = false;
4572 for (version_t v = ver-1; v >= closest_pinned; --v) {
4573 if (full_osd_cache.lookup({v, mon->get_quorum_con_features()},
4574 &osdm_bl)) {
4575 dout(10) << __func__ << " found map in cache ver " << v << dendl;
4576 closest_pinned = v;
4577 has_cached_osdmap = true;
4578 break;
4579 }
4580 }
4581
4582 if (!has_cached_osdmap) {
4583 int err = PaxosService::get_version_full(closest_pinned, osdm_bl);
4584 if (err != 0) {
4585 derr << __func__ << " closest pinned map ver " << closest_pinned
4586 << " not available! error: " << cpp_strerror(err) << dendl;
4587 }
4588 ceph_assert(err == 0);
4589 }
4590
4591 ceph_assert(osdm_bl.length());
4592
4593 OSDMap osdm;
4594 osdm.decode(osdm_bl);
4595
4596 dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned
4597 << " e" << osdm.epoch
4598 << " crc " << osdm.get_crc()
4599 << " -- applying incremental maps." << dendl;
4600
4601 uint64_t encode_features = 0;
4602 for (version_t v = closest_pinned + 1; v <= ver; ++v) {
4603 dout(20) << __func__ << " applying inc epoch " << v << dendl;
4604
4605 OSDMap::Incremental inc;
4606 int err = get_inc(v, inc);
4607 ceph_assert(err == 0);
4608
4609 encode_features = inc.encode_features;
4610
4611 err = osdm.apply_incremental(inc);
4612 ceph_assert(err == 0);
4613
4614 // this block performs paranoid checks on map retrieval
4615 if (g_conf().get_val<bool>("mon_debug_extra_checks") &&
4616 inc.full_crc != 0) {
4617
4618 uint64_t f = encode_features;
4619 if (!f) {
4620 f = (mon->quorum_con_features ? mon->quorum_con_features : -1);
4621 }
4622
4623 // encode osdmap to force calculating crcs
4624 bufferlist tbl;
4625 osdm.encode(tbl, f | CEPH_FEATURE_RESERVED);
4626 // decode osdmap to compare crcs with what's expected by incremental
4627 OSDMap tosdm;
4628 tosdm.decode(tbl);
4629
4630 if (tosdm.get_crc() != inc.full_crc) {
4631 derr << __func__
4632 << " osdmap crc mismatch! (osdmap crc " << tosdm.get_crc()
4633 << ", expected " << inc.full_crc << ")" << dendl;
4634 ceph_abort_msg("osdmap crc mismatch");
4635 }
4636 }
4637
4638 // note: we cannot add the recently computed map to the cache, as is,
4639 // because we have not encoded the map into a bl.
4640 }
4641
4642 if (!encode_features) {
4643 dout(10) << __func__
4644 << " last incremental map didn't have features;"
4645 << " defaulting to quorum's or all" << dendl;
4646 encode_features =
4647 (mon->quorum_con_features ? mon->quorum_con_features : -1);
4648 }
4649 osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED);
4650
4651 return 0;
4652}
4653
7c673cae
FG
4654int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
4655{
28e407b8
AA
4656 return get_version_full(ver, mon->get_quorum_con_features(), bl);
4657}
4658
// Fetch the full OSDMap at epoch `ver`, encoded for the given feature set.
//
// Lookup order:
//   1. the in-memory full-map cache (keyed by epoch + significant features),
//   2. the store via PaxosService,
//   3. reconstruction from a pinned map plus incrementals (trimmed history).
// On success the (possibly re-encoded) bl is added to the cache.
//
// Returns 0 on success, or a negative error (e.g. -ENOENT) if the map
// cannot be found or rebuilt.
int OSDMonitor::get_version_full(version_t ver, uint64_t features,
				 bufferlist& bl)
{
  uint64_t significant_features = OSDMap::get_significant_features(features);
  if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
    return 0;
  }
  int ret = PaxosService::get_version_full(ver, bl);
  if (ret == -ENOENT) {
    // not in the store (likely trimmed); try rebuilding it from the
    // osdmap manifest's pinned maps + incrementals.
    ret = get_full_from_pinned_map(ver, bl);
  }
  if (ret < 0) {
    return ret;
  }
  // NOTE: this check is imprecise; the OSDMap encoding features may
  // be a subset of the latest mon quorum features, but worst case we
  // reencode once and then cache the (identical) result under both
  // feature masks.
  if (significant_features !=
      OSDMap::get_significant_features(mon->get_quorum_con_features())) {
    reencode_full_map(bl, features);
  }
  full_osd_cache.add_bytes({ver, significant_features}, bl);
  return 0;
}
4685
11fdf7f2
TL
4686epoch_t OSDMonitor::blacklist(const entity_addrvec_t& av, utime_t until)
4687{
4688 dout(10) << "blacklist " << av << " until " << until << dendl;
4689 for (auto a : av.v) {
9f95a23c 4690 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
11fdf7f2
TL
4691 a.set_type(entity_addr_t::TYPE_ANY);
4692 } else {
4693 a.set_type(entity_addr_t::TYPE_LEGACY);
4694 }
4695 pending_inc.new_blacklist[a] = until;
4696 }
4697 return pending_inc.epoch;
4698}
4699
4700epoch_t OSDMonitor::blacklist(entity_addr_t a, utime_t until)
7c673cae 4701{
9f95a23c 4702 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
11fdf7f2
TL
4703 a.set_type(entity_addr_t::TYPE_ANY);
4704 } else {
4705 a.set_type(entity_addr_t::TYPE_LEGACY);
4706 }
7c673cae
FG
4707 dout(10) << "blacklist " << a << " until " << until << dendl;
4708 pending_inc.new_blacklist[a] = until;
4709 return pending_inc.epoch;
4710}
4711
4712
// Walk all "osdmap" subscriptions and push any maps subscribers are owed.
// No-op until we have a committed osdmap (epoch 0 means none yet).
void OSDMonitor::check_osdmap_subs()
{
  dout(10) << __func__ << dendl;
  if (!osdmap.get_epoch()) {
    return;
  }
  auto osdmap_subs = mon->session_map.subs.find("osdmap");
  if (osdmap_subs == mon->session_map.subs.end()) {
    return;
  }
  auto p = osdmap_subs->second->begin();
  while (!p.end()) {
    auto sub = *p;
    // advance before dispatching: check_osdmap_sub() may remove a onetime
    // sub from the session map, which would invalidate the current position.
    ++p;
    check_osdmap_sub(sub);
  }
}
4730
// Service a single "osdmap" subscription: if the subscriber is behind the
// committed map, send it incrementals from sub->next (or the latest full map
// when next == 0, i.e. the subscriber has no map at all). Onetime subs are
// removed after being served; ongoing subs are advanced past the current
// epoch.
void OSDMonitor::check_osdmap_sub(Subscription *sub)
{
  dout(10) << __func__ << " " << sub << " next " << sub->next
	   << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
  if (sub->next <= osdmap.get_epoch()) {
    if (sub->next >= 1)
      send_incremental(sub->next, sub->session, sub->incremental_onetime);
    else
      // next == 0: subscriber wants the full latest map.
      sub->session->con->send_message(build_latest_full(sub->session->con_features));
    if (sub->onetime)
      mon->session_map.remove_sub(sub);
    else
      sub->next = osdmap.get_epoch() + 1;
  }
}
4746
4747void OSDMonitor::check_pg_creates_subs()
4748{
7c673cae
FG
4749 if (!osdmap.get_num_up_osds()) {
4750 return;
4751 }
11fdf7f2 4752 ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
7c673cae
FG
4753 mon->with_session_map([this](const MonSessionMap& session_map) {
4754 auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
4755 if (pg_creates_subs == session_map.subs.end()) {
4756 return;
4757 }
4758 for (auto sub : *pg_creates_subs->second) {
4759 check_pg_creates_sub(sub);
4760 }
4761 });
4762}
4763
// Service one "osd_pg_creates" subscription: if the subscriber is an OSD
// that is currently up, send it any PG creates it is owed and record the
// next epoch it needs in sub->next.
void OSDMonitor::check_pg_creates_sub(Subscription *sub)
{
  dout(20) << __func__ << " .. " << sub->session->name << dendl;
  ceph_assert(sub->type == "osd_pg_creates");
  // only send these if the OSD is up. we will check_subs() when they do
  // come up so they will get the creates then.
  if (sub->session->name.is_osd() &&
      mon->osdmon()->osdmap.is_up(sub->session->name.num())) {
    sub->next = send_pg_creates(sub->session->name.num(),
				sub->session->con.get(),
				sub->next);
  }
}
4777
// Stage an application tag (and optionally one key/value of app metadata)
// on a pool, writing the result into pending_inc.new_pools.
//
// pool_id   - pool to modify (must exist in the committed osdmap)
// app_name  - application to enable on the pool
// app_key   - metadata key; empty means "just enable the application"
// app_value - metadata value for app_key
// force     - overwrite an existing value for app_key; without force,
//             map::insert() leaves any existing entry untouched
//
// Caller must hold paxos plugged and the service writeable (asserted).
void OSDMonitor::do_application_enable(int64_t pool_id,
                                       const std::string &app_name,
				       const std::string &app_key,
				       const std::string &app_value,
				       bool force)
{
  ceph_assert(paxos->is_plugged() && is_writeable());

  dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
           << dendl;

  // application metadata only exists from luminous on
  ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);

  auto pp = osdmap.get_pg_pool(pool_id);
  ceph_assert(pp != nullptr);

  // start from the pending copy if the pool is already being modified in
  // this proposal, otherwise from the committed pool.
  pg_pool_t p = *pp;
  if (pending_inc.new_pools.count(pool_id)) {
    p = pending_inc.new_pools[pool_id];
  }

  if (app_key.empty()) {
    // enable the application with no metadata; insert() is a no-op if the
    // application is already enabled.
    p.application_metadata.insert({app_name, {}});
  } else {
    if (force) {
      // overwrite any existing value
      p.application_metadata[app_name][app_key] = app_value;
    } else {
      // insert() preserves an existing app/key entry
      p.application_metadata.insert({app_name, {{app_key, app_value}}});
    }
  }
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool_id] = p;
}
4811
494da23a
TL
4812void OSDMonitor::do_set_pool_opt(int64_t pool_id,
4813 pool_opts_t::key_t opt,
4814 pool_opts_t::value_t val)
4815{
4816 auto p = pending_inc.new_pools.try_emplace(
4817 pool_id, *osdmap.get_pg_pool(pool_id));
4818 p.first->second.opts.set(opt, val);
4819}
4820
// Scan `pools` for pools whose PGs still need to be created and queue them
// on `creating_pgs`. Returns the number of pools queued.
//
// A pool is skipped when:
//   - its creates were already completed (created_pools),
//   - its CRUSH rule is missing/invalid (nothing could be mapped),
//   - it has not changed since the last scan epoch,
//   - it is in the process of being removed.
unsigned OSDMonitor::scan_for_creating_pgs(
  const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
  const mempool::osdmap::set<int64_t>& removed_pools,
  utime_t modified,
  creating_pgs_t* creating_pgs) const
{
  unsigned queued = 0;
  for (auto& p : pools) {
    int64_t poolid = p.first;
    if (creating_pgs->created_pools.count(poolid)) {
      dout(10) << __func__ << " already created " << poolid << dendl;
      continue;
    }
    const pg_pool_t& pool = p.second;
    int ruleno = osdmap.crush->find_rule(pool.get_crush_rule(),
					 pool.get_type(), pool.get_size());
    if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
      continue;

    const auto last_scan_epoch = creating_pgs->last_scan_epoch;
    const auto created = pool.get_last_change();
    if (last_scan_epoch && created <= last_scan_epoch) {
      dout(10) << __func__ << " no change in pool " << poolid
	       << " " << pool << dendl;
      continue;
    }
    if (removed_pools.count(poolid)) {
      dout(10) << __func__ << " pool is being removed: " << poolid
	       << " " << pool << dendl;
      continue;
    }
    dout(10) << __func__ << " queueing pool create for " << poolid
	     << " " << pool << dendl;
    creating_pgs->create_pool(poolid, pool.get_pg_num(),
			      created, modified);
    queued++;
  }
  return queued;
}
4860
// Rebuild creating_pgs_by_osd_epoch: for every PG still being created,
// determine which OSD (acting primary) should receive its create message
// and at which epoch, preserving the previously-advertised epoch when the
// target OSD has not changed so we do not resend needlessly.
void OSDMonitor::update_creating_pgs()
{
  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
	   << creating_pgs.queue.size() << " pools in queue" << dendl;
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (const auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    if (!osdmap.pg_exists(pgid)) {
      dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
	       << dendl;
      continue;
    }
    // default epoch: the epoch the PG was created at
    auto mapped = pg.second.create_epoch;
    dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
    spg_t spgid(pgid);
    mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
	if (pgs.second.count(spgid)) {
	  if (last_acting_primary == acting_primary) {
	    // same target as before: keep the previously-advertised epoch
	    mapped = pgs.first;
	  } else {
	    dout(20) << __func__ << " " << pgid << " "
		     << " acting_primary:" << last_acting_primary
		     << " -> " << acting_primary << dendl;
	    // note epoch if the target of the create message changed.
	    mapped = mapping.get_epoch();
	  }
	  break;
	} else {
	  // newly creating
	  mapped = mapping.get_epoch();
	}
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
	     << " to create " << pgid << "@" << mapped << dendl;
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid);
  }
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
4908
// Send PG-create messages owed to osd.`osd` over `con`, starting from epoch
// `next`. Returns the next epoch the subscriber needs (last sent + 1), or
// `next` unchanged if nothing was sent.
//
// Pre-nautilus OSDs get the legacy MOSDPGCreate; newer OSDs get
// MOSDPGCreate2 (which can also carry history/past_intervals for
// split/merge-aware creates).
epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
{
  dout(30) << __func__ << " osd." << osd << " next=" << next
	   << " " << creating_pgs_by_osd_epoch << dendl;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
    dout(20) << __func__
	     << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
    // the subscribers will be updated when the mapping is completed anyway
    return next;
  }
  auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
  if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
    return next;
  ceph_assert(!creating_pgs_by_epoch->second.empty());

  MOSDPGCreate *oldm = nullptr; // for pre-mimic OSD compat
  MOSDPGCreate2 *m = nullptr;

  bool old = osdmap.require_osd_release < ceph_release_t::nautilus;

  epoch_t last = 0;
  // walk epochs >= next in ascending order, batching all pgs into one message
  for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
       epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
    auto epoch = epoch_pgs->first;
    auto& pgs = epoch_pgs->second;
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
    last = epoch;
    for (auto& pg : pgs) {
      // Need the create time from the monitor using its clock to set
      // last_scrub_stamp upon pg creation.
      auto create = creating_pgs.pgs.find(pg.pgid);
      ceph_assert(create != creating_pgs.pgs.end());
      if (old) {
	// legacy message, lazily allocated on first use
	if (!oldm) {
	  oldm = new MOSDPGCreate(creating_pgs_epoch);
	}
	oldm->mkpg.emplace(pg.pgid,
			   pg_create_t{create->second.create_epoch, pg.pgid, 0});
	oldm->ctimes.emplace(pg.pgid, create->second.create_stamp);
      } else {
	if (!m) {
	  m = new MOSDPGCreate2(creating_pgs_epoch);
	}
	m->pgs.emplace(pg, make_pair(create->second.create_epoch,
				     create->second.create_stamp));
	if (create->second.history.epoch_created) {
	  dout(20) << __func__ << " " << pg << " " << create->second.history
		   << " " << create->second.past_intervals << dendl;
	  m->pg_extra.emplace(pg, make_pair(create->second.history,
					    create->second.past_intervals));
	}
      }
      dout(20) << __func__ << " will create " << pg
	       << " at " << create->second.create_epoch << dendl;
    }
  }
  // at most one of m/oldm was populated; send it (ownership passes to con)
  if (m) {
    con->send_message(m);
  } else if (oldm) {
    con->send_message(oldm);
  } else {
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " has nothing to send" << dendl;
    return next;
  }

  // sub is current through last + 1
  return last + 1;
}
4980
4981// TICK
4982
4983
// Periodic housekeeping for the OSD monitor.
//
// Runs on every monitor (leader and peons): refreshes the osdmap manifest
// and, when tcmalloc autotuning is enabled, rebalances the priority cache.
// The remainder runs only on the leader: marking unresponsive OSDs down,
// auto-marking long-down (or destroyed) OSDs out, expiring blacklist
// entries, pruning purged snaps, and updating pool status — proposing a new
// map epoch if any of that produced pending changes.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // always update osdmap manifest, regardless of being the leader.
  load_osdmap_manifest();

  // always tune priority cache manager memory on leader and peons
  if (ceph_using_tcmalloc() && mon_memory_autotune) {
    std::lock_guard l(balancer_lock);
    if (pcm != nullptr) {
      pcm->tune_memory();
      pcm->balance();
      _set_new_cache_sizes();
      dout(10) << "tick balancer "
	       << " inc cache_bytes: " << inc_cache->get_cache_bytes()
	       << " inc comtd_bytes: " << inc_cache->get_committed_size()
	       << " inc used_bytes: " << inc_cache->_get_used_bytes()
	       << " inc num_osdmaps: " << inc_cache->_get_num_osdmaps()
	       << dendl;
      dout(10) << "tick balancer "
	       << " full cache_bytes: " << full_cache->get_cache_bytes()
	       << " full comtd_bytes: " << full_cache->get_committed_size()
	       << " full used_bytes: " << full_cache->_get_used_bytes()
	       << " full num_osdmaps: " << full_cache->_get_num_osdmaps()
	       << dendl;
    }
  }

  // everything below mutates pending_inc; leader only.
  if (!mon->is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  // mark OSDs with no recent beacon down
  if (handle_osd_timeouts(now, last_osd_report)) {
    do_propose = true;
  }

  // mark osds down?
  if (check_failures(now)) {
    do_propose = true;
  }

  // Force a proposal if we need to prune; pruning is performed on
  // ``encode_pending()``, hence why we need to regularly trigger a proposal
  // even if there's nothing going on.
  if (is_prune_enabled() && should_prune()) {
    do_propose = true;
  }

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    string down_out_subtree_limit = g_conf().get_val<string>(
      "mon_osd_down_out_subtree_limit");
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      utime_t down = now;
      down -= i->second;
      // advance now: the entry for o may be erased below
      ++i;

      if (osdmap.is_down(o) &&
	  osdmap.is_in(o) &&
	  can_mark_out(o)) {
	utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
	utime_t grace = orig_grace;
	double my_grace = 0.0;

	if (g_conf()->mon_osd_adjust_down_out_interval) {
	  // scale grace period the same way we do the heartbeat grace.
	  const osd_xinfo_t& xi = osdmap.get_xinfo(o);
	  double halflife = (double)g_conf()->mon_osd_laggy_halflife;
	  double decay_k = ::log(.5) / halflife;
	  double decay = exp((double)down * decay_k);
	  dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
		   << " down for " << down << " decay " << decay << dendl;
	  my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
	  grace += my_grace;
	}

	// is this an entire large subtree down?
	if (down_out_subtree_limit.length()) {
	  int type = osdmap.crush->get_type_id(down_out_subtree_limit);
	  if (type > 0) {
	    if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
	      dout(10) << "tick entire containing " << down_out_subtree_limit
		       << " subtree for osd." << o
		       << " is down; resetting timer" << dendl;
	      // reset timer, too.
	      down_pending_out[o] = now;
	      continue;
	    }
	  }
	}

	bool down_out = !osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace;
	bool destroyed_out = osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_destroyed_out_interval > 0 &&
	  // this is not precise enough as we did not make a note when this osd
	  // was marked as destroyed, but let's not bother with that
	  // complexity for now.
	  down.sec() >= g_conf()->mon_osd_destroyed_out_interval;
	if (down_out || destroyed_out) {
	  dout(10) << "tick marking osd." << o << " OUT after " << down
		   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
	  pending_inc.new_weight[o] = CEPH_OSD_OUT;

	  // set the AUTOOUT bit.
	  if (pending_inc.new_state.count(o) == 0)
	    pending_inc.new_state[o] = 0;
	  pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

	  // remember previous weight
	  if (pending_inc.new_xinfo.count(o) == 0)
	    pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

	  do_propose = true;

	  mon->clog->info() << "Marking osd." << o << " out (has been down for "
			    << int(down.sec()) << " seconds)";
	} else
	  continue;  // still within grace; keep the pending-out timer
      }

      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blacklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
       p != osdmap.blacklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blacklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blacklist.push_back(p->first);
      do_propose = true;
    }
  }

  if (try_prune_purged_snaps()) {
    do_propose = true;
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty()) // also propose if we adjusted pg_temp
    propose_pending();
}
5148
5149void OSDMonitor::_set_new_cache_sizes()
5150{
5151 uint64_t cache_size = 0;
5152 int64_t inc_alloc = 0;
5153 int64_t full_alloc = 0;
5154 int64_t kv_alloc = 0;
5155
5156 if (pcm != nullptr && rocksdb_binned_kv_cache != nullptr) {
5157 cache_size = pcm->get_tuned_mem();
5158 inc_alloc = inc_cache->get_committed_size();
5159 full_alloc = full_cache->get_committed_size();
5160 kv_alloc = rocksdb_binned_kv_cache->get_committed_size();
5161 }
5162
5163 inc_osd_cache.set_bytes(inc_alloc);
5164 full_osd_cache.set_bytes(full_alloc);
5165
92f5a8d4 5166 dout(1) << __func__ << " cache_size:" << cache_size
eafe8130
TL
5167 << " inc_alloc: " << inc_alloc
5168 << " full_alloc: " << full_alloc
5169 << " kv_alloc: " << kv_alloc
5170 << dendl;
7c673cae
FG
5171}
5172
// Mark down any up OSD that has not sent a beacon within
// mon_osd_report_timeout. `last_osd_report` maps osd id -> time of its last
// report; entries are started here for OSDs we have not heard from yet and
// erased for OSDs that no longer exist. Returns true if any OSD was newly
// marked down (caller should propose).
//
// Skipped entirely until we have been leader for at least the timeout, so
// a fresh leader does not mark everything down before reports arrive.
bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
				     std::map<int,utime_t> &last_osd_report)
{
  utime_t timeo(g_conf()->mon_osd_report_timeout, 0);
  if (now - mon->get_leader_since() < timeo) {
    // We haven't been the leader for long enough to consider OSD timeouts
    return false;
  }

  int max_osd = osdmap.get_max_osd();
  bool new_down = false;

  for (int i=0; i < max_osd; ++i) {
    dout(30) << __func__ << ": checking up on osd " << i << dendl;
    if (!osdmap.exists(i)) {
      last_osd_report.erase(i); // if any
      continue;
    }
    if (!osdmap.is_up(i))
      continue;
    const std::map<int,utime_t>::const_iterator t = last_osd_report.find(i);
    if (t == last_osd_report.end()) {
      // it wasn't in the map; start the timer.
      last_osd_report[i] = now;
    } else if (can_mark_down(i)) {
      utime_t diff = now - t->second;
      if (diff > timeo) {
	mon->clog->info() << "osd." << i << " marked down after no beacon for "
			  << diff << " seconds";
	derr << "no beacon from osd." << i << " since " << t->second
	     << ", " << diff << " seconds ago. marking down" << dendl;
	// new_state entries are XORed into the osd state on apply, so
	// setting CEPH_OSD_UP here flips the (currently up) osd to down.
	pending_inc.new_state[i] = CEPH_OSD_UP;
	new_down = true;
      }
    }
  }
  return new_down;
}
5211
11fdf7f2
TL
5212static void dump_cpu_list(Formatter *f, const char *name,
5213 const string& strlist)
7c673cae 5214{
11fdf7f2
TL
5215 cpu_set_t cpu_set;
5216 size_t cpu_set_size;
5217 if (parse_cpu_set_list(strlist.c_str(), &cpu_set_size, &cpu_set) < 0) {
5218 return;
5219 }
5220 set<int> cpus = cpu_set_to_set(cpu_set_size, &cpu_set);
5221 f->open_array_section(name);
5222 for (auto cpu : cpus) {
5223 f->dump_int("cpu", cpu);
7c673cae 5224 }
11fdf7f2 5225 f->close_section();
7c673cae
FG
5226}
5227
// Dump the OSD monitor's state to `f`: the osdmap, per-OSD metadata,
// clean-epoch bookkeeping, committed-version range, the crush map, and
// (when present) the osdmap manifest.
void OSDMonitor::dump_info(Formatter *f)
{
  f->open_object_section("osdmap");
  osdmap.dump(f);
  f->close_section();

  f->open_array_section("osd_metadata");
  for (int i=0; i<osdmap.get_max_osd(); ++i) {
    if (osdmap.exists(i)) {
      f->open_object_section("osd");
      f->dump_unsigned("id", i);
      dump_osd_metadata(i, f, NULL);
      f->close_section();
    }
  }
  f->close_section();

  f->open_object_section("osdmap_clean_epochs");
  f->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());

  f->open_object_section("last_epoch_clean");
  last_epoch_clean.dump(f);
  f->close_section();

  f->open_array_section("osd_epochs");
  for (auto& osd_epoch : osd_epochs) {
    f->open_object_section("osd");
    f->dump_unsigned("id", osd_epoch.first);
    f->dump_unsigned("epoch", osd_epoch.second);
    f->close_section();
  }
  f->close_section(); // osd_epochs

  f->close_section(); // osd_clean_epochs

  f->dump_unsigned("osdmap_first_committed", get_first_committed());
  f->dump_unsigned("osdmap_last_committed", get_last_committed());

  f->open_object_section("crushmap");
  osdmap.crush->dump(f);
  f->close_section();

  if (has_osdmap_manifest) {
    f->open_object_section("osdmap_manifest");
    osdmap_manifest.dump(f);
    f->close_section();
  }
}
5276
namespace {
  // Keys understood by "osd pool get"/"osd pool set"; used below to build
  // per-pool-type subsets via set arithmetic.
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
    PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
    PG_AUTOSCALE_BIAS };

  // Return the set difference `first` \ `second`: every choice present in
  // `first` that is not also present in `second`.
  std::set<osd_pool_get_choices>
    subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			       const std::set<osd_pool_get_choices>& second)
    {
      std::set<osd_pool_get_choices> remaining;
      for (const auto choice : first) {
	if (second.count(choice) == 0) {
	  remaining.insert(choice);
	}
      }
      return remaining;
    }
}
5310
5311
5312bool OSDMonitor::preprocess_command(MonOpRequestRef op)
5313{
5314 op->mark_osdmon_event(__func__);
9f95a23c 5315 auto m = op->get_req<MMonCommand>();
7c673cae
FG
5316 int r = 0;
5317 bufferlist rdata;
5318 stringstream ss, ds;
5319
11fdf7f2 5320 cmdmap_t cmdmap;
7c673cae
FG
5321 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
5322 string rs = ss.str();
5323 mon->reply_command(op, -EINVAL, rs, get_last_committed());
5324 return true;
5325 }
5326
11fdf7f2 5327 MonSession *session = op->get_session();
7c673cae 5328 if (!session) {
11fdf7f2 5329 derr << __func__ << " no session" << dendl;
7c673cae
FG
5330 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
5331 return true;
5332 }
5333
5334 string prefix;
9f95a23c 5335 cmd_getval(cmdmap, "prefix", prefix);
7c673cae
FG
5336
5337 string format;
9f95a23c 5338 cmd_getval(cmdmap, "format", format, string("plain"));
7c673cae
FG
5339 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5340
5341 if (prefix == "osd stat") {
92f5a8d4
TL
5342 if (f) {
5343 f->open_object_section("osdmap");
5344 osdmap.print_summary(f.get(), ds, "", true);
5345 f->close_section();
7c673cae 5346 f->flush(rdata);
92f5a8d4
TL
5347 } else {
5348 osdmap.print_summary(nullptr, ds, "", true);
7c673cae 5349 rdata.append(ds);
92f5a8d4 5350 }
7c673cae 5351 }
7c673cae
FG
5352 else if (prefix == "osd dump" ||
5353 prefix == "osd tree" ||
11fdf7f2 5354 prefix == "osd tree-from" ||
7c673cae
FG
5355 prefix == "osd ls" ||
5356 prefix == "osd getmap" ||
31f18b77 5357 prefix == "osd getcrushmap" ||
9f95a23c
TL
5358 prefix == "osd ls-tree" ||
5359 prefix == "osd info") {
7c673cae
FG
5360 string val;
5361
5362 epoch_t epoch = 0;
5363 int64_t epochnum;
9f95a23c 5364 cmd_getval(cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
7c673cae
FG
5365 epoch = epochnum;
5366
5367 bufferlist osdmap_bl;
5368 int err = get_version_full(epoch, osdmap_bl);
5369 if (err == -ENOENT) {
5370 r = -ENOENT;
5371 ss << "there is no map for epoch " << epoch;
5372 goto reply;
5373 }
11fdf7f2
TL
5374 ceph_assert(err == 0);
5375 ceph_assert(osdmap_bl.length());
7c673cae
FG
5376
5377 OSDMap *p;
5378 if (epoch == osdmap.get_epoch()) {
5379 p = &osdmap;
5380 } else {
5381 p = new OSDMap;
5382 p->decode(osdmap_bl);
5383 }
5384
224ce89b
WB
5385 auto sg = make_scope_guard([&] {
5386 if (p != &osdmap) {
5387 delete p;
5388 }
5389 });
5390
7c673cae
FG
5391 if (prefix == "osd dump") {
5392 stringstream ds;
5393 if (f) {
5394 f->open_object_section("osdmap");
5395 p->dump(f.get());
5396 f->close_section();
5397 f->flush(ds);
5398 } else {
5399 p->print(ds);
5400 }
5401 rdata.append(ds);
5402 if (!f)
5403 ds << " ";
5404 } else if (prefix == "osd ls") {
5405 if (f) {
5406 f->open_array_section("osds");
5407 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5408 if (osdmap.exists(i)) {
5409 f->dump_int("osd", i);
5410 }
5411 }
5412 f->close_section();
5413 f->flush(ds);
5414 } else {
5415 bool first = true;
5416 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5417 if (osdmap.exists(i)) {
5418 if (!first)
5419 ds << "\n";
5420 first = false;
5421 ds << i;
5422 }
5423 }
5424 }
5425 rdata.append(ds);
9f95a23c
TL
5426 } else if (prefix == "osd info") {
5427 int64_t osd_id;
5428 bool do_single_osd = true;
5429 if (!cmd_getval(cmdmap, "id", osd_id)) {
5430 do_single_osd = false;
5431 }
5432
5433 if (do_single_osd && !osdmap.exists(osd_id)) {
5434 ss << "osd." << osd_id << " does not exist";
5435 r = -EINVAL;
5436 goto reply;
5437 }
5438
5439 if (f) {
5440 if (do_single_osd) {
5441 osdmap.dump_osd(osd_id, f.get());
5442 } else {
5443 osdmap.dump_osds(f.get());
5444 }
5445 f->flush(ds);
5446 } else {
5447 if (do_single_osd) {
5448 osdmap.print_osd(osd_id, ds);
5449 } else {
5450 osdmap.print_osds(ds);
5451 }
5452 }
5453 rdata.append(ds);
11fdf7f2
TL
5454 } else if (prefix == "osd tree" || prefix == "osd tree-from") {
5455 string bucket;
5456 if (prefix == "osd tree-from") {
9f95a23c 5457 cmd_getval(cmdmap, "bucket", bucket);
11fdf7f2
TL
5458 if (!osdmap.crush->name_exists(bucket)) {
5459 ss << "bucket '" << bucket << "' does not exist";
5460 r = -ENOENT;
5461 goto reply;
5462 }
5463 int id = osdmap.crush->get_item_id(bucket);
5464 if (id >= 0) {
5465 ss << "\"" << bucket << "\" is not a bucket";
5466 r = -EINVAL;
5467 goto reply;
5468 }
5469 }
5470
31f18b77 5471 vector<string> states;
9f95a23c 5472 cmd_getval(cmdmap, "states", states);
31f18b77
FG
5473 unsigned filter = 0;
5474 for (auto& s : states) {
5475 if (s == "up") {
5476 filter |= OSDMap::DUMP_UP;
5477 } else if (s == "down") {
5478 filter |= OSDMap::DUMP_DOWN;
5479 } else if (s == "in") {
5480 filter |= OSDMap::DUMP_IN;
5481 } else if (s == "out") {
5482 filter |= OSDMap::DUMP_OUT;
c07f9fc5
FG
5483 } else if (s == "destroyed") {
5484 filter |= OSDMap::DUMP_DESTROYED;
31f18b77
FG
5485 } else {
5486 ss << "unrecognized state '" << s << "'";
5487 r = -EINVAL;
5488 goto reply;
5489 }
5490 }
5491 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
c07f9fc5
FG
5492 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
5493 ss << "cannot specify both 'in' and 'out'";
5494 r = -EINVAL;
5495 goto reply;
5496 }
5497 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
5498 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
5499 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
5500 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
5501 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
5502 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
5503 ss << "can specify only one of 'up', 'down' and 'destroyed'";
31f18b77
FG
5504 r = -EINVAL;
5505 goto reply;
5506 }
7c673cae
FG
5507 if (f) {
5508 f->open_object_section("tree");
11fdf7f2 5509 p->print_tree(f.get(), NULL, filter, bucket);
7c673cae
FG
5510 f->close_section();
5511 f->flush(ds);
5512 } else {
11fdf7f2 5513 p->print_tree(NULL, &ds, filter, bucket);
7c673cae
FG
5514 }
5515 rdata.append(ds);
5516 } else if (prefix == "osd getmap") {
5517 rdata.append(osdmap_bl);
5518 ss << "got osdmap epoch " << p->get_epoch();
5519 } else if (prefix == "osd getcrushmap") {
5520 p->crush->encode(rdata, mon->get_quorum_con_features());
31f18b77
FG
5521 ss << p->get_crush_version();
5522 } else if (prefix == "osd ls-tree") {
5523 string bucket_name;
9f95a23c 5524 cmd_getval(cmdmap, "name", bucket_name);
31f18b77
FG
5525 set<int> osds;
5526 r = p->get_osds_by_bucket_name(bucket_name, &osds);
5527 if (r == -ENOENT) {
5528 ss << "\"" << bucket_name << "\" does not exist";
5529 goto reply;
5530 } else if (r < 0) {
5531 ss << "can not parse bucket name:\"" << bucket_name << "\"";
5532 goto reply;
5533 }
5534
5535 if (f) {
5536 f->open_array_section("osds");
5537 for (auto &i : osds) {
5538 if (osdmap.exists(i)) {
5539 f->dump_int("osd", i);
5540 }
5541 }
5542 f->close_section();
5543 f->flush(ds);
5544 } else {
5545 bool first = true;
5546 for (auto &i : osds) {
5547 if (osdmap.exists(i)) {
5548 if (!first)
5549 ds << "\n";
5550 first = false;
5551 ds << i;
5552 }
5553 }
5554 }
5555
5556 rdata.append(ds);
7c673cae 5557 }
7c673cae
FG
5558 } else if (prefix == "osd getmaxosd") {
5559 if (f) {
5560 f->open_object_section("getmaxosd");
5561 f->dump_unsigned("epoch", osdmap.get_epoch());
5562 f->dump_int("max_osd", osdmap.get_max_osd());
5563 f->close_section();
5564 f->flush(rdata);
5565 } else {
5566 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
5567 rdata.append(ds);
5568 }
5569 } else if (prefix == "osd utilization") {
5570 string out;
5571 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
5572 if (f)
5573 f->flush(rdata);
5574 else
5575 rdata.append(out);
5576 r = 0;
5577 goto reply;
5578 } else if (prefix == "osd find") {
5579 int64_t osd;
9f95a23c 5580 if (!cmd_getval(cmdmap, "id", osd)) {
7c673cae
FG
5581 ss << "unable to parse osd id value '"
5582 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5583 r = -EINVAL;
5584 goto reply;
5585 }
5586 if (!osdmap.exists(osd)) {
5587 ss << "osd." << osd << " does not exist";
5588 r = -ENOENT;
5589 goto reply;
5590 }
5591 string format;
9f95a23c 5592 cmd_getval(cmdmap, "format", format);
7c673cae
FG
5593 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5594 f->open_object_section("osd_location");
5595 f->dump_int("osd", osd);
11fdf7f2 5596 f->dump_object("addrs", osdmap.get_addrs(osd));
f64942e4 5597 f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
11fdf7f2
TL
5598
5599 // try to identify host, pod/container name, etc.
5600 map<string,string> m;
5601 load_metadata(osd, m, nullptr);
5602 if (auto p = m.find("hostname"); p != m.end()) {
5603 f->dump_string("host", p->second);
5604 }
5605 for (auto& k : {
5606 "pod_name", "pod_namespace", // set by rook
9f95a23c 5607 "container_name" // set by cephadm, ceph-ansible
11fdf7f2
TL
5608 }) {
5609 if (auto p = m.find(k); p != m.end()) {
5610 f->dump_string(k, p->second);
5611 }
5612 }
5613
5614 // crush is helpful too
7c673cae
FG
5615 f->open_object_section("crush_location");
5616 map<string,string> loc = osdmap.crush->get_full_location(osd);
5617 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
5618 f->dump_string(p->first.c_str(), p->second);
5619 f->close_section();
5620 f->close_section();
5621 f->flush(rdata);
5622 } else if (prefix == "osd metadata") {
5623 int64_t osd = -1;
5624 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
9f95a23c 5625 !cmd_getval(cmdmap, "id", osd)) {
7c673cae
FG
5626 ss << "unable to parse osd id value '"
5627 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5628 r = -EINVAL;
5629 goto reply;
5630 }
5631 if (osd >= 0 && !osdmap.exists(osd)) {
5632 ss << "osd." << osd << " does not exist";
5633 r = -ENOENT;
5634 goto reply;
5635 }
5636 string format;
9f95a23c 5637 cmd_getval(cmdmap, "format", format);
7c673cae
FG
5638 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5639 if (osd >= 0) {
5640 f->open_object_section("osd_metadata");
5641 f->dump_unsigned("id", osd);
5642 r = dump_osd_metadata(osd, f.get(), &ss);
5643 if (r < 0)
5644 goto reply;
5645 f->close_section();
5646 } else {
5647 r = 0;
5648 f->open_array_section("osd_metadata");
5649 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5650 if (osdmap.exists(i)) {
5651 f->open_object_section("osd");
5652 f->dump_unsigned("id", i);
5653 r = dump_osd_metadata(i, f.get(), NULL);
5654 if (r == -EINVAL || r == -ENOENT) {
5655 // Drop error, continue to get other daemons' metadata
5656 dout(4) << "No metadata for osd." << i << dendl;
5657 r = 0;
5658 } else if (r < 0) {
5659 // Unexpected error
5660 goto reply;
5661 }
5662 f->close_section();
5663 }
5664 }
5665 f->close_section();
5666 }
5667 f->flush(rdata);
31f18b77
FG
5668 } else if (prefix == "osd versions") {
5669 if (!f)
5670 f.reset(Formatter::create("json-pretty"));
5671 count_metadata("ceph_version", f.get());
5672 f->flush(rdata);
5673 r = 0;
5674 } else if (prefix == "osd count-metadata") {
5675 if (!f)
5676 f.reset(Formatter::create("json-pretty"));
5677 string field;
9f95a23c 5678 cmd_getval(cmdmap, "property", field);
31f18b77
FG
5679 count_metadata(field, f.get());
5680 f->flush(rdata);
5681 r = 0;
11fdf7f2
TL
5682 } else if (prefix == "osd numa-status") {
5683 TextTable tbl;
5684 if (f) {
5685 f->open_array_section("osds");
5686 } else {
5687 tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT);
5688 tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT);
5689 tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT);
5690 tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT);
5691 tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT);
5692 tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT);
5693 }
5694 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5695 if (osdmap.exists(i)) {
5696 map<string,string> m;
5697 ostringstream err;
5698 if (load_metadata(i, m, &err) < 0) {
5699 continue;
5700 }
5701 string host;
5702 auto p = m.find("hostname");
5703 if (p != m.end()) {
5704 host = p->second;
5705 }
5706 if (f) {
5707 f->open_object_section("osd");
5708 f->dump_int("osd", i);
5709 f->dump_string("host", host);
5710 for (auto n : { "network_numa_node", "objectstore_numa_node",
5711 "numa_node" }) {
5712 p = m.find(n);
5713 if (p != m.end()) {
5714 f->dump_int(n, atoi(p->second.c_str()));
5715 }
5716 }
5717 for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) {
5718 p = m.find(n);
5719 if (p != m.end()) {
5720 list<string> ls = get_str_list(p->second, ",");
5721 f->open_array_section(n);
5722 for (auto node : ls) {
5723 f->dump_int("node", atoi(node.c_str()));
5724 }
5725 f->close_section();
5726 }
5727 }
5728 for (auto n : { "numa_node_cpus" }) {
5729 p = m.find(n);
5730 if (p != m.end()) {
5731 dump_cpu_list(f.get(), n, p->second);
5732 }
5733 }
5734 f->close_section();
5735 } else {
5736 tbl << i;
5737 tbl << host;
5738 p = m.find("network_numa_nodes");
5739 if (p != m.end()) {
5740 tbl << p->second;
5741 } else {
5742 tbl << "-";
5743 }
5744 p = m.find("objectstore_numa_nodes");
5745 if (p != m.end()) {
5746 tbl << p->second;
5747 } else {
5748 tbl << "-";
5749 }
5750 p = m.find("numa_node");
5751 auto q = m.find("numa_node_cpus");
5752 if (p != m.end() && q != m.end()) {
5753 tbl << p->second;
5754 tbl << q->second;
5755 } else {
5756 tbl << "-";
5757 tbl << "-";
5758 }
5759 tbl << TextTable::endrow;
5760 }
5761 }
5762 }
5763 if (f) {
5764 f->close_section();
5765 f->flush(rdata);
5766 } else {
5767 rdata.append(stringify(tbl));
5768 }
7c673cae
FG
5769 } else if (prefix == "osd map") {
5770 string poolstr, objstr, namespacestr;
9f95a23c
TL
5771 cmd_getval(cmdmap, "pool", poolstr);
5772 cmd_getval(cmdmap, "object", objstr);
5773 cmd_getval(cmdmap, "nspace", namespacestr);
7c673cae
FG
5774
5775 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5776 if (pool < 0) {
5777 ss << "pool " << poolstr << " does not exist";
5778 r = -ENOENT;
5779 goto reply;
5780 }
5781 object_locator_t oloc(pool, namespacestr);
5782 object_t oid(objstr);
5783 pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
5784 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5785 vector<int> up, acting;
5786 int up_p, acting_p;
5787 osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
5788
5789 string fullobjname;
5790 if (!namespacestr.empty())
5791 fullobjname = namespacestr + string("/") + oid.name;
5792 else
5793 fullobjname = oid.name;
5794 if (f) {
5795 f->open_object_section("osd_map");
5796 f->dump_unsigned("epoch", osdmap.get_epoch());
5797 f->dump_string("pool", poolstr);
5798 f->dump_int("pool_id", pool);
5799 f->dump_stream("objname") << fullobjname;
5800 f->dump_stream("raw_pgid") << pgid;
5801 f->dump_stream("pgid") << mpgid;
5802 f->open_array_section("up");
5803 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
5804 f->dump_int("osd", *p);
5805 f->close_section();
5806 f->dump_int("up_primary", up_p);
5807 f->open_array_section("acting");
5808 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
5809 f->dump_int("osd", *p);
5810 f->close_section();
5811 f->dump_int("acting_primary", acting_p);
5812 f->close_section(); // osd_map
5813 f->flush(rdata);
5814 } else {
5815 ds << "osdmap e" << osdmap.get_epoch()
5816 << " pool '" << poolstr << "' (" << pool << ")"
5817 << " object '" << fullobjname << "' ->"
5818 << " pg " << pgid << " (" << mpgid << ")"
5819 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
5820 << pg_vector_string(acting) << ", p" << acting_p << ")";
5821 rdata.append(ds);
5822 }
5823
5824 } else if (prefix == "pg map") {
5825 pg_t pgid;
5826 string pgidstr;
9f95a23c 5827 cmd_getval(cmdmap, "pgid", pgidstr);
7c673cae
FG
5828 if (!pgid.parse(pgidstr.c_str())) {
5829 ss << "invalid pgid '" << pgidstr << "'";
5830 r = -EINVAL;
5831 goto reply;
5832 }
5833 vector<int> up, acting;
5834 if (!osdmap.have_pg_pool(pgid.pool())) {
5835 ss << "pg '" << pgidstr << "' does not exist";
5836 r = -ENOENT;
5837 goto reply;
5838 }
5839 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5840 osdmap.pg_to_up_acting_osds(pgid, up, acting);
5841 if (f) {
5842 f->open_object_section("pg_map");
5843 f->dump_unsigned("epoch", osdmap.get_epoch());
5844 f->dump_stream("raw_pgid") << pgid;
5845 f->dump_stream("pgid") << mpgid;
5846 f->open_array_section("up");
5847 for (auto osd : up) {
5848 f->dump_int("up_osd", osd);
5849 }
5850 f->close_section();
5851 f->open_array_section("acting");
5852 for (auto osd : acting) {
5853 f->dump_int("acting_osd", osd);
5854 }
5855 f->close_section();
5856 f->close_section();
5857 f->flush(rdata);
5858 } else {
5859 ds << "osdmap e" << osdmap.get_epoch()
5860 << " pg " << pgid << " (" << mpgid << ")"
5861 << " -> up " << up << " acting " << acting;
5862 rdata.append(ds);
5863 }
5864 goto reply;
5865
7c673cae 5866 } else if (prefix == "osd lspools") {
7c673cae
FG
5867 if (f)
5868 f->open_array_section("pools");
5869 for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
5870 p != osdmap.pools.end();
5871 ++p) {
11fdf7f2
TL
5872 if (f) {
5873 f->open_object_section("pool");
5874 f->dump_int("poolnum", p->first);
5875 f->dump_string("poolname", osdmap.pool_name[p->first]);
5876 f->close_section();
5877 } else {
5878 ds << p->first << ' ' << osdmap.pool_name[p->first];
5879 if (next(p) != osdmap.pools.end()) {
5880 ds << '\n';
7c673cae
FG
5881 }
5882 }
5883 }
5884 if (f) {
5885 f->close_section();
5886 f->flush(ds);
5887 }
5888 rdata.append(ds);
5889 } else if (prefix == "osd blacklist ls") {
5890 if (f)
5891 f->open_array_section("blacklist");
5892
5893 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
5894 p != osdmap.blacklist.end();
5895 ++p) {
5896 if (f) {
5897 f->open_object_section("entry");
11fdf7f2 5898 f->dump_string("addr", p->first.get_legacy_str());
7c673cae
FG
5899 f->dump_stream("until") << p->second;
5900 f->close_section();
5901 } else {
5902 stringstream ss;
5903 string s;
5904 ss << p->first << " " << p->second;
5905 getline(ss, s);
5906 s += "\n";
5907 rdata.append(s);
5908 }
5909 }
5910 if (f) {
5911 f->close_section();
5912 f->flush(rdata);
5913 }
5914 ss << "listed " << osdmap.blacklist.size() << " entries";
5915
5916 } else if (prefix == "osd pool ls") {
5917 string detail;
9f95a23c 5918 cmd_getval(cmdmap, "detail", detail);
7c673cae
FG
5919 if (!f && detail == "detail") {
5920 ostringstream ss;
5921 osdmap.print_pools(ss);
5922 rdata.append(ss.str());
5923 } else {
5924 if (f)
5925 f->open_array_section("pools");
5926 for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
5927 it != osdmap.get_pools().end();
5928 ++it) {
5929 if (f) {
5930 if (detail == "detail") {
5931 f->open_object_section("pool");
eafe8130 5932 f->dump_int("pool_id", it->first);
7c673cae
FG
5933 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
5934 it->second.dump(f.get());
5935 f->close_section();
5936 } else {
5937 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
5938 }
5939 } else {
5940 rdata.append(osdmap.get_pool_name(it->first) + "\n");
5941 }
5942 }
5943 if (f) {
5944 f->close_section();
5945 f->flush(rdata);
5946 }
5947 }
5948
5949 } else if (prefix == "osd crush get-tunable") {
5950 string tunable;
9f95a23c 5951 cmd_getval(cmdmap, "tunable", tunable);
7c673cae
FG
5952 ostringstream rss;
5953 if (f)
5954 f->open_object_section("tunable");
5955 if (tunable == "straw_calc_version") {
5956 if (f)
5957 f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
5958 else
5959 rss << osdmap.crush->get_straw_calc_version() << "\n";
5960 } else {
5961 r = -EINVAL;
5962 goto reply;
5963 }
5964 if (f) {
5965 f->close_section();
5966 f->flush(rdata);
5967 } else {
5968 rdata.append(rss.str());
5969 }
5970 r = 0;
5971
5972 } else if (prefix == "osd pool get") {
5973 string poolstr;
9f95a23c 5974 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
5975 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5976 if (pool < 0) {
5977 ss << "unrecognized pool '" << poolstr << "'";
5978 r = -ENOENT;
5979 goto reply;
5980 }
5981
5982 const pg_pool_t *p = osdmap.get_pg_pool(pool);
5983 string var;
9f95a23c 5984 cmd_getval(cmdmap, "var", var);
7c673cae
FG
5985
5986 typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
5987 const choices_map_t ALL_CHOICES = {
5988 {"size", SIZE},
5989 {"min_size", MIN_SIZE},
7c673cae 5990 {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
28e407b8
AA
5991 {"crush_rule", CRUSH_RULE}, {"hashpspool", HASHPSPOOL},
5992 {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
7c673cae
FG
5993 {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
5994 {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
5995 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
5996 {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
5997 {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
5998 {"use_gmt_hitset", USE_GMT_HITSET},
11fdf7f2 5999 {"target_max_objects", TARGET_MAX_OBJECTS},
7c673cae
FG
6000 {"target_max_bytes", TARGET_MAX_BYTES},
6001 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
6002 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
6003 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
6004 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
6005 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
6006 {"erasure_code_profile", ERASURE_CODE_PROFILE},
6007 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
6008 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
6009 {"fast_read", FAST_READ},
6010 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
6011 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
6012 {"scrub_min_interval", SCRUB_MIN_INTERVAL},
6013 {"scrub_max_interval", SCRUB_MAX_INTERVAL},
6014 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
6015 {"recovery_priority", RECOVERY_PRIORITY},
6016 {"recovery_op_priority", RECOVERY_OP_PRIORITY},
6017 {"scrub_priority", SCRUB_PRIORITY},
6018 {"compression_mode", COMPRESSION_MODE},
6019 {"compression_algorithm", COMPRESSION_ALGORITHM},
6020 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
6021 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
6022 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
6023 {"csum_type", CSUM_TYPE},
6024 {"csum_max_block", CSUM_MAX_BLOCK},
6025 {"csum_min_block", CSUM_MIN_BLOCK},
11fdf7f2
TL
6026 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM},
6027 {"pg_autoscale_mode", PG_AUTOSCALE_MODE},
6028 {"pg_num_min", PG_NUM_MIN},
6029 {"target_size_bytes", TARGET_SIZE_BYTES},
6030 {"target_size_ratio", TARGET_SIZE_RATIO},
6031 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS},
7c673cae
FG
6032 };
6033
6034 typedef std::set<osd_pool_get_choices> choices_set_t;
6035
6036 const choices_set_t ONLY_TIER_CHOICES = {
6037 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
6038 TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
6039 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
6040 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
6041 MIN_READ_RECENCY_FOR_PROMOTE,
c07f9fc5 6042 MIN_WRITE_RECENCY_FOR_PROMOTE,
7c673cae
FG
6043 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
6044 };
6045 const choices_set_t ONLY_ERASURE_CHOICES = {
28e407b8 6046 EC_OVERWRITES, ERASURE_CODE_PROFILE
7c673cae
FG
6047 };
6048
6049 choices_set_t selected_choices;
6050 if (var == "all") {
6051 for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
6052 it != ALL_CHOICES.end(); ++it) {
6053 selected_choices.insert(it->second);
6054 }
6055
6056 if(!p->is_tier()) {
6057 selected_choices = subtract_second_from_first(selected_choices,
6058 ONLY_TIER_CHOICES);
6059 }
6060
6061 if(!p->is_erasure()) {
6062 selected_choices = subtract_second_from_first(selected_choices,
6063 ONLY_ERASURE_CHOICES);
6064 }
6065 } else /* var != "all" */ {
6066 choices_map_t::const_iterator found = ALL_CHOICES.find(var);
6067 osd_pool_get_choices selected = found->second;
6068
6069 if (!p->is_tier() &&
6070 ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
6071 ss << "pool '" << poolstr
6072 << "' is not a tier pool: variable not applicable";
6073 r = -EACCES;
6074 goto reply;
6075 }
6076
6077 if (!p->is_erasure() &&
6078 ONLY_ERASURE_CHOICES.find(selected)
6079 != ONLY_ERASURE_CHOICES.end()) {
6080 ss << "pool '" << poolstr
6081 << "' is not a erasure pool: variable not applicable";
6082 r = -EACCES;
6083 goto reply;
6084 }
6085
94b18763
FG
6086 if (pool_opts_t::is_opt_name(var) &&
6087 !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
6088 ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
6089 r = -ENOENT;
6090 goto reply;
6091 }
6092
7c673cae
FG
6093 selected_choices.insert(selected);
6094 }
6095
6096 if (f) {
94b18763
FG
6097 f->open_object_section("pool");
6098 f->dump_string("pool", poolstr);
6099 f->dump_int("pool_id", pool);
7c673cae
FG
6100 for(choices_set_t::const_iterator it = selected_choices.begin();
6101 it != selected_choices.end(); ++it) {
6102 choices_map_t::const_iterator i;
c07f9fc5
FG
6103 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6104 if (i->second == *it) {
6105 break;
6106 }
6107 }
11fdf7f2 6108 ceph_assert(i != ALL_CHOICES.end());
7c673cae
FG
6109 switch(*it) {
6110 case PG_NUM:
6111 f->dump_int("pg_num", p->get_pg_num());
6112 break;
6113 case PGP_NUM:
6114 f->dump_int("pgp_num", p->get_pgp_num());
6115 break;
7c673cae
FG
6116 case SIZE:
6117 f->dump_int("size", p->get_size());
6118 break;
6119 case MIN_SIZE:
6120 f->dump_int("min_size", p->get_min_size());
6121 break;
7c673cae 6122 case CRUSH_RULE:
31f18b77 6123 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
7c673cae 6124 f->dump_string("crush_rule", osdmap.crush->get_rule_name(
31f18b77 6125 p->get_crush_rule()));
7c673cae 6126 } else {
31f18b77 6127 f->dump_string("crush_rule", stringify(p->get_crush_rule()));
7c673cae
FG
6128 }
6129 break;
28e407b8
AA
6130 case EC_OVERWRITES:
6131 f->dump_bool("allow_ec_overwrites",
6132 p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
6133 break;
11fdf7f2
TL
6134 case PG_AUTOSCALE_MODE:
6135 f->dump_string("pg_autoscale_mode",
6136 pg_pool_t::get_pg_autoscale_mode_name(
6137 p->pg_autoscale_mode));
6138 break;
7c673cae
FG
6139 case HASHPSPOOL:
6140 case NODELETE:
6141 case NOPGCHANGE:
6142 case NOSIZECHANGE:
6143 case WRITE_FADVISE_DONTNEED:
6144 case NOSCRUB:
6145 case NODEEP_SCRUB:
94b18763
FG
6146 f->dump_bool(i->first.c_str(),
6147 p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
7c673cae
FG
6148 break;
6149 case HIT_SET_PERIOD:
6150 f->dump_int("hit_set_period", p->hit_set_period);
6151 break;
6152 case HIT_SET_COUNT:
6153 f->dump_int("hit_set_count", p->hit_set_count);
6154 break;
6155 case HIT_SET_TYPE:
6156 f->dump_string("hit_set_type",
6157 HitSet::get_type_name(p->hit_set_params.get_type()));
6158 break;
6159 case HIT_SET_FPP:
6160 {
6161 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6162 BloomHitSet::Params *bloomp =
6163 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6164 f->dump_float("hit_set_fpp", bloomp->get_fpp());
6165 } else if(var != "all") {
6166 f->close_section();
6167 ss << "hit set is not of type Bloom; " <<
6168 "invalid to get a false positive rate!";
6169 r = -EINVAL;
6170 goto reply;
6171 }
6172 }
6173 break;
6174 case USE_GMT_HITSET:
6175 f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
6176 break;
6177 case TARGET_MAX_OBJECTS:
6178 f->dump_unsigned("target_max_objects", p->target_max_objects);
6179 break;
6180 case TARGET_MAX_BYTES:
6181 f->dump_unsigned("target_max_bytes", p->target_max_bytes);
6182 break;
6183 case CACHE_TARGET_DIRTY_RATIO:
6184 f->dump_unsigned("cache_target_dirty_ratio_micro",
6185 p->cache_target_dirty_ratio_micro);
6186 f->dump_float("cache_target_dirty_ratio",
6187 ((float)p->cache_target_dirty_ratio_micro/1000000));
6188 break;
6189 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6190 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
6191 p->cache_target_dirty_high_ratio_micro);
6192 f->dump_float("cache_target_dirty_high_ratio",
6193 ((float)p->cache_target_dirty_high_ratio_micro/1000000));
6194 break;
6195 case CACHE_TARGET_FULL_RATIO:
6196 f->dump_unsigned("cache_target_full_ratio_micro",
6197 p->cache_target_full_ratio_micro);
6198 f->dump_float("cache_target_full_ratio",
6199 ((float)p->cache_target_full_ratio_micro/1000000));
6200 break;
6201 case CACHE_MIN_FLUSH_AGE:
6202 f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
6203 break;
6204 case CACHE_MIN_EVICT_AGE:
6205 f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
6206 break;
6207 case ERASURE_CODE_PROFILE:
6208 f->dump_string("erasure_code_profile", p->erasure_code_profile);
6209 break;
6210 case MIN_READ_RECENCY_FOR_PROMOTE:
6211 f->dump_int("min_read_recency_for_promote",
6212 p->min_read_recency_for_promote);
6213 break;
6214 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6215 f->dump_int("min_write_recency_for_promote",
6216 p->min_write_recency_for_promote);
6217 break;
6218 case FAST_READ:
6219 f->dump_int("fast_read", p->fast_read);
6220 break;
6221 case HIT_SET_GRADE_DECAY_RATE:
6222 f->dump_int("hit_set_grade_decay_rate",
6223 p->hit_set_grade_decay_rate);
6224 break;
6225 case HIT_SET_SEARCH_LAST_N:
6226 f->dump_int("hit_set_search_last_n",
6227 p->hit_set_search_last_n);
6228 break;
6229 case SCRUB_MIN_INTERVAL:
6230 case SCRUB_MAX_INTERVAL:
6231 case DEEP_SCRUB_INTERVAL:
6232 case RECOVERY_PRIORITY:
6233 case RECOVERY_OP_PRIORITY:
6234 case SCRUB_PRIORITY:
6235 case COMPRESSION_MODE:
6236 case COMPRESSION_ALGORITHM:
6237 case COMPRESSION_REQUIRED_RATIO:
6238 case COMPRESSION_MAX_BLOB_SIZE:
6239 case COMPRESSION_MIN_BLOB_SIZE:
6240 case CSUM_TYPE:
6241 case CSUM_MAX_BLOCK:
6242 case CSUM_MIN_BLOCK:
11fdf7f2
TL
6243 case FINGERPRINT_ALGORITHM:
6244 case PG_NUM_MIN:
6245 case TARGET_SIZE_BYTES:
6246 case TARGET_SIZE_RATIO:
6247 case PG_AUTOSCALE_BIAS:
c07f9fc5
FG
6248 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6249 if (p->opts.is_set(key)) {
c07f9fc5 6250 if(*it == CSUM_TYPE) {
11fdf7f2 6251 int64_t val;
c07f9fc5
FG
6252 p->opts.get(pool_opts_t::CSUM_TYPE, &val);
6253 f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
6254 } else {
6255 p->opts.dump(i->first, f.get());
6256 }
94b18763 6257 }
7c673cae
FG
6258 break;
6259 }
7c673cae 6260 }
94b18763
FG
6261 f->close_section();
6262 f->flush(rdata);
7c673cae
FG
6263 } else /* !f */ {
6264 for(choices_set_t::const_iterator it = selected_choices.begin();
6265 it != selected_choices.end(); ++it) {
6266 choices_map_t::const_iterator i;
6267 switch(*it) {
6268 case PG_NUM:
6269 ss << "pg_num: " << p->get_pg_num() << "\n";
6270 break;
6271 case PGP_NUM:
6272 ss << "pgp_num: " << p->get_pgp_num() << "\n";
6273 break;
7c673cae
FG
6274 case SIZE:
6275 ss << "size: " << p->get_size() << "\n";
6276 break;
6277 case MIN_SIZE:
6278 ss << "min_size: " << p->get_min_size() << "\n";
6279 break;
7c673cae 6280 case CRUSH_RULE:
31f18b77 6281 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
7c673cae 6282 ss << "crush_rule: " << osdmap.crush->get_rule_name(
31f18b77 6283 p->get_crush_rule()) << "\n";
7c673cae 6284 } else {
31f18b77 6285 ss << "crush_rule: " << p->get_crush_rule() << "\n";
7c673cae
FG
6286 }
6287 break;
11fdf7f2
TL
6288 case PG_AUTOSCALE_MODE:
6289 ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
6290 p->pg_autoscale_mode) <<"\n";
6291 break;
7c673cae
FG
6292 case HIT_SET_PERIOD:
6293 ss << "hit_set_period: " << p->hit_set_period << "\n";
6294 break;
6295 case HIT_SET_COUNT:
6296 ss << "hit_set_count: " << p->hit_set_count << "\n";
6297 break;
6298 case HIT_SET_TYPE:
6299 ss << "hit_set_type: " <<
6300 HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
6301 break;
6302 case HIT_SET_FPP:
6303 {
6304 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6305 BloomHitSet::Params *bloomp =
6306 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6307 ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
6308 } else if(var != "all") {
6309 ss << "hit set is not of type Bloom; " <<
6310 "invalid to get a false positive rate!";
6311 r = -EINVAL;
6312 goto reply;
6313 }
6314 }
6315 break;
6316 case USE_GMT_HITSET:
6317 ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
6318 break;
6319 case TARGET_MAX_OBJECTS:
6320 ss << "target_max_objects: " << p->target_max_objects << "\n";
6321 break;
6322 case TARGET_MAX_BYTES:
6323 ss << "target_max_bytes: " << p->target_max_bytes << "\n";
6324 break;
6325 case CACHE_TARGET_DIRTY_RATIO:
6326 ss << "cache_target_dirty_ratio: "
6327 << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
6328 break;
6329 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6330 ss << "cache_target_dirty_high_ratio: "
6331 << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
6332 break;
6333 case CACHE_TARGET_FULL_RATIO:
6334 ss << "cache_target_full_ratio: "
6335 << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
6336 break;
6337 case CACHE_MIN_FLUSH_AGE:
6338 ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
6339 break;
6340 case CACHE_MIN_EVICT_AGE:
6341 ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
6342 break;
6343 case ERASURE_CODE_PROFILE:
6344 ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
6345 break;
6346 case MIN_READ_RECENCY_FOR_PROMOTE:
6347 ss << "min_read_recency_for_promote: " <<
6348 p->min_read_recency_for_promote << "\n";
6349 break;
6350 case HIT_SET_GRADE_DECAY_RATE:
6351 ss << "hit_set_grade_decay_rate: " <<
6352 p->hit_set_grade_decay_rate << "\n";
6353 break;
6354 case HIT_SET_SEARCH_LAST_N:
6355 ss << "hit_set_search_last_n: " <<
6356 p->hit_set_search_last_n << "\n";
6357 break;
28e407b8
AA
6358 case EC_OVERWRITES:
6359 ss << "allow_ec_overwrites: " <<
6360 (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
6361 "\n";
6362 break;
7c673cae
FG
6363 case HASHPSPOOL:
6364 case NODELETE:
6365 case NOPGCHANGE:
6366 case NOSIZECHANGE:
6367 case WRITE_FADVISE_DONTNEED:
6368 case NOSCRUB:
6369 case NODEEP_SCRUB:
6370 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6371 if (i->second == *it)
6372 break;
6373 }
11fdf7f2 6374 ceph_assert(i != ALL_CHOICES.end());
7c673cae
FG
6375 ss << i->first << ": " <<
6376 (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
6377 "true" : "false") << "\n";
6378 break;
6379 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6380 ss << "min_write_recency_for_promote: " <<
6381 p->min_write_recency_for_promote << "\n";
6382 break;
6383 case FAST_READ:
6384 ss << "fast_read: " << p->fast_read << "\n";
6385 break;
6386 case SCRUB_MIN_INTERVAL:
6387 case SCRUB_MAX_INTERVAL:
6388 case DEEP_SCRUB_INTERVAL:
6389 case RECOVERY_PRIORITY:
6390 case RECOVERY_OP_PRIORITY:
6391 case SCRUB_PRIORITY:
6392 case COMPRESSION_MODE:
6393 case COMPRESSION_ALGORITHM:
6394 case COMPRESSION_REQUIRED_RATIO:
6395 case COMPRESSION_MAX_BLOB_SIZE:
6396 case COMPRESSION_MIN_BLOB_SIZE:
6397 case CSUM_TYPE:
6398 case CSUM_MAX_BLOCK:
6399 case CSUM_MIN_BLOCK:
11fdf7f2
TL
6400 case FINGERPRINT_ALGORITHM:
6401 case PG_NUM_MIN:
6402 case TARGET_SIZE_BYTES:
6403 case TARGET_SIZE_RATIO:
6404 case PG_AUTOSCALE_BIAS:
7c673cae
FG
6405 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6406 if (i->second == *it)
6407 break;
6408 }
11fdf7f2 6409 ceph_assert(i != ALL_CHOICES.end());
7c673cae
FG
6410 {
6411 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6412 if (p->opts.is_set(key)) {
6413 if(key == pool_opts_t::CSUM_TYPE) {
11fdf7f2 6414 int64_t val;
7c673cae
FG
6415 p->opts.get(key, &val);
6416 ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
6417 } else {
6418 ss << i->first << ": " << p->opts.get(key) << "\n";
6419 }
6420 }
6421 }
6422 break;
6423 }
6424 rdata.append(ss.str());
6425 ss.str("");
6426 }
6427 }
6428 r = 0;
7c673cae
FG
6429 } else if (prefix == "osd pool get-quota") {
6430 string pool_name;
9f95a23c 6431 cmd_getval(cmdmap, "pool", pool_name);
7c673cae
FG
6432
6433 int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
6434 if (poolid < 0) {
11fdf7f2 6435 ceph_assert(poolid == -ENOENT);
7c673cae
FG
6436 ss << "unrecognized pool '" << pool_name << "'";
6437 r = -ENOENT;
6438 goto reply;
6439 }
6440 const pg_pool_t *p = osdmap.get_pg_pool(poolid);
9f95a23c
TL
6441 const pool_stat_t* pstat = mon->mgrstatmon()->get_pool_stat(poolid);
6442 const object_stat_sum_t& sum = pstat->stats.sum;
7c673cae
FG
6443 if (f) {
6444 f->open_object_section("pool_quotas");
6445 f->dump_string("pool_name", pool_name);
6446 f->dump_unsigned("pool_id", poolid);
6447 f->dump_unsigned("quota_max_objects", p->quota_max_objects);
9f95a23c 6448 f->dump_int("current_num_objects", sum.num_objects);
7c673cae 6449 f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
9f95a23c 6450 f->dump_int("current_num_bytes", sum.num_bytes);
7c673cae
FG
6451 f->close_section();
6452 f->flush(rdata);
6453 } else {
6454 stringstream rs;
6455 rs << "quotas for pool '" << pool_name << "':\n"
6456 << " max objects: ";
6457 if (p->quota_max_objects == 0)
6458 rs << "N/A";
9f95a23c 6459 else {
1adf2230 6460 rs << si_u_t(p->quota_max_objects) << " objects";
9f95a23c
TL
6461 rs << " (current num objects: " << sum.num_objects << " objects)";
6462 }
7c673cae
FG
6463 rs << "\n"
6464 << " max bytes : ";
6465 if (p->quota_max_bytes == 0)
6466 rs << "N/A";
9f95a23c 6467 else {
1adf2230 6468 rs << byte_u_t(p->quota_max_bytes);
9f95a23c
TL
6469 rs << " (current num bytes: " << sum.num_bytes << " bytes)";
6470 }
7c673cae
FG
6471 rdata.append(rs.str());
6472 }
6473 rdata.append("\n");
6474 r = 0;
6475 } else if (prefix == "osd crush rule list" ||
6476 prefix == "osd crush rule ls") {
c07f9fc5
FG
6477 if (f) {
6478 f->open_array_section("rules");
6479 osdmap.crush->list_rules(f.get());
6480 f->close_section();
6481 f->flush(rdata);
6482 } else {
6483 ostringstream ss;
6484 osdmap.crush->list_rules(&ss);
6485 rdata.append(ss.str());
6486 }
b5b8bbf5
FG
6487 } else if (prefix == "osd crush rule ls-by-class") {
6488 string class_name;
9f95a23c 6489 cmd_getval(cmdmap, "class", class_name);
b5b8bbf5
FG
6490 if (class_name.empty()) {
6491 ss << "no class specified";
6492 r = -EINVAL;
6493 goto reply;
6494 }
6495 set<int> rules;
6496 r = osdmap.crush->get_rules_by_class(class_name, &rules);
6497 if (r < 0) {
6498 ss << "failed to get rules by class '" << class_name << "'";
6499 goto reply;
6500 }
6501 if (f) {
6502 f->open_array_section("rules");
6503 for (auto &rule: rules) {
6504 f->dump_string("name", osdmap.crush->get_rule_name(rule));
6505 }
6506 f->close_section();
6507 f->flush(rdata);
6508 } else {
6509 ostringstream rs;
6510 for (auto &rule: rules) {
6511 rs << osdmap.crush->get_rule_name(rule) << "\n";
6512 }
6513 rdata.append(rs.str());
6514 }
7c673cae
FG
6515 } else if (prefix == "osd crush rule dump") {
6516 string name;
9f95a23c 6517 cmd_getval(cmdmap, "name", name);
7c673cae 6518 string format;
9f95a23c 6519 cmd_getval(cmdmap, "format", format);
7c673cae
FG
6520 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6521 if (name == "") {
6522 f->open_array_section("rules");
6523 osdmap.crush->dump_rules(f.get());
6524 f->close_section();
6525 } else {
6526 int ruleno = osdmap.crush->get_rule_id(name);
6527 if (ruleno < 0) {
31f18b77 6528 ss << "unknown crush rule '" << name << "'";
7c673cae
FG
6529 r = ruleno;
6530 goto reply;
6531 }
6532 osdmap.crush->dump_rule(ruleno, f.get());
6533 }
6534 ostringstream rs;
6535 f->flush(rs);
6536 rs << "\n";
6537 rdata.append(rs.str());
6538 } else if (prefix == "osd crush dump") {
6539 string format;
9f95a23c 6540 cmd_getval(cmdmap, "format", format);
7c673cae
FG
6541 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6542 f->open_object_section("crush_map");
6543 osdmap.crush->dump(f.get());
6544 f->close_section();
6545 ostringstream rs;
6546 f->flush(rs);
6547 rs << "\n";
6548 rdata.append(rs.str());
6549 } else if (prefix == "osd crush show-tunables") {
6550 string format;
9f95a23c 6551 cmd_getval(cmdmap, "format", format);
7c673cae
FG
6552 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6553 f->open_object_section("crush_map_tunables");
6554 osdmap.crush->dump_tunables(f.get());
6555 f->close_section();
6556 ostringstream rs;
6557 f->flush(rs);
6558 rs << "\n";
6559 rdata.append(rs.str());
6560 } else if (prefix == "osd crush tree") {
c07f9fc5 6561 string shadow;
9f95a23c 6562 cmd_getval(cmdmap, "shadow", shadow);
c07f9fc5
FG
6563 bool show_shadow = shadow == "--show-shadow";
6564 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6565 if (f) {
91327a77 6566 f->open_object_section("crush_tree");
c07f9fc5
FG
6567 osdmap.crush->dump_tree(nullptr,
6568 f.get(),
6569 osdmap.get_pool_names(),
6570 show_shadow);
91327a77 6571 f->close_section();
c07f9fc5
FG
6572 f->flush(rdata);
6573 } else {
6574 ostringstream ss;
6575 osdmap.crush->dump_tree(&ss,
6576 nullptr,
6577 osdmap.get_pool_names(),
6578 show_shadow);
6579 rdata.append(ss.str());
6580 }
d2e6a577
FG
6581 } else if (prefix == "osd crush ls") {
6582 string name;
9f95a23c 6583 if (!cmd_getval(cmdmap, "node", name)) {
d2e6a577
FG
6584 ss << "no node specified";
6585 r = -EINVAL;
6586 goto reply;
6587 }
6588 if (!osdmap.crush->name_exists(name)) {
6589 ss << "node '" << name << "' does not exist";
6590 r = -ENOENT;
6591 goto reply;
6592 }
6593 int id = osdmap.crush->get_item_id(name);
6594 list<int> result;
6595 if (id >= 0) {
6596 result.push_back(id);
6597 } else {
6598 int num = osdmap.crush->get_bucket_size(id);
6599 for (int i = 0; i < num; ++i) {
6600 result.push_back(osdmap.crush->get_bucket_item(id, i));
6601 }
6602 }
6603 if (f) {
6604 f->open_array_section("items");
6605 for (auto i : result) {
6606 f->dump_string("item", osdmap.crush->get_item_name(i));
6607 }
6608 f->close_section();
6609 f->flush(rdata);
6610 } else {
6611 ostringstream ss;
6612 for (auto i : result) {
6613 ss << osdmap.crush->get_item_name(i) << "\n";
6614 }
6615 rdata.append(ss.str());
6616 }
6617 r = 0;
7c673cae
FG
6618 } else if (prefix == "osd crush class ls") {
6619 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6620 f->open_array_section("crush_classes");
6621 for (auto i : osdmap.crush->class_name)
6622 f->dump_string("class", i.second);
6623 f->close_section();
6624 f->flush(rdata);
224ce89b
WB
6625 } else if (prefix == "osd crush class ls-osd") {
6626 string name;
9f95a23c 6627 cmd_getval(cmdmap, "class", name);
224ce89b
WB
6628 set<int> osds;
6629 osdmap.crush->get_devices_by_class(name, &osds);
b5b8bbf5
FG
6630 if (f) {
6631 f->open_array_section("osds");
6632 for (auto &osd: osds)
6633 f->dump_int("osd", osd);
6634 f->close_section();
6635 f->flush(rdata);
6636 } else {
6637 bool first = true;
6638 for (auto &osd : osds) {
6639 if (!first)
6640 ds << "\n";
6641 first = false;
6642 ds << osd;
6643 }
6644 rdata.append(ds);
6645 }
11fdf7f2
TL
6646 } else if (prefix == "osd crush get-device-class") {
6647 vector<string> idvec;
9f95a23c 6648 cmd_getval(cmdmap, "ids", idvec);
11fdf7f2
TL
6649 map<int, string> class_by_osd;
6650 for (auto& id : idvec) {
6651 ostringstream ts;
6652 long osd = parse_osd_id(id.c_str(), &ts);
6653 if (osd < 0) {
6654 ss << "unable to parse osd id:'" << id << "'";
6655 r = -EINVAL;
6656 goto reply;
6657 }
6658 auto device_class = osdmap.crush->get_item_class(osd);
6659 if (device_class)
6660 class_by_osd[osd] = device_class;
6661 else
6662 class_by_osd[osd] = ""; // no class
6663 }
6664 if (f) {
6665 f->open_array_section("osd_device_classes");
6666 for (auto& i : class_by_osd) {
6667 f->open_object_section("osd_device_class");
6668 f->dump_int("osd", i.first);
6669 f->dump_string("device_class", i.second);
6670 f->close_section();
6671 }
6672 f->close_section();
6673 f->flush(rdata);
6674 } else {
6675 if (class_by_osd.size() == 1) {
6676 // for single input, make a clean output
6677 ds << class_by_osd.begin()->second;
6678 } else {
6679 // note that we do not group osds by class here
6680 for (auto it = class_by_osd.begin();
6681 it != class_by_osd.end();
6682 it++) {
6683 ds << "osd." << it->first << ' ' << it->second;
6684 if (next(it) != class_by_osd.end())
6685 ds << '\n';
6686 }
6687 }
6688 rdata.append(ds);
6689 }
7c673cae
FG
6690 } else if (prefix == "osd erasure-code-profile ls") {
6691 const auto &profiles = osdmap.get_erasure_code_profiles();
6692 if (f)
6693 f->open_array_section("erasure-code-profiles");
6694 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
6695 if (f)
6696 f->dump_string("profile", i->first.c_str());
6697 else
6698 rdata.append(i->first + "\n");
6699 }
6700 if (f) {
6701 f->close_section();
6702 ostringstream rs;
6703 f->flush(rs);
6704 rs << "\n";
6705 rdata.append(rs.str());
6706 }
c07f9fc5
FG
6707 } else if (prefix == "osd crush weight-set ls") {
6708 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6709 if (f) {
6710 f->open_array_section("weight_sets");
6711 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6712 f->dump_string("pool", "(compat)");
6713 }
6714 for (auto& i : osdmap.crush->choose_args) {
6715 if (i.first >= 0) {
6716 f->dump_string("pool", osdmap.get_pool_name(i.first));
6717 }
6718 }
6719 f->close_section();
6720 f->flush(rdata);
6721 } else {
6722 ostringstream rs;
6723 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6724 rs << "(compat)\n";
6725 }
6726 for (auto& i : osdmap.crush->choose_args) {
6727 if (i.first >= 0) {
6728 rs << osdmap.get_pool_name(i.first) << "\n";
6729 }
6730 }
6731 rdata.append(rs.str());
6732 }
6733 } else if (prefix == "osd crush weight-set dump") {
6734 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6735 "json-pretty"));
6736 osdmap.crush->dump_choose_args(f.get());
6737 f->flush(rdata);
7c673cae
FG
6738 } else if (prefix == "osd erasure-code-profile get") {
6739 string name;
9f95a23c 6740 cmd_getval(cmdmap, "name", name);
7c673cae
FG
6741 if (!osdmap.has_erasure_code_profile(name)) {
6742 ss << "unknown erasure code profile '" << name << "'";
6743 r = -ENOENT;
6744 goto reply;
6745 }
6746 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
6747 if (f)
6748 f->open_object_section("profile");
6749 for (map<string,string>::const_iterator i = profile.begin();
6750 i != profile.end();
6751 ++i) {
6752 if (f)
6753 f->dump_string(i->first.c_str(), i->second.c_str());
6754 else
6755 rdata.append(i->first + "=" + i->second + "\n");
6756 }
6757 if (f) {
6758 f->close_section();
6759 ostringstream rs;
6760 f->flush(rs);
6761 rs << "\n";
6762 rdata.append(rs.str());
6763 }
181888fb
FG
6764 } else if (prefix == "osd pool application get") {
6765 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6766 "json-pretty"));
6767 string pool_name;
9f95a23c 6768 cmd_getval(cmdmap, "pool", pool_name);
181888fb 6769 string app;
9f95a23c 6770 cmd_getval(cmdmap, "app", app);
181888fb 6771 string key;
9f95a23c 6772 cmd_getval(cmdmap, "key", key);
181888fb
FG
6773
6774 if (pool_name.empty()) {
6775 // all
6776 f->open_object_section("pools");
6777 for (const auto &pool : osdmap.pools) {
6778 std::string name("<unknown>");
6779 const auto &pni = osdmap.pool_name.find(pool.first);
6780 if (pni != osdmap.pool_name.end())
6781 name = pni->second;
6782 f->open_object_section(name.c_str());
6783 for (auto &app_pair : pool.second.application_metadata) {
6784 f->open_object_section(app_pair.first.c_str());
6785 for (auto &kv_pair : app_pair.second) {
6786 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6787 }
6788 f->close_section();
6789 }
6790 f->close_section(); // name
6791 }
6792 f->close_section(); // pools
6793 f->flush(rdata);
6794 } else {
6795 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6796 if (pool < 0) {
6797 ss << "unrecognized pool '" << pool_name << "'";
6798 r = -ENOENT;
6799 goto reply;
6800 }
6801 auto p = osdmap.get_pg_pool(pool);
6802 // filter by pool
6803 if (app.empty()) {
6804 f->open_object_section(pool_name.c_str());
6805 for (auto &app_pair : p->application_metadata) {
6806 f->open_object_section(app_pair.first.c_str());
6807 for (auto &kv_pair : app_pair.second) {
6808 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6809 }
6810 f->close_section(); // application
6811 }
6812 f->close_section(); // pool_name
6813 f->flush(rdata);
6814 goto reply;
6815 }
6816
6817 auto app_it = p->application_metadata.find(app);
6818 if (app_it == p->application_metadata.end()) {
6819 ss << "pool '" << pool_name << "' has no application '" << app << "'";
6820 r = -ENOENT;
6821 goto reply;
6822 }
6823 // filter by pool + app
6824 if (key.empty()) {
6825 f->open_object_section(app_it->first.c_str());
6826 for (auto &kv_pair : app_it->second) {
6827 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6828 }
6829 f->close_section(); // application
6830 f->flush(rdata);
6831 goto reply;
6832 }
6833 // filter by pool + app + key
6834 auto key_it = app_it->second.find(key);
6835 if (key_it == app_it->second.end()) {
6836 ss << "application '" << app << "' on pool '" << pool_name
6837 << "' does not have key '" << key << "'";
6838 r = -ENOENT;
6839 goto reply;
6840 }
6841 ss << key_it->second << "\n";
6842 rdata.append(ss.str());
6843 ss.str("");
6844 }
11fdf7f2 6845 } else if (prefix == "osd get-require-min-compat-client") {
9f95a23c 6846 ss << osdmap.require_min_compat_client << std::endl;
11fdf7f2
TL
6847 rdata.append(ss.str());
6848 ss.str("");
6849 goto reply;
6850 } else if (prefix == "osd pool application enable" ||
6851 prefix == "osd pool application disable" ||
6852 prefix == "osd pool application set" ||
6853 prefix == "osd pool application rm") {
6854 bool changed = false;
6855 r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed);
6856 if (r != 0) {
6857 // Error, reply.
6858 goto reply;
6859 } else if (changed) {
6860 // Valid mutation, proceed to prepare phase
6861 return false;
6862 } else {
6863 // Idempotent case, reply
6864 goto reply;
6865 }
7c673cae
FG
6866 } else {
6867 // try prepare update
6868 return false;
6869 }
6870
6871 reply:
6872 string rs;
6873 getline(ss, rs);
6874 mon->reply_command(op, r, rs, rdata, get_last_committed());
6875 return true;
6876}
6877
3efd9988
FG
6878void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
6879{
6880 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6881 osdmap.get_pg_pool(pool_id));
11fdf7f2 6882 ceph_assert(pool);
3efd9988
FG
6883 pool->set_flag(flags);
6884}
6885
6886void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
7c673cae 6887{
3efd9988
FG
6888 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6889 osdmap.get_pg_pool(pool_id));
11fdf7f2 6890 ceph_assert(pool);
3efd9988 6891 pool->unset_flag(flags);
7c673cae
FG
6892}
6893
9f95a23c 6894string OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch)
11fdf7f2
TL
6895{
6896 char k[80];
9f95a23c 6897 snprintf(k, sizeof(k), "purged_epoch_%08lx", (unsigned long)epoch);
11fdf7f2
TL
6898 return k;
6899}
6900
9f95a23c 6901string OSDMonitor::make_purged_snap_key(int64_t pool, snapid_t snap)
11fdf7f2
TL
6902{
6903 char k[80];
9f95a23c 6904 snprintf(k, sizeof(k), "purged_snap_%llu_%016llx",
11fdf7f2
TL
6905 (unsigned long long)pool, (unsigned long long)snap);
6906 return k;
6907}
6908
9f95a23c 6909string OSDMonitor::make_purged_snap_key_value(
11fdf7f2
TL
6910 int64_t pool, snapid_t snap, snapid_t num,
6911 epoch_t epoch, bufferlist *v)
6912{
6913 // encode the *last* epoch in the key so that we can use forward
6914 // iteration only to search for an epoch in an interval.
6915 encode(snap, *v);
6916 encode(snap + num, *v);
6917 encode(epoch, *v);
9f95a23c 6918 return make_purged_snap_key(pool, snap + num - 1);
11fdf7f2
TL
6919}
6920
11fdf7f2 6921
9f95a23c
TL
6922int OSDMonitor::lookup_purged_snap(
6923 int64_t pool, snapid_t snap,
6924 snapid_t *begin, snapid_t *end)
11fdf7f2 6925{
9f95a23c 6926 string k = make_purged_snap_key(pool, snap);
11fdf7f2
TL
6927 auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
6928 it->lower_bound(k);
6929 if (!it->valid()) {
9f95a23c
TL
6930 dout(20) << __func__
6931 << " pool " << pool << " snap " << snap
6932 << " - key '" << k << "' not found" << dendl;
6933 return -ENOENT;
6934 }
6935 if (it->key().find("purged_snap_") != 0) {
6936 dout(20) << __func__
6937 << " pool " << pool << " snap " << snap
6938 << " - key '" << k << "' got '" << it->key()
6939 << "', wrong prefix" << dendl;
11fdf7f2
TL
6940 return -ENOENT;
6941 }
9f95a23c
TL
6942 string gotk = it->key();
6943 const char *format = "purged_snap_%llu_";
6944 long long int keypool;
6945 int n = sscanf(gotk.c_str(), format, &keypool);
6946 if (n != 1) {
6947 derr << __func__ << " invalid k '" << gotk << "'" << dendl;
6948 return -ENOENT;
6949 }
6950 if (pool != keypool) {
6951 dout(20) << __func__
6952 << " pool " << pool << " snap " << snap
6953 << " - key '" << k << "' got '" << gotk
6954 << "', wrong pool " << keypool
6955 << dendl;
11fdf7f2
TL
6956 return -ENOENT;
6957 }
6958 bufferlist v = it->value();
6959 auto p = v.cbegin();
6960 decode(*begin, p);
6961 decode(*end, p);
6962 if (snap < *begin || snap >= *end) {
9f95a23c
TL
6963 dout(20) << __func__
6964 << " pool " << pool << " snap " << snap
6965 << " - found [" << *begin << "," << *end << "), no overlap"
6966 << dendl;
11fdf7f2
TL
6967 return -ENOENT;
6968 }
6969 return 0;
6970}
6971
9f95a23c
TL
6972void OSDMonitor::insert_purged_snap_update(
6973 int64_t pool,
6974 snapid_t start, snapid_t end,
6975 epoch_t epoch,
6976 MonitorDBStore::TransactionRef t)
6977{
6978 snapid_t before_begin, before_end;
6979 snapid_t after_begin, after_end;
6980 int b = lookup_purged_snap(pool, start - 1,
6981 &before_begin, &before_end);
6982 int a = lookup_purged_snap(pool, end,
6983 &after_begin, &after_end);
6984 if (!b && !a) {
6985 dout(10) << __func__
6986 << " [" << start << "," << end << ") - joins ["
6987 << before_begin << "," << before_end << ") and ["
6988 << after_begin << "," << after_end << ")" << dendl;
6989 // erase only the begin record; we'll overwrite the end one.
6990 t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
6991 bufferlist v;
6992 string k = make_purged_snap_key_value(pool,
6993 before_begin, after_end - before_begin,
6994 pending_inc.epoch, &v);
6995 t->put(OSD_SNAP_PREFIX, k, v);
6996 } else if (!b) {
6997 dout(10) << __func__
6998 << " [" << start << "," << end << ") - join with earlier ["
6999 << before_begin << "," << before_end << ")" << dendl;
7000 t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
7001 bufferlist v;
7002 string k = make_purged_snap_key_value(pool,
7003 before_begin, end - before_begin,
7004 pending_inc.epoch, &v);
7005 t->put(OSD_SNAP_PREFIX, k, v);
7006 } else if (!a) {
7007 dout(10) << __func__
7008 << " [" << start << "," << end << ") - join with later ["
7009 << after_begin << "," << after_end << ")" << dendl;
7010 // overwrite after record
7011 bufferlist v;
7012 string k = make_purged_snap_key_value(pool,
7013 start, after_end - start,
7014 pending_inc.epoch, &v);
7015 t->put(OSD_SNAP_PREFIX, k, v);
7016 } else {
7017 dout(10) << __func__
7018 << " [" << start << "," << end << ") - new"
7019 << dendl;
7020 bufferlist v;
7021 string k = make_purged_snap_key_value(pool,
7022 start, end - start,
7023 pending_inc.epoch, &v);
7024 t->put(OSD_SNAP_PREFIX, k, v);
7025 }
7026}
7027
11fdf7f2
TL
7028bool OSDMonitor::try_prune_purged_snaps()
7029{
7030 if (!mon->mgrstatmon()->is_readable()) {
7031 return false;
7032 }
11fdf7f2
TL
7033 if (!pending_inc.new_purged_snaps.empty()) {
7034 return false; // we already pruned for this epoch
7035 }
7036
7037 unsigned max_prune = cct->_conf.get_val<uint64_t>(
7038 "mon_max_snap_prune_per_epoch");
7039 if (!max_prune) {
7040 max_prune = 100000;
7041 }
7042 dout(10) << __func__ << " max_prune " << max_prune << dendl;
7043
7044 unsigned actually_pruned = 0;
7045 auto& purged_snaps = mon->mgrstatmon()->get_digest().purged_snaps;
7046 for (auto& p : osdmap.get_pools()) {
7047 auto q = purged_snaps.find(p.first);
7048 if (q == purged_snaps.end()) {
7049 continue;
7050 }
7051 auto& purged = q->second;
7052 if (purged.empty()) {
7053 dout(20) << __func__ << " " << p.first << " nothing purged" << dendl;
7054 continue;
7055 }
7056 dout(20) << __func__ << " pool " << p.first << " purged " << purged << dendl;
9f95a23c 7057 snap_interval_set_t to_prune;
11fdf7f2
TL
7058 unsigned maybe_pruned = actually_pruned;
7059 for (auto i = purged.begin(); i != purged.end(); ++i) {
7060 snapid_t begin = i.get_start();
7061 auto end = i.get_start() + i.get_len();
7062 snapid_t pbegin = 0, pend = 0;
9f95a23c 7063 int r = lookup_purged_snap(p.first, begin, &pbegin, &pend);
11fdf7f2
TL
7064 if (r == 0) {
7065 // already purged.
7066 // be a bit aggressive about backing off here, because the mon may
7067 // do a lot of work going through this set, and if we know the
7068 // purged set from the OSDs is at least *partly* stale we may as
7069 // well wait for it to be fresh.
9f95a23c 7070 dout(20) << __func__ << " we've already purged " << pbegin
11fdf7f2
TL
7071 << "~" << (pend - pbegin) << dendl;
7072 break; // next pool
7073 }
9f95a23c 7074 if (pbegin && pbegin > begin && pbegin < end) {
11fdf7f2 7075 // the tail of [begin,end) is purged; shorten the range
11fdf7f2
TL
7076 end = pbegin;
7077 }
7078 to_prune.insert(begin, end - begin);
7079 maybe_pruned += end - begin;
7080 if (maybe_pruned >= max_prune) {
7081 break;
7082 }
7083 }
7084 if (!to_prune.empty()) {
7085 // PGs may still be reporting things as purged that we have already
7086 // pruned from removed_snaps_queue.
9f95a23c 7087 snap_interval_set_t actual;
11fdf7f2
TL
7088 auto r = osdmap.removed_snaps_queue.find(p.first);
7089 if (r != osdmap.removed_snaps_queue.end()) {
7090 actual.intersection_of(to_prune, r->second);
7091 }
7092 actually_pruned += actual.size();
7093 dout(10) << __func__ << " pool " << p.first << " reports pruned " << to_prune
7094 << ", actual pruned " << actual << dendl;
7095 if (!actual.empty()) {
7096 pending_inc.new_purged_snaps[p.first].swap(actual);
7097 }
7098 }
7099 if (actually_pruned >= max_prune) {
7100 break;
7101 }
7102 }
7103 dout(10) << __func__ << " actually pruned " << actually_pruned << dendl;
7104 return !!actually_pruned;
7105}
7106
7c673cae
FG
7107bool OSDMonitor::update_pools_status()
7108{
11fdf7f2 7109 if (!mon->mgrstatmon()->is_readable())
7c673cae
FG
7110 return false;
7111
7112 bool ret = false;
7113
7114 auto& pools = osdmap.get_pools();
7115 for (auto it = pools.begin(); it != pools.end(); ++it) {
11fdf7f2 7116 const pool_stat_t *pstat = mon->mgrstatmon()->get_pool_stat(it->first);
31f18b77 7117 if (!pstat)
7c673cae 7118 continue;
31f18b77 7119 const object_stat_sum_t& sum = pstat->stats.sum;
7c673cae
FG
7120 const pg_pool_t &pool = it->second;
7121 const string& pool_name = osdmap.get_pool_name(it->first);
7122
7123 bool pool_is_full =
7124 (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
7125 (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);
7126
11fdf7f2 7127 if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
7c673cae
FG
7128 if (pool_is_full)
7129 continue;
7130
7131 mon->clog->info() << "pool '" << pool_name
3efd9988
FG
7132 << "' no longer out of quota; removing NO_QUOTA flag";
7133 // below we cancel FLAG_FULL too, we'll set it again in
7134 // OSDMonitor::encode_pending if it still fails the osd-full checking.
7135 clear_pool_flags(it->first,
11fdf7f2 7136 pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
7c673cae
FG
7137 ret = true;
7138 } else {
7139 if (!pool_is_full)
7140 continue;
7141
7142 if (pool.quota_max_bytes > 0 &&
7143 (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
7144 mon->clog->warn() << "pool '" << pool_name << "' is full"
7145 << " (reached quota's max_bytes: "
1adf2230 7146 << byte_u_t(pool.quota_max_bytes) << ")";
7c673cae
FG
7147 }
7148 if (pool.quota_max_objects > 0 &&
7149 (uint64_t)sum.num_objects >= pool.quota_max_objects) {
7150 mon->clog->warn() << "pool '" << pool_name << "' is full"
7151 << " (reached quota's max_objects: "
7152 << pool.quota_max_objects << ")";
7153 }
11fdf7f2 7154 // set both FLAG_FULL_QUOTA and FLAG_FULL
3efd9988
FG
7155 // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
7156 // since FLAG_FULL should always take precedence
7157 set_pool_flags(it->first,
11fdf7f2 7158 pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
3efd9988
FG
7159 clear_pool_flags(it->first,
7160 pg_pool_t::FLAG_NEARFULL |
7161 pg_pool_t::FLAG_BACKFILLFULL);
7c673cae
FG
7162 ret = true;
7163 }
7164 }
7165 return ret;
7166}
7167
7c673cae
FG
7168int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
7169{
7170 op->mark_osdmon_event(__func__);
9f95a23c 7171 auto m = op->get_req<MPoolOp>();
7c673cae 7172 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
11fdf7f2 7173 MonSession *session = op->get_session();
7c673cae
FG
7174 if (!session)
7175 return -EPERM;
7176 string erasure_code_profile;
7177 stringstream ss;
31f18b77 7178 string rule_name;
94b18763 7179 int ret = 0;
11fdf7f2
TL
7180 ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
7181 0, 0, 0, 0, 0, 0.0,
7182 erasure_code_profile,
9f95a23c
TL
7183 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, {},
7184 &ss);
94b18763
FG
7185
7186 if (ret < 0) {
7187 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
7188 }
7189 return ret;
7c673cae
FG
7190}
7191
7192int OSDMonitor::crush_rename_bucket(const string& srcname,
7193 const string& dstname,
7194 ostream *ss)
7195{
7196 int ret;
7197 //
7198 // Avoid creating a pending crush if it does not already exists and
7199 // the rename would fail.
7200 //
7201 if (!_have_pending_crush()) {
7202 ret = _get_stable_crush().can_rename_bucket(srcname,
7203 dstname,
7204 ss);
7205 if (ret)
7206 return ret;
7207 }
7208
7209 CrushWrapper newcrush;
7210 _get_pending_crush(newcrush);
7211
7212 ret = newcrush.rename_bucket(srcname,
7213 dstname,
7214 ss);
7215 if (ret)
7216 return ret;
7217
7218 pending_inc.crush.clear();
7219 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7220 *ss << "renamed bucket " << srcname << " into " << dstname;
7221 return 0;
7222}
7223
7224void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
7225{
7226 string replacement = "";
7227
7228 if (plugin == "jerasure_generic" ||
7229 plugin == "jerasure_sse3" ||
7230 plugin == "jerasure_sse4" ||
7231 plugin == "jerasure_neon") {
7232 replacement = "jerasure";
7233 } else if (plugin == "shec_generic" ||
7234 plugin == "shec_sse3" ||
7235 plugin == "shec_sse4" ||
7236 plugin == "shec_neon") {
7237 replacement = "shec";
7238 }
7239
7240 if (replacement != "") {
7241 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
7242 << plugin << " that has been deprecated. Please use "
7243 << replacement << " instead." << dendl;
7244 }
7245}
7246
7247int OSDMonitor::normalize_profile(const string& profilename,
7248 ErasureCodeProfile &profile,
7249 bool force,
7250 ostream *ss)
7251{
7252 ErasureCodeInterfaceRef erasure_code;
7253 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
7254 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
7255 check_legacy_ec_plugin(plugin->second, profilename);
7256 int err = instance.factory(plugin->second,
11fdf7f2 7257 g_conf().get_val<std::string>("erasure_code_dir"),
7c673cae
FG
7258 profile, &erasure_code, ss);
7259 if (err) {
7260 return err;
7261 }
7262
7263 err = erasure_code->init(profile, ss);
7264 if (err) {
7265 return err;
7266 }
7267
7268 auto it = profile.find("stripe_unit");
7269 if (it != profile.end()) {
7270 string err_str;
1adf2230 7271 uint32_t stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
7c673cae
FG
7272 if (!err_str.empty()) {
7273 *ss << "could not parse stripe_unit '" << it->second
7274 << "': " << err_str << std::endl;
7275 return -EINVAL;
7276 }
7277 uint32_t data_chunks = erasure_code->get_data_chunk_count();
7278 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
7279 if (chunk_size != stripe_unit) {
7280 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
7281 << "alignment. Would be padded to " << chunk_size
7282 << std::endl;
7283 return -EINVAL;
7284 }
7285 if ((stripe_unit % 4096) != 0 && !force) {
7286 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
7287 << "use --force to override this check" << std::endl;
7288 return -EINVAL;
7289 }
7290 }
7291 return 0;
7292}
7293
31f18b77 7294int OSDMonitor::crush_rule_create_erasure(const string &name,
7c673cae 7295 const string &profile,
31f18b77 7296 int *rule,
7c673cae
FG
7297 ostream *ss)
7298{
7299 int ruleid = osdmap.crush->get_rule_id(name);
7300 if (ruleid != -ENOENT) {
31f18b77 7301 *rule = osdmap.crush->get_rule_mask_ruleset(ruleid);
7c673cae
FG
7302 return -EEXIST;
7303 }
7304
7305 CrushWrapper newcrush;
7306 _get_pending_crush(newcrush);
7307
7308 ruleid = newcrush.get_rule_id(name);
7309 if (ruleid != -ENOENT) {
31f18b77 7310 *rule = newcrush.get_rule_mask_ruleset(ruleid);
7c673cae
FG
7311 return -EALREADY;
7312 } else {
7313 ErasureCodeInterfaceRef erasure_code;
7314 int err = get_erasure_code(profile, &erasure_code, ss);
7315 if (err) {
7316 *ss << "failed to load plugin using profile " << profile << std::endl;
7317 return err;
7318 }
7319
224ce89b 7320 err = erasure_code->create_rule(name, newcrush, ss);
7c673cae
FG
7321 erasure_code.reset();
7322 if (err < 0)
7323 return err;
31f18b77 7324 *rule = err;
7c673cae
FG
7325 pending_inc.crush.clear();
7326 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7327 return 0;
7328 }
7329}
7330
7331int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
7332 ErasureCodeInterfaceRef *erasure_code,
7333 ostream *ss) const
7334{
7335 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
7336 return -EAGAIN;
7337 ErasureCodeProfile profile =
7338 osdmap.get_erasure_code_profile(erasure_code_profile);
7339 ErasureCodeProfile::const_iterator plugin =
7340 profile.find("plugin");
7341 if (plugin == profile.end()) {
7342 *ss << "cannot determine the erasure code plugin"
7343 << " because there is no 'plugin' entry in the erasure_code_profile "
7344 << profile << std::endl;
7345 return -EINVAL;
7346 }
7347 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
7348 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
7349 return instance.factory(plugin->second,
11fdf7f2 7350 g_conf().get_val<std::string>("erasure_code_dir"),
7c673cae
FG
7351 profile, erasure_code, ss);
7352}
7353
7354int OSDMonitor::check_cluster_features(uint64_t features,
7355 stringstream &ss)
7356{
7357 stringstream unsupported_ss;
7358 int unsupported_count = 0;
7359 if ((mon->get_quorum_con_features() & features) != features) {
7360 unsupported_ss << "the monitor cluster";
7361 ++unsupported_count;
7362 }
7363
7364 set<int32_t> up_osds;
7365 osdmap.get_up_osds(up_osds);
7366 for (set<int32_t>::iterator it = up_osds.begin();
7367 it != up_osds.end(); ++it) {
7368 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
7369 if ((xi.features & features) != features) {
7370 if (unsupported_count > 0)
7371 unsupported_ss << ", ";
7372 unsupported_ss << "osd." << *it;
7373 unsupported_count ++;
7374 }
7375 }
7376
7377 if (unsupported_count > 0) {
7378 ss << "features " << features << " unsupported by: "
7379 << unsupported_ss.str();
7380 return -ENOTSUP;
7381 }
7382
7383 // check pending osd state, too!
7384 for (map<int32_t,osd_xinfo_t>::const_iterator p =
7385 pending_inc.new_xinfo.begin();
7386 p != pending_inc.new_xinfo.end(); ++p) {
7387 const osd_xinfo_t &xi = p->second;
7388 if ((xi.features & features) != features) {
7389 dout(10) << __func__ << " pending osd." << p->first
7390 << " features are insufficient; retry" << dendl;
7391 return -EAGAIN;
7392 }
7393 }
7394
7395 return 0;
7396}
7397
7398bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
7399 stringstream& ss)
7400{
7401 OSDMap::Incremental new_pending = pending_inc;
11fdf7f2 7402 encode(*newcrush, new_pending.crush, mon->get_quorum_con_features());
7c673cae
FG
7403 OSDMap newmap;
7404 newmap.deepish_copy_from(osdmap);
7405 newmap.apply_incremental(new_pending);
7406
7407 // client compat
9f95a23c 7408 if (newmap.require_min_compat_client != ceph_release_t::unknown) {
7c673cae 7409 auto mv = newmap.get_min_compat_client();
31f18b77 7410 if (mv > newmap.require_min_compat_client) {
9f95a23c 7411 ss << "new crush map requires client version " << mv
7c673cae 7412 << " but require_min_compat_client is "
9f95a23c 7413 << newmap.require_min_compat_client;
7c673cae
FG
7414 return false;
7415 }
7416 }
7417
7418 // osd compat
7419 uint64_t features =
7420 newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
7421 newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
7422 stringstream features_ss;
7423 int r = check_cluster_features(features, features_ss);
7424 if (r) {
7425 ss << "Could not change CRUSH: " << features_ss.str();
7426 return false;
7427 }
7428
7429 return true;
7430}
7431
7432bool OSDMonitor::erasure_code_profile_in_use(
7433 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
7434 const string &profile,
7435 ostream *ss)
7436{
7437 bool found = false;
7438 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
7439 p != pools.end();
7440 ++p) {
11fdf7f2 7441 if (p->second.erasure_code_profile == profile && p->second.is_erasure()) {
7c673cae
FG
7442 *ss << osdmap.pool_name[p->first] << " ";
7443 found = true;
7444 }
7445 }
7446 if (found) {
7447 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
7448 }
7449 return found;
7450}
7451
7452int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
7453 map<string,string> *erasure_code_profile_map,
7454 ostream *ss)
7455{
11fdf7f2
TL
7456 int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile",
7457 get_json_str_map,
7458 *ss,
7459 erasure_code_profile_map,
7460 true);
7c673cae
FG
7461 if (r)
7462 return r;
11fdf7f2 7463 ceph_assert((*erasure_code_profile_map).count("plugin"));
7c673cae
FG
7464 string default_plugin = (*erasure_code_profile_map)["plugin"];
7465 map<string,string> user_map;
7466 for (vector<string>::const_iterator i = erasure_code_profile.begin();
7467 i != erasure_code_profile.end();
7468 ++i) {
7469 size_t equal = i->find('=');
7470 if (equal == string::npos) {
7471 user_map[*i] = string();
7472 (*erasure_code_profile_map)[*i] = string();
7473 } else {
11fdf7f2 7474 const string key = i->substr(0, equal);
7c673cae
FG
7475 equal++;
7476 const string value = i->substr(equal);
11fdf7f2
TL
7477 if (key.find("ruleset-") == 0) {
7478 *ss << "property '" << key << "' is no longer supported; try "
7479 << "'crush-" << key.substr(8) << "' instead";
7480 return -EINVAL;
3efd9988 7481 }
7c673cae
FG
7482 user_map[key] = value;
7483 (*erasure_code_profile_map)[key] = value;
7484 }
7485 }
7486
7487 if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
7488 (*erasure_code_profile_map) = user_map;
7489
7490 return 0;
7491}
7492
7493int OSDMonitor::prepare_pool_size(const unsigned pool_type,
7494 const string &erasure_code_profile,
11fdf7f2 7495 uint8_t repl_size,
7c673cae
FG
7496 unsigned *size, unsigned *min_size,
7497 ostream *ss)
7498{
7499 int err = 0;
7500 switch (pool_type) {
7501 case pg_pool_t::TYPE_REPLICATED:
11fdf7f2
TL
7502 if (repl_size == 0) {
7503 repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
7504 }
7505 *size = repl_size;
7506 *min_size = g_conf().get_osd_pool_default_min_size(repl_size);
7c673cae
FG
7507 break;
7508 case pg_pool_t::TYPE_ERASURE:
7509 {
7510 ErasureCodeInterfaceRef erasure_code;
7511 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
7512 if (err == 0) {
7513 *size = erasure_code->get_chunk_count();
11fdf7f2
TL
7514 *min_size =
7515 erasure_code->get_data_chunk_count() +
7516 std::min<int>(1, erasure_code->get_coding_chunk_count() - 1);
7517 assert(*min_size <= *size);
7518 assert(*min_size >= erasure_code->get_data_chunk_count());
7c673cae
FG
7519 }
7520 }
7521 break;
7522 default:
7523 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
7524 err = -EINVAL;
7525 break;
7526 }
7527 return err;
7528}
7529
/**
 * Compute the stripe width for a new pool.
 *
 * Replicated pools do not stripe, so the output is left untouched for
 * them.  For erasure pools the width is data_chunks * chunk_size, where
 * the stripe unit comes from the profile's "stripe_unit" key if present,
 * else from the osd_pool_erasure_code_stripe_unit config option.
 *
 * @param pool_type pg_pool_t::TYPE_REPLICATED or pg_pool_t::TYPE_ERASURE
 * @param erasure_code_profile profile name registered in the osdmap
 * @param stripe_width [out] computed stripe width (erasure pools only)
 * @param ss human readable error message, if any
 *
 * @return 0 on success, negative errno on failure
 */
int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
					  const string &erasure_code_profile,
					  uint32_t *stripe_width,
					  ostream *ss)
{
  int err = 0;
  switch (pool_type) {
  case pg_pool_t::TYPE_REPLICATED:
    // ignored
    break;
  case pg_pool_t::TYPE_ERASURE:
    {
      ErasureCodeProfile profile =
	osdmap.get_erasure_code_profile(erasure_code_profile);
      ErasureCodeInterfaceRef erasure_code;
      err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
      if (err)
	break;
      uint32_t data_chunks = erasure_code->get_data_chunk_count();
      // config default; may be overridden by the profile below
      uint32_t stripe_unit = g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit");
      auto it = profile.find("stripe_unit");
      if (it != profile.end()) {
	string err_str;
	// profile value was validated when stored, so parsing must succeed
	stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
	ceph_assert(err_str.empty());
      }
      // let the plugin round the requested stripe up to its alignment
      *stripe_width = data_chunks *
	erasure_code->get_chunk_size(stripe_unit * data_chunks);
    }
    break;
  default:
    *ss << "prepare_pool_stripe_width: "
	<< pool_type << " is not a known pool type";
    err = -EINVAL;
    break;
  }
  return err;
}
7568
/**
 * Resolve or create the CRUSH rule for a new pool.
 *
 * If *crush_rule >= 0 on entry, it is only validated against the current
 * crush map.  Otherwise a rule is picked (replicated) or created
 * (erasure); creation returns -EAGAIN so the caller retries once the
 * new rule has been proposed and committed.
 *
 * @param pool_type pg_pool_t::TYPE_REPLICATED or pg_pool_t::TYPE_ERASURE
 * @param erasure_code_profile profile used when creating an erasure rule
 * @param rule_name rule to look up; "" selects the configured default
 *        (replicated pools only)
 * @param crush_rule [in/out] rule id; <0 on entry requests resolution
 * @param ss human readable error message, if any
 *
 * @return 0 on success, -EAGAIN to retry after proposal,
 *         negative errno on failure
 */
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  // Use default rule
	  *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(cct);
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
		<< "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	// NOTE: -EALREADY (pending) and 0 (just created) both map to
	// -EAGAIN: the rule is not committed yet, caller must retry.
	// -EEXIST means the rule is already committed and usable.
	switch (err) {
	case -EALREADY:
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
      break;
    }
  } else {
    // caller supplied an explicit rule id; just validate it
    if (!osdmap.crush->ruleset_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
7630
31f18b77 7631int OSDMonitor::get_crush_rule(const string &rule_name,
224ce89b
WB
7632 int *crush_rule,
7633 ostream *ss)
7c673cae
FG
7634{
7635 int ret;
31f18b77 7636 ret = osdmap.crush->get_rule_id(rule_name);
7c673cae
FG
7637 if (ret != -ENOENT) {
7638 // found it, use it
31f18b77 7639 *crush_rule = ret;
7c673cae
FG
7640 } else {
7641 CrushWrapper newcrush;
7642 _get_pending_crush(newcrush);
7643
31f18b77 7644 ret = newcrush.get_rule_id(rule_name);
7c673cae
FG
7645 if (ret != -ENOENT) {
7646 // found it, wait for it to be proposed
31f18b77 7647 dout(20) << __func__ << ": rule " << rule_name
7c673cae
FG
7648 << " try again" << dendl;
7649 return -EAGAIN;
7650 } else {
224ce89b 7651 // Cannot find it , return error
31f18b77 7652 *ss << "specified rule " << rule_name << " doesn't exist";
7c673cae
FG
7653 return ret;
7654 }
7655 }
7656 return 0;
7657}
7658
3efd9988
FG
7659int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, ostream *ss)
7660{
11fdf7f2 7661 auto max_pgs_per_osd = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
3efd9988
FG
7662 auto num_osds = std::max(osdmap.get_num_in_osds(), 3u); // assume min cluster size 3
7663 auto max_pgs = max_pgs_per_osd * num_osds;
7664 uint64_t projected = 0;
7665 if (pool < 0) {
7666 projected += pg_num * size;
7667 }
7668 for (const auto& i : osdmap.get_pools()) {
7669 if (i.first == pool) {
7670 projected += pg_num * size;
7671 } else {
11fdf7f2 7672 projected += i.second.get_pg_num_target() * i.second.get_size();
3efd9988
FG
7673 }
7674 }
7675 if (projected > max_pgs) {
7676 if (pool >= 0) {
7677 *ss << "pool id " << pool;
7678 }
7679 *ss << " pg_num " << pg_num << " size " << size
7680 << " would mean " << projected
7681 << " total pgs, which exceeds max " << max_pgs
7682 << " (mon_max_pg_per_osd " << max_pgs_per_osd
7683 << " * num_in_osds " << num_osds << ")";
7684 return -ERANGE;
7685 }
7686 return 0;
7687}
7688
7c673cae
FG
/**
 * Prepare the creation of a new pool in the pending osdmap increment.
 *
 * @param name The name of the new pool
 * @param crush_rule The crush rule to use. If <0, will use the system default
 * @param crush_rule_name The crush rule to use, if crush_rule <0
 * @param pg_num The pg_num to use. If set to 0, will use the system default
 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
 * @param pg_num_min If nonzero, stored as the pool's PG_NUM_MIN option
 *        (nautilus+ only)
 * @param repl_size Replication factor, or 0 for default
 * @param target_size_bytes If nonzero, stored as TARGET_SIZE_BYTES option
 *        (nautilus+ only)
 * @param target_size_ratio If >0, stored as TARGET_SIZE_RATIO option
 *        (nautilus+ only)
 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
 * @param pool_type TYPE_ERASURE, or TYPE_REP
 * @param expected_num_objects expected number of objects on the pool
 * @param fast_read fast read type.
 * @param pg_autoscale_mode autoscale mode name; overrides the config
 *        default when it parses to a known mode
 * @param ss human readable error message, if any.
 *
 * @return 0 on success, negative errno on failure.
 */
int OSDMonitor::prepare_new_pool(string& name,
				 int crush_rule,
				 const string &crush_rule_name,
				 unsigned pg_num, unsigned pgp_num,
				 unsigned pg_num_min,
				 const uint64_t repl_size,
				 const uint64_t target_size_bytes,
				 const float target_size_ratio,
				 const string &erasure_code_profile,
				 const unsigned pool_type,
				 const uint64_t expected_num_objects,
				 FastReadType fast_read,
				 const string& pg_autoscale_mode,
				 ostream *ss)
{
  // --- argument validation and defaulting ---
  if (name.length() == 0)
    return -EINVAL;
  if (pg_num == 0)
    pg_num = g_conf().get_val<uint64_t>("osd_pool_default_pg_num");
  if (pgp_num == 0)
    pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num");
  if (!pgp_num)
    pgp_num = pg_num;
  if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
    *ss << "'pg_num' must be greater than 0 and less than or equal to "
        << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
        << " (you may adjust 'mon max pool pg num' for higher values)";
    return -ERANGE;
  }
  if (pgp_num > pg_num) {
    *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
	<< ", which in this case is " << pg_num;
    return -ERANGE;
  }
  if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
    *ss << "'fast_read' can only apply to erasure coding pool";
    return -EINVAL;
  }

  // --- resolve the crush rule (may return -EAGAIN while it is proposed) ---
  int r;
  r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
			      crush_rule_name, &crush_rule, ss);
  if (r) {
    dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
    return r;
  }
  // optional sanity check: map a few sample inputs through the rule in a
  // forked child, bounded by the mon_lease timeout
  if (g_conf()->mon_osd_crush_smoke_test) {
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    ostringstream err;
    CrushTester tester(newcrush, err);
    tester.set_min_x(0);
    tester.set_max_x(50);
    tester.set_rule(crush_rule);
    auto start = ceph::coarse_mono_clock::now();
    r = tester.test_with_fork(g_conf()->mon_lease);
    auto duration = ceph::coarse_mono_clock::now() - start;
    if (r < 0) {
      dout(10) << "tester.test_with_fork returns " << r
	       << ": " << err.str() << dendl;
      *ss << "crush test failed with " << r << ": " << err.str();
      return r;
    }
    dout(10) << __func__ << " crush smoke test duration: "
             << duration << dendl;
  }

  // --- derive size/min_size and verify cluster-wide pg limits ---
  unsigned size, min_size;
  r = prepare_pool_size(pool_type, erasure_code_profile, repl_size,
                        &size, &min_size, ss);
  if (r) {
    dout(10) << "prepare_pool_size returns " << r << dendl;
    return r;
  }
  r = check_pg_num(-1, pg_num, size, ss);
  if (r) {
    dout(10) << "check_pg_num returns " << r << dendl;
    return r;
  }

  if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) {
    return -EINVAL;
  }

  uint32_t stripe_width = 0;
  r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
  if (r) {
    dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
    return r;
  }

  // --- resolve the effective fast_read setting (EC pools only) ---
  bool fread = false;
  if (pool_type == pg_pool_t::TYPE_ERASURE) {
    switch (fast_read) {
      case FAST_READ_OFF:
	fread = false;
	break;
      case FAST_READ_ON:
	fread = true;
	break;
      case FAST_READ_DEFAULT:
	fread = g_conf()->osd_pool_default_ec_fast_read;
	break;
      default:
	*ss << "invalid fast_read setting: " << fast_read;
	return -EINVAL;
    }
  }

  // idempotence: a pending pool with this name means the create already
  // happened in this proposal round
  for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
       p != pending_inc.new_pool_names.end();
       ++p) {
    if (p->second == name)
      return 0;
  }

  // --- allocate the pool id and populate the pending pg_pool_t ---
  if (-1 == pending_inc.new_pool_max)
    pending_inc.new_pool_max = osdmap.pool_max;
  int64_t pool = ++pending_inc.new_pool_max;
  pg_pool_t empty;
  pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
  pi->create_time = ceph_clock_now();
  pi->type = pool_type;
  pi->fast_read = fread;
  pi->flags = g_conf()->osd_pool_default_flags;
  if (g_conf()->osd_pool_default_flag_hashpspool)
    pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
  if (g_conf()->osd_pool_default_flag_nodelete)
    pi->set_flag(pg_pool_t::FLAG_NODELETE);
  if (g_conf()->osd_pool_default_flag_nopgchange)
    pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
  if (g_conf()->osd_pool_default_flag_nosizechange)
    pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
  pi->set_flag(pg_pool_t::FLAG_CREATING);
  if (g_conf()->osd_pool_use_gmt_hitset)
    pi->use_gmt_hitset = true;
  else
    pi->use_gmt_hitset = false;

  pi->size = size;
  pi->min_size = min_size;
  pi->crush_rule = crush_rule;
  pi->expected_num_objects = expected_num_objects;
  pi->object_hash = CEPH_STR_HASH_RJENKINS;

  // autoscale mode: config default first, OFF if that fails to parse...
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
        g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode"));
      m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  } else {
    pi->pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
  }
  // start with a bounded number of initial PGs; the target keeps the
  // requested pg_num and the autoscaler/mgr grows toward it later
  auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs");
  pi->set_pg_num(
    max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max))
    : pg_num);
  pi->set_pg_num_pending(pi->get_pg_num());
  pi->set_pg_num_target(pg_num);
  pi->set_pgp_num(pi->get_pg_num());
  pi->set_pgp_num_target(pgp_num);
  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      pg_num_min) {
    pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min));
  }
  // ...then the caller-supplied mode overrides the default if it parses
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
        pg_autoscale_mode); m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  }

  pi->last_change = pending_inc.epoch;
  pi->auid = 0;

  if (pool_type == pg_pool_t::TYPE_ERASURE) {
      pi->erasure_code_profile = erasure_code_profile;
  } else {
      pi->erasure_code_profile = "";
  }
  pi->stripe_width = stripe_width;

  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      target_size_bytes) {
    // only store for nautilus+ because TARGET_SIZE_BYTES may be
    // larger than int32_t max.
    pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes));
  }
  if (target_size_ratio > 0.0 &&
      osdmap.require_osd_release >= ceph_release_t::nautilus) {
    // only store for nautilus+, just to be consistent and tidy.
    pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio);
  }

  // cache-tier defaults (ratios stored in micro-units)
  pi->cache_target_dirty_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000;
  pi->cache_target_dirty_high_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
  pi->cache_target_full_ratio_micro =
    g_conf()->osd_pool_default_cache_target_full_ratio * 1000000;
  pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
  pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;

  pending_inc.new_pool_names[pool] = name;
  return 0;
}
7905
7906bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
7907{
7908 op->mark_osdmon_event(__func__);
7909 ostringstream ss;
7910 if (pending_inc.new_flags < 0)
7911 pending_inc.new_flags = osdmap.get_flags();
7912 pending_inc.new_flags |= flag;
7913 ss << OSDMap::get_flag_string(flag) << " is set";
7914 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7915 get_last_committed() + 1));
7916 return true;
7917}
7918
7919bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
7920{
7921 op->mark_osdmon_event(__func__);
7922 ostringstream ss;
7923 if (pending_inc.new_flags < 0)
7924 pending_inc.new_flags = osdmap.get_flags();
7925 pending_inc.new_flags &= ~flag;
7926 ss << OSDMap::get_flag_string(flag) << " is unset";
7927 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7928 get_last_committed() + 1));
7929 return true;
7930}
7931
11fdf7f2 7932int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
7c673cae
FG
7933 stringstream& ss)
7934{
7935 string poolstr;
9f95a23c 7936 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
7937 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
7938 if (pool < 0) {
7939 ss << "unrecognized pool '" << poolstr << "'";
7940 return -ENOENT;
7941 }
7942 string var;
9f95a23c 7943 cmd_getval(cmdmap, "var", var);
7c673cae
FG
7944
7945 pg_pool_t p = *osdmap.get_pg_pool(pool);
7946 if (pending_inc.new_pools.count(pool))
7947 p = pending_inc.new_pools[pool];
7948
7949 // accept val as a json string in the normal case (current
7950 // generation monitor). parse out int or float values from the
7951 // string as needed. however, if it is not a string, try to pull
7952 // out an int, in case an older monitor with an older json schema is
7953 // forwarding a request.
7954 string val;
7955 string interr, floaterr;
7956 int64_t n = 0;
7957 double f = 0;
7958 int64_t uf = 0; // micro-f
9f95a23c 7959 cmd_getval(cmdmap, "val", val);
f64942e4 7960
9f95a23c
TL
7961 auto si_options = {
7962 "target_max_objects"
7963 };
7964 auto iec_options = {
7965 "target_max_bytes",
7966 "target_size_bytes",
7967 "compression_max_blob_size",
7968 "compression_min_blob_size",
7969 "csum_max_block",
7970 "csum_min_block",
7971 };
7972 if (count(begin(si_options), end(si_options), var)) {
92f5a8d4 7973 n = strict_si_cast<int64_t>(val.c_str(), &interr);
9f95a23c 7974 } else if (count(begin(iec_options), end(iec_options), var)) {
92f5a8d4
TL
7975 n = strict_iec_cast<int64_t>(val.c_str(), &interr);
7976 } else {
7977 // parse string as both int and float; different fields use different types.
7978 n = strict_strtoll(val.c_str(), 10, &interr);
7979 f = strict_strtod(val.c_str(), &floaterr);
7980 uf = llrintl(f * (double)1000000.0);
7981 }
7c673cae
FG
7982
7983 if (!p.is_tier() &&
7984 (var == "hit_set_type" || var == "hit_set_period" ||
7985 var == "hit_set_count" || var == "hit_set_fpp" ||
7986 var == "target_max_objects" || var == "target_max_bytes" ||
7987 var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
7988 var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
7989 var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
7990 var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
7991 var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
7992 return -EACCES;
7993 }
7994
7995 if (var == "size") {
7996 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
7997 ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
7998 return -EPERM;
7999 }
8000 if (p.type == pg_pool_t::TYPE_ERASURE) {
8001 ss << "can not change the size of an erasure-coded pool";
8002 return -ENOTSUP;
8003 }
8004 if (interr.length()) {
8005 ss << "error parsing integer value '" << val << "': " << interr;
8006 return -EINVAL;
8007 }
8008 if (n <= 0 || n > 10) {
8009 ss << "pool size must be between 1 and 10";
8010 return -EINVAL;
8011 }
eafe8130
TL
8012 if (!osdmap.crush->check_crush_rule(p.get_crush_rule(), p.type, n, ss)) {
8013 return -EINVAL;
8014 }
3efd9988
FG
8015 int r = check_pg_num(pool, p.get_pg_num(), n, &ss);
8016 if (r < 0) {
8017 return r;
8018 }
7c673cae 8019 p.size = n;
1911f103 8020 p.min_size = g_conf().get_osd_pool_default_min_size(p.size);
7c673cae
FG
8021 } else if (var == "min_size") {
8022 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
8023 ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
8024 return -EPERM;
8025 }
8026 if (interr.length()) {
8027 ss << "error parsing integer value '" << val << "': " << interr;
8028 return -EINVAL;
8029 }
8030
8031 if (p.type != pg_pool_t::TYPE_ERASURE) {
8032 if (n < 1 || n > p.size) {
494da23a 8033 ss << "pool min_size must be between 1 and size, which is set to " << (int)p.size;
7c673cae
FG
8034 return -EINVAL;
8035 }
8036 } else {
8037 ErasureCodeInterfaceRef erasure_code;
8038 int k;
8039 stringstream tmp;
8040 int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
8041 if (err == 0) {
8042 k = erasure_code->get_data_chunk_count();
8043 } else {
b32b8144 8044 ss << __func__ << " get_erasure_code failed: " << tmp.str();
7c673cae
FG
8045 return err;
8046 }
8047
8048 if (n < k || n > p.size) {
494da23a 8049 ss << "pool min_size must be between " << k << " and size, which is set to " << (int)p.size;
7c673cae
FG
8050 return -EINVAL;
8051 }
8052 }
8053 p.min_size = n;
11fdf7f2 8054 } else if (var == "pg_num_actual") {
7c673cae
FG
8055 if (interr.length()) {
8056 ss << "error parsing integer value '" << val << "': " << interr;
8057 return -EINVAL;
8058 }
11fdf7f2
TL
8059 if (n == (int)p.get_pg_num()) {
8060 return 0;
8061 }
8062 if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
8063 ss << "'pg_num' must be greater than 0 and less than or equal to "
8064 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
8065 << " (you may adjust 'mon max pool pg num' for higher values)";
8066 return -ERANGE;
8067 }
8068 if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
8069 ss << "cannot adjust pg_num while initial PGs are being created";
8070 return -EBUSY;
8071 }
8072 if (n > (int)p.get_pg_num()) {
8073 if (p.get_pg_num() != p.get_pg_num_pending()) {
8074 // force pre-nautilus clients to resend their ops, since they
8075 // don't understand pg_num_pending changes form a new interval
8076 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8077 }
8078 p.set_pg_num(n);
8079 } else {
9f95a23c 8080 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
11fdf7f2
TL
8081 ss << "nautilus OSDs are required to adjust pg_num_pending";
8082 return -EPERM;
8083 }
8084 if (n < (int)p.get_pgp_num()) {
8085 ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
8086 return -EINVAL;
8087 }
8088 if (n < (int)p.get_pg_num() - 1) {
8089 ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
8090 << ") - 1; only single pg decrease is currently supported";
8091 return -EINVAL;
8092 }
8093 p.set_pg_num_pending(n);
8094 // force pre-nautilus clients to resend their ops, since they
8095 // don't understand pg_num_pending changes form a new interval
8096 p.last_force_op_resend_prenautilus = pending_inc.epoch;
7c673cae 8097 }
11fdf7f2
TL
8098 // force pre-luminous clients to resend their ops, since they
8099 // don't understand that split PGs now form a new interval.
8100 p.last_force_op_resend_preluminous = pending_inc.epoch;
7c673cae
FG
8101 } else if (var == "pg_num") {
8102 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8103 ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8104 return -EPERM;
8105 }
8106 if (interr.length()) {
8107 ss << "error parsing integer value '" << val << "': " << interr;
8108 return -EINVAL;
8109 }
11fdf7f2 8110 if (n == (int)p.get_pg_num_target()) {
7c673cae
FG
8111 return 0;
8112 }
11fdf7f2
TL
8113 if (n <= 0 || static_cast<uint64_t>(n) >
8114 g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
c07f9fc5 8115 ss << "'pg_num' must be greater than 0 and less than or equal to "
11fdf7f2 8116 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
c07f9fc5
FG
8117 << " (you may adjust 'mon max pool pg num' for higher values)";
8118 return -ERANGE;
8119 }
11fdf7f2
TL
8120 if (n > (int)p.get_pg_num_target()) {
8121 int r = check_pg_num(pool, n, p.get_size(), &ss);
8122 if (r) {
8123 return r;
8124 }
8125 bool force = false;
9f95a23c 8126 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
11fdf7f2
TL
8127 if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
8128 ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
8129 return -EPERM;
8130 }
8131 } else {
9f95a23c 8132 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
11fdf7f2
TL
8133 ss << "nautilus OSDs are required to decrease pg_num";
8134 return -EPERM;
8135 }
7c673cae 8136 }
9f95a23c 8137 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
494da23a
TL
8138 // pre-nautilus osdmap format; increase pg_num directly
8139 assert(n > (int)p.get_pg_num());
8140 // force pre-nautilus clients to resend their ops, since they
8141 // don't understand pg_num_target changes form a new interval
8142 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8143 // force pre-luminous clients to resend their ops, since they
8144 // don't understand that split PGs now form a new interval.
8145 p.last_force_op_resend_preluminous = pending_inc.epoch;
8146 p.set_pg_num(n);
8147 } else {
8148 // set targets; mgr will adjust pg_num_actual and pgp_num later.
8149 // make pgp_num track pg_num if it already matches. if it is set
8150 // differently, leave it different and let the user control it
8151 // manually.
8152 if (p.get_pg_num_target() == p.get_pgp_num_target()) {
8153 p.set_pgp_num_target(n);
8154 }
8155 p.set_pg_num_target(n);
7c673cae 8156 }
11fdf7f2 8157 } else if (var == "pgp_num_actual") {
7c673cae
FG
8158 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8159 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8160 return -EPERM;
8161 }
8162 if (interr.length()) {
8163 ss << "error parsing integer value '" << val << "': " << interr;
8164 return -EINVAL;
8165 }
8166 if (n <= 0) {
8167 ss << "specified pgp_num must > 0, but you set to " << n;
8168 return -EINVAL;
8169 }
8170 if (n > (int)p.get_pg_num()) {
8171 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
8172 return -EINVAL;
8173 }
11fdf7f2
TL
8174 if (n > (int)p.get_pg_num_pending()) {
8175 ss << "specified pgp_num " << n
8176 << " > pg_num_pending " << p.get_pg_num_pending();
8177 return -EINVAL;
8178 }
7c673cae 8179 p.set_pgp_num(n);
11fdf7f2
TL
8180 } else if (var == "pgp_num") {
8181 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8182 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8183 return -EPERM;
8184 }
8185 if (interr.length()) {
8186 ss << "error parsing integer value '" << val << "': " << interr;
8187 return -EINVAL;
8188 }
8189 if (n <= 0) {
8190 ss << "specified pgp_num must > 0, but you set to " << n;
8191 return -EINVAL;
8192 }
8193 if (n > (int)p.get_pg_num_target()) {
8194 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target();
8195 return -EINVAL;
8196 }
9f95a23c 8197 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
494da23a
TL
8198 // pre-nautilus osdmap format; increase pgp_num directly
8199 p.set_pgp_num(n);
8200 } else {
8201 p.set_pgp_num_target(n);
8202 }
11fdf7f2 8203 } else if (var == "pg_autoscale_mode") {
9f95a23c
TL
8204 auto m = pg_pool_t::get_pg_autoscale_mode_by_name(val);
8205 if (m == pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
11fdf7f2
TL
8206 ss << "specified invalid mode " << val;
8207 return -EINVAL;
8208 }
9f95a23c 8209 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
494da23a
TL
8210 ss << "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
8211 return -EINVAL;
8212 }
9f95a23c 8213 p.pg_autoscale_mode = m;
7c673cae
FG
8214 } else if (var == "crush_rule") {
8215 int id = osdmap.crush->get_rule_id(val);
8216 if (id == -ENOENT) {
8217 ss << "crush rule " << val << " does not exist";
8218 return -ENOENT;
8219 }
8220 if (id < 0) {
8221 ss << cpp_strerror(id);
8222 return -ENOENT;
8223 }
8224 if (!osdmap.crush->check_crush_rule(id, p.get_type(), p.get_size(), ss)) {
8225 return -EINVAL;
8226 }
31f18b77 8227 p.crush_rule = id;
7c673cae
FG
8228 } else if (var == "nodelete" || var == "nopgchange" ||
8229 var == "nosizechange" || var == "write_fadvise_dontneed" ||
8230 var == "noscrub" || var == "nodeep-scrub") {
8231 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8232 // make sure we only compare against 'n' if we didn't receive a string
8233 if (val == "true" || (interr.empty() && n == 1)) {
8234 p.set_flag(flag);
8235 } else if (val == "false" || (interr.empty() && n == 0)) {
8236 p.unset_flag(flag);
8237 } else {
8238 ss << "expecting value 'true', 'false', '0', or '1'";
8239 return -EINVAL;
8240 }
8241 } else if (var == "hashpspool") {
8242 uint64_t flag = pg_pool_t::get_flag_by_name(var);
11fdf7f2 8243 bool force = false;
9f95a23c 8244 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
11fdf7f2
TL
8245
8246 if (!force) {
7c673cae
FG
8247 ss << "are you SURE? this will remap all placement groups in this pool,"
8248 " this triggers large data movement,"
8249 " pass --yes-i-really-mean-it if you really do.";
8250 return -EPERM;
8251 }
8252 // make sure we only compare against 'n' if we didn't receive a string
8253 if (val == "true" || (interr.empty() && n == 1)) {
8254 p.set_flag(flag);
8255 } else if (val == "false" || (interr.empty() && n == 0)) {
8256 p.unset_flag(flag);
8257 } else {
8258 ss << "expecting value 'true', 'false', '0', or '1'";
8259 return -EINVAL;
8260 }
8261 } else if (var == "hit_set_type") {
8262 if (val == "none")
8263 p.hit_set_params = HitSet::Params();
8264 else {
8265 int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
8266 if (err)
8267 return err;
8268 if (val == "bloom") {
8269 BloomHitSet::Params *bsp = new BloomHitSet::Params;
11fdf7f2 8270 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
7c673cae
FG
8271 p.hit_set_params = HitSet::Params(bsp);
8272 } else if (val == "explicit_hash")
8273 p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
8274 else if (val == "explicit_object")
8275 p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
8276 else {
8277 ss << "unrecognized hit_set type '" << val << "'";
8278 return -EINVAL;
8279 }
8280 }
8281 } else if (var == "hit_set_period") {
8282 if (interr.length()) {
8283 ss << "error parsing integer value '" << val << "': " << interr;
8284 return -EINVAL;
11fdf7f2
TL
8285 } else if (n < 0) {
8286 ss << "hit_set_period should be non-negative";
8287 return -EINVAL;
7c673cae
FG
8288 }
8289 p.hit_set_period = n;
8290 } else if (var == "hit_set_count") {
8291 if (interr.length()) {
8292 ss << "error parsing integer value '" << val << "': " << interr;
8293 return -EINVAL;
11fdf7f2
TL
8294 } else if (n < 0) {
8295 ss << "hit_set_count should be non-negative";
8296 return -EINVAL;
7c673cae
FG
8297 }
8298 p.hit_set_count = n;
8299 } else if (var == "hit_set_fpp") {
8300 if (floaterr.length()) {
8301 ss << "error parsing floating point value '" << val << "': " << floaterr;
8302 return -EINVAL;
11fdf7f2
TL
8303 } else if (f < 0 || f > 1.0) {
8304 ss << "hit_set_fpp should be in the range 0..1";
8305 return -EINVAL;
7c673cae
FG
8306 }
8307 if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
8308 ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
8309 return -EINVAL;
8310 }
8311 BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
8312 bloomp->set_fpp(f);
8313 } else if (var == "use_gmt_hitset") {
8314 if (val == "true" || (interr.empty() && n == 1)) {
7c673cae
FG
8315 p.use_gmt_hitset = true;
8316 } else {
8317 ss << "expecting value 'true' or '1'";
8318 return -EINVAL;
8319 }
8320 } else if (var == "allow_ec_overwrites") {
8321 if (!p.is_erasure()) {
8322 ss << "ec overwrites can only be enabled for an erasure coded pool";
8323 return -EINVAL;
8324 }
224ce89b 8325 stringstream err;
11fdf7f2 8326 if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites &&
224ce89b
WB
8327 !is_pool_currently_all_bluestore(pool, p, &err)) {
8328 ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
8329 return -EINVAL;
8330 }
7c673cae
FG
8331 if (val == "true" || (interr.empty() && n == 1)) {
8332 p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
8333 } else if (val == "false" || (interr.empty() && n == 0)) {
8334 ss << "ec overwrites cannot be disabled once enabled";
8335 return -EINVAL;
8336 } else {
8337 ss << "expecting value 'true', 'false', '0', or '1'";
8338 return -EINVAL;
8339 }
7c673cae
FG
8340 } else if (var == "target_max_objects") {
8341 if (interr.length()) {
8342 ss << "error parsing int '" << val << "': " << interr;
8343 return -EINVAL;
8344 }
8345 p.target_max_objects = n;
8346 } else if (var == "target_max_bytes") {
8347 if (interr.length()) {
8348 ss << "error parsing int '" << val << "': " << interr;
8349 return -EINVAL;
8350 }
8351 p.target_max_bytes = n;
8352 } else if (var == "cache_target_dirty_ratio") {
8353 if (floaterr.length()) {
8354 ss << "error parsing float '" << val << "': " << floaterr;
8355 return -EINVAL;
8356 }
8357 if (f < 0 || f > 1.0) {
8358 ss << "value must be in the range 0..1";
8359 return -ERANGE;
8360 }
8361 p.cache_target_dirty_ratio_micro = uf;
8362 } else if (var == "cache_target_dirty_high_ratio") {
8363 if (floaterr.length()) {
8364 ss << "error parsing float '" << val << "': " << floaterr;
8365 return -EINVAL;
8366 }
8367 if (f < 0 || f > 1.0) {
8368 ss << "value must be in the range 0..1";
8369 return -ERANGE;
8370 }
8371 p.cache_target_dirty_high_ratio_micro = uf;
8372 } else if (var == "cache_target_full_ratio") {
8373 if (floaterr.length()) {
8374 ss << "error parsing float '" << val << "': " << floaterr;
8375 return -EINVAL;
8376 }
8377 if (f < 0 || f > 1.0) {
8378 ss << "value must be in the range 0..1";
8379 return -ERANGE;
8380 }
8381 p.cache_target_full_ratio_micro = uf;
8382 } else if (var == "cache_min_flush_age") {
8383 if (interr.length()) {
8384 ss << "error parsing int '" << val << "': " << interr;
8385 return -EINVAL;
8386 }
8387 p.cache_min_flush_age = n;
8388 } else if (var == "cache_min_evict_age") {
8389 if (interr.length()) {
8390 ss << "error parsing int '" << val << "': " << interr;
8391 return -EINVAL;
8392 }
8393 p.cache_min_evict_age = n;
8394 } else if (var == "min_read_recency_for_promote") {
8395 if (interr.length()) {
8396 ss << "error parsing integer value '" << val << "': " << interr;
8397 return -EINVAL;
8398 }
8399 p.min_read_recency_for_promote = n;
8400 } else if (var == "hit_set_grade_decay_rate") {
8401 if (interr.length()) {
8402 ss << "error parsing integer value '" << val << "': " << interr;
8403 return -EINVAL;
8404 }
8405 if (n > 100 || n < 0) {
8406 ss << "value out of range,valid range is 0 - 100";
8407 return -EINVAL;
8408 }
8409 p.hit_set_grade_decay_rate = n;
8410 } else if (var == "hit_set_search_last_n") {
8411 if (interr.length()) {
8412 ss << "error parsing integer value '" << val << "': " << interr;
8413 return -EINVAL;
8414 }
8415 if (n > p.hit_set_count || n < 0) {
8416 ss << "value out of range,valid range is 0 - hit_set_count";
8417 return -EINVAL;
8418 }
8419 p.hit_set_search_last_n = n;
8420 } else if (var == "min_write_recency_for_promote") {
8421 if (interr.length()) {
8422 ss << "error parsing integer value '" << val << "': " << interr;
8423 return -EINVAL;
8424 }
8425 p.min_write_recency_for_promote = n;
8426 } else if (var == "fast_read") {
8427 if (p.is_replicated()) {
8428 ss << "fast read is not supported in replication pool";
8429 return -EINVAL;
8430 }
8431 if (val == "true" || (interr.empty() && n == 1)) {
8432 p.fast_read = true;
8433 } else if (val == "false" || (interr.empty() && n == 0)) {
8434 p.fast_read = false;
8435 } else {
8436 ss << "expecting value 'true', 'false', '0', or '1'";
8437 return -EINVAL;
8438 }
8439 } else if (pool_opts_t::is_opt_name(var)) {
224ce89b 8440 bool unset = val == "unset";
7c673cae 8441 if (var == "compression_mode") {
224ce89b
WB
8442 if (!unset) {
8443 auto cmode = Compressor::get_comp_mode_type(val);
8444 if (!cmode) {
8445 ss << "unrecognized compression mode '" << val << "'";
8446 return -EINVAL;
8447 }
7c673cae
FG
8448 }
8449 } else if (var == "compression_algorithm") {
224ce89b
WB
8450 if (!unset) {
8451 auto alg = Compressor::get_comp_alg_type(val);
8452 if (!alg) {
8453 ss << "unrecognized compression_algorithm '" << val << "'";
8454 return -EINVAL;
8455 }
7c673cae
FG
8456 }
8457 } else if (var == "compression_required_ratio") {
8458 if (floaterr.length()) {
8459 ss << "error parsing float value '" << val << "': " << floaterr;
8460 return -EINVAL;
8461 }
224ce89b 8462 if (f < 0 || f > 1) {
7c673cae 8463 ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
224ce89b 8464 return -EINVAL;
7c673cae
FG
8465 }
8466 } else if (var == "csum_type") {
224ce89b 8467 auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
7c673cae
FG
8468 if (t < 0 ) {
8469 ss << "unrecognized csum_type '" << val << "'";
224ce89b 8470 return -EINVAL;
7c673cae
FG
8471 }
8472 //preserve csum_type numeric value
8473 n = t;
8474 interr.clear();
8475 } else if (var == "compression_max_blob_size" ||
8476 var == "compression_min_blob_size" ||
8477 var == "csum_max_block" ||
8478 var == "csum_min_block") {
8479 if (interr.length()) {
8480 ss << "error parsing int value '" << val << "': " << interr;
8481 return -EINVAL;
8482 }
11fdf7f2
TL
8483 } else if (var == "fingerprint_algorithm") {
8484 if (!unset) {
8485 auto alg = pg_pool_t::get_fingerprint_from_str(val);
8486 if (!alg) {
8487 ss << "unrecognized fingerprint_algorithm '" << val << "'";
8488 return -EINVAL;
8489 }
8490 }
92f5a8d4
TL
8491 } else if (var == "target_size_bytes") {
8492 if (interr.length()) {
8493 ss << "error parsing unit value '" << val << "': " << interr;
8494 return -EINVAL;
8495 }
9f95a23c 8496 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
92f5a8d4
TL
8497 ss << "must set require_osd_release to nautilus or "
8498 << "later before setting target_size_bytes";
8499 return -EINVAL;
8500 }
11fdf7f2
TL
8501 } else if (var == "pg_num_min") {
8502 if (interr.length()) {
8503 ss << "error parsing int value '" << val << "': " << interr;
8504 return -EINVAL;
8505 }
8506 if (n > (int)p.get_pg_num_target()) {
8507 ss << "specified pg_num_min " << n
8508 << " > pg_num " << p.get_pg_num_target();
8509 return -EINVAL;
8510 }
8511 } else if (var == "recovery_priority") {
8512 if (interr.length()) {
8513 ss << "error parsing int value '" << val << "': " << interr;
8514 return -EINVAL;
8515 }
81eedcae
TL
8516 if (!g_conf()->debug_allow_any_pool_priority) {
8517 if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
8518 ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
8519 << " and " << OSD_POOL_PRIORITY_MAX;
8520 return -EINVAL;
8521 }
11fdf7f2
TL
8522 }
8523 } else if (var == "pg_autoscale_bias") {
8524 if (f < 0.0 || f > 1000.0) {
8525 ss << "pg_autoscale_bias must be between 0 and 1000";
8526 return -EINVAL;
8527 }
7c673cae
FG
8528 }
8529
8530 pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
8531 switch (desc.type) {
8532 case pool_opts_t::STR:
224ce89b 8533 if (unset) {
7c673cae
FG
8534 p.opts.unset(desc.key);
8535 } else {
8536 p.opts.set(desc.key, static_cast<std::string>(val));
8537 }
8538 break;
8539 case pool_opts_t::INT:
8540 if (interr.length()) {
8541 ss << "error parsing integer value '" << val << "': " << interr;
8542 return -EINVAL;
8543 }
8544 if (n == 0) {
8545 p.opts.unset(desc.key);
8546 } else {
11fdf7f2 8547 p.opts.set(desc.key, static_cast<int64_t>(n));
7c673cae
FG
8548 }
8549 break;
8550 case pool_opts_t::DOUBLE:
8551 if (floaterr.length()) {
8552 ss << "error parsing floating point value '" << val << "': " << floaterr;
8553 return -EINVAL;
8554 }
8555 if (f == 0) {
8556 p.opts.unset(desc.key);
8557 } else {
8558 p.opts.set(desc.key, static_cast<double>(f));
8559 }
8560 break;
8561 default:
11fdf7f2 8562 ceph_assert(!"unknown type");
7c673cae
FG
8563 }
8564 } else {
8565 ss << "unrecognized variable '" << var << "'";
8566 return -EINVAL;
8567 }
224ce89b
WB
8568 if (val != "unset") {
8569 ss << "set pool " << pool << " " << var << " to " << val;
8570 } else {
8571 ss << "unset pool " << pool << " " << var;
8572 }
7c673cae
FG
8573 p.last_change = pending_inc.epoch;
8574 pending_inc.new_pools[pool] = p;
8575 return 0;
8576}
8577
c07f9fc5 8578int OSDMonitor::prepare_command_pool_application(const string &prefix,
11fdf7f2 8579 const cmdmap_t& cmdmap,
c07f9fc5 8580 stringstream& ss)
11fdf7f2
TL
8581{
8582 return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
8583}
8584
// Preprocess-phase entry point for "osd pool application ..." commands:
// delegates to _command_pool_application() with preparing=false, so it only
// validates the command and reports via *modified whether applying it would
// actually change the pool (vs. being a no-op).
int OSDMonitor::preprocess_command_pool_application(const string &prefix,
                                                    const cmdmap_t& cmdmap,
                                                    stringstream& ss,
                                                    bool *modified)
{
  return _command_pool_application(prefix, cmdmap, ss, modified, false);
}
8592
8593
8594/**
8595 * Common logic for preprocess and prepare phases of pool application
8596 * tag commands. In preprocess mode we're only detecting invalid
8597 * commands, and determining whether it was a modification or a no-op.
8598 * In prepare mode we're actually updating the pending state.
8599 */
8600int OSDMonitor::_command_pool_application(const string &prefix,
8601 const cmdmap_t& cmdmap,
8602 stringstream& ss,
8603 bool *modified,
8604 bool preparing)
c07f9fc5
FG
8605{
8606 string pool_name;
9f95a23c 8607 cmd_getval(cmdmap, "pool", pool_name);
c07f9fc5
FG
8608 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
8609 if (pool < 0) {
8610 ss << "unrecognized pool '" << pool_name << "'";
8611 return -ENOENT;
8612 }
8613
8614 pg_pool_t p = *osdmap.get_pg_pool(pool);
11fdf7f2
TL
8615 if (preparing) {
8616 if (pending_inc.new_pools.count(pool)) {
8617 p = pending_inc.new_pools[pool];
8618 }
c07f9fc5
FG
8619 }
8620
8621 string app;
9f95a23c 8622 cmd_getval(cmdmap, "app", app);
c07f9fc5
FG
8623 bool app_exists = (p.application_metadata.count(app) > 0);
8624
11fdf7f2 8625 string key;
9f95a23c 8626 cmd_getval(cmdmap, "key", key);
11fdf7f2
TL
8627 if (key == "all") {
8628 ss << "key cannot be 'all'";
8629 return -EINVAL;
8630 }
8631
8632 string value;
9f95a23c 8633 cmd_getval(cmdmap, "value", value);
11fdf7f2
TL
8634 if (value == "all") {
8635 ss << "value cannot be 'all'";
8636 return -EINVAL;
8637 }
8638
c07f9fc5
FG
8639 if (boost::algorithm::ends_with(prefix, "enable")) {
8640 if (app.empty()) {
8641 ss << "application name must be provided";
8642 return -EINVAL;
8643 }
8644
8645 if (p.is_tier()) {
8646 ss << "application must be enabled on base tier";
8647 return -EINVAL;
8648 }
8649
11fdf7f2 8650 bool force = false;
9f95a23c 8651 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
c07f9fc5 8652
11fdf7f2 8653 if (!app_exists && !p.application_metadata.empty() && !force) {
c07f9fc5
FG
8654 ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
8655 << "application; pass --yes-i-really-mean-it to proceed anyway";
8656 return -EPERM;
8657 }
8658
8659 if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
8660 ss << "too many enabled applications on pool '" << pool_name << "'; "
8661 << "max " << MAX_POOL_APPLICATIONS;
8662 return -EINVAL;
8663 }
8664
8665 if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
8666 ss << "application name '" << app << "' too long; max length "
8667 << MAX_POOL_APPLICATION_LENGTH;
8668 return -EINVAL;
8669 }
8670
8671 if (!app_exists) {
8672 p.application_metadata[app] = {};
8673 }
8674 ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
8675
8676 } else if (boost::algorithm::ends_with(prefix, "disable")) {
11fdf7f2 8677 bool force = false;
9f95a23c 8678 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
c07f9fc5 8679
11fdf7f2 8680 if (!force) {
c07f9fc5
FG
8681 ss << "Are you SURE? Disabling an application within a pool might result "
8682 << "in loss of application functionality; pass "
8683 << "--yes-i-really-mean-it to proceed anyway";
8684 return -EPERM;
8685 }
8686
8687 if (!app_exists) {
8688 ss << "application '" << app << "' is not enabled on pool '" << pool_name
8689 << "'";
8690 return 0; // idempotent
8691 }
8692
8693 p.application_metadata.erase(app);
8694 ss << "disable application '" << app << "' on pool '" << pool_name << "'";
8695
8696 } else if (boost::algorithm::ends_with(prefix, "set")) {
8697 if (p.is_tier()) {
8698 ss << "application metadata must be set on base tier";
8699 return -EINVAL;
8700 }
8701
8702 if (!app_exists) {
8703 ss << "application '" << app << "' is not enabled on pool '" << pool_name
8704 << "'";
8705 return -ENOENT;
8706 }
8707
8708 string key;
9f95a23c 8709 cmd_getval(cmdmap, "key", key);
c07f9fc5
FG
8710
8711 if (key.empty()) {
8712 ss << "key must be provided";
8713 return -EINVAL;
8714 }
8715
8716 auto &app_keys = p.application_metadata[app];
8717 if (app_keys.count(key) == 0 &&
8718 app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
8719 ss << "too many keys set for application '" << app << "' on pool '"
8720 << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
8721 return -EINVAL;
8722 }
8723
8724 if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
8725 ss << "key '" << app << "' too long; max length "
8726 << MAX_POOL_APPLICATION_LENGTH;
8727 return -EINVAL;
8728 }
8729
8730 string value;
9f95a23c 8731 cmd_getval(cmdmap, "value", value);
c07f9fc5
FG
8732 if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
8733 ss << "value '" << value << "' too long; max length "
8734 << MAX_POOL_APPLICATION_LENGTH;
8735 return -EINVAL;
8736 }
8737
8738 p.application_metadata[app][key] = value;
8739 ss << "set application '" << app << "' key '" << key << "' to '"
8740 << value << "' on pool '" << pool_name << "'";
8741 } else if (boost::algorithm::ends_with(prefix, "rm")) {
8742 if (!app_exists) {
8743 ss << "application '" << app << "' is not enabled on pool '" << pool_name
8744 << "'";
8745 return -ENOENT;
8746 }
8747
8748 string key;
9f95a23c 8749 cmd_getval(cmdmap, "key", key);
c07f9fc5
FG
8750 auto it = p.application_metadata[app].find(key);
8751 if (it == p.application_metadata[app].end()) {
8752 ss << "application '" << app << "' on pool '" << pool_name
8753 << "' does not have key '" << key << "'";
8754 return 0; // idempotent
8755 }
8756
8757 p.application_metadata[app].erase(it);
8758 ss << "removed application '" << app << "' key '" << key << "' on pool '"
8759 << pool_name << "'";
8760 } else {
11fdf7f2
TL
8761 ceph_abort();
8762 }
8763
8764 if (preparing) {
8765 p.last_change = pending_inc.epoch;
8766 pending_inc.new_pools[pool] = p;
8767 }
8768
8769 // Because we fell through this far, we didn't hit no-op cases,
8770 // so pool was definitely modified
8771 if (modified != nullptr) {
8772 *modified = true;
c07f9fc5
FG
8773 }
8774
c07f9fc5
FG
8775 return 0;
8776}
8777
31f18b77
FG
8778int OSDMonitor::_prepare_command_osd_crush_remove(
8779 CrushWrapper &newcrush,
8780 int32_t id,
8781 int32_t ancestor,
8782 bool has_ancestor,
8783 bool unlink_only)
8784{
8785 int err = 0;
8786
8787 if (has_ancestor) {
11fdf7f2 8788 err = newcrush.remove_item_under(cct, id, ancestor,
31f18b77
FG
8789 unlink_only);
8790 } else {
11fdf7f2 8791 err = newcrush.remove_item(cct, id, unlink_only);
31f18b77
FG
8792 }
8793 return err;
8794}
8795
// Commit a modified CRUSH map into the pending OSDMap increment by
// re-encoding it with the feature bits shared by the current quorum.
void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
{
  pending_inc.crush.clear();
  newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
}
8801
8802int OSDMonitor::prepare_command_osd_crush_remove(
8803 CrushWrapper &newcrush,
8804 int32_t id,
8805 int32_t ancestor,
8806 bool has_ancestor,
8807 bool unlink_only)
8808{
8809 int err = _prepare_command_osd_crush_remove(
8810 newcrush, id, ancestor,
8811 has_ancestor, unlink_only);
8812
8813 if (err < 0)
8814 return err;
8815
11fdf7f2 8816 ceph_assert(err == 0);
31f18b77
FG
8817 do_osd_crush_remove(newcrush);
8818
8819 return 0;
8820}
8821
// Queue removal of an OSD from the map.  Refuses (-EBUSY) while the OSD is
// still up.  Writing the OSD's current state into pending_inc.new_state
// clears it on apply (new_state is an XOR of state bits -- see how it is
// consumed by OSDMap::Incremental; TODO confirm), the uuid is blanked, and
// any stored metadata is scheduled for deletion.
int OSDMonitor::prepare_command_osd_remove(int32_t id)
{
  if (osdmap.is_up(id)) {
    return -EBUSY;
  }

  pending_inc.new_state[id] = osdmap.get_state(id);
  pending_inc.new_uuid[id] = uuid_d();
  pending_metadata_rm.insert(id);
  pending_metadata.erase(id);

  return 0;
}
8835
8836int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
8837{
11fdf7f2 8838 ceph_assert(existing_id);
31f18b77
FG
8839 *existing_id = -1;
8840
8841 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
8842 if (!osdmap.exists(i) &&
8843 pending_inc.new_up_client.count(i) == 0 &&
8844 (pending_inc.new_state.count(i) == 0 ||
8845 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
8846 *existing_id = i;
8847 return -1;
8848 }
8849 }
8850
8851 if (pending_inc.new_max_osd < 0) {
8852 return osdmap.get_max_osd();
8853 }
8854 return pending_inc.new_max_osd;
8855}
8856
// Record creation of an OSD in the pending increment.
//
// Resolution order: if `uuid` already maps to an osd, reuse that id (and it
// must agree with `id` when both are given); else honor an explicit `id`;
// else allocate one via _allocate_osd_id().  The chosen id is returned via
// *new_id.  If a device_class was requested, the pending CRUSH map is updated
// as well.  Validation is assumed to have happened already (see
// validate_osd_create()); violations assert.
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const string& device_class,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  ceph_assert(new_id);

  // We presume validation has been performed prior to calling this
  // function. We assert with prejudice.

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      // uuid already present: this is a replay/idempotent create
      ceph_assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
           << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    // recycled a hole below max_osd; mark it OUT so it starts unweighted
    ceph_assert(existing_id < osdmap.get_max_osd());
    ceph_assert(allocated_id < 0);
    pending_inc.new_weight[existing_id] = CEPH_OSD_OUT;
    *new_id = existing_id;
  } else if (allocated_id >= 0) {
    ceph_assert(existing_id < 0);
    // raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    ceph_assert(*new_id == allocated_id);
  } else {
    ceph_abort_msg("unexpected condition");
  }

out:
  if (device_class.size()) {
    // pre-register the osd in the pending CRUSH map under the requested
    // device class so it lands in the right shadow hierarchy on boot.
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (newcrush.get_max_devices() < *new_id + 1) {
      newcrush.set_max_devices(*new_id + 1);
    }
    string name = string("osd.") + stringify(*new_id);
    if (!newcrush.item_exists(*new_id)) {
      newcrush.set_item_name(*new_id, name);
    }
    ostringstream ss;
    int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
    if (r < 0) {
      derr << __func__ << " failed to set " << name << " device_class "
           << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
           << dendl;
      // non-fatal... this might be a replay and we want to be idempotent.
    } else {
      dout(20) << __func__ << " set " << name << " device_class " << device_class
               << dendl;
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
  } else {
    dout(20) << __func__ << " no device_class" << dendl;
  }

  dout(10) << __func__ << " using id " << *new_id << dendl;
  // make sure max_osd covers the chosen id even on the goto paths above
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  pending_inc.new_state[*new_id] |= CEPH_OSD_EXISTS | CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
8945
// Validate an `osd create` / `osd new` request against the committed map and
// the pending increment.
//
// Return values:
//   0        -- ok to create (no matching uuid, or nothing to validate)
//   EEXIST   -- POSITIVE errno: osd with this uuid already exists and matches
//               `id`; idempotent replay, *existing_id is filled in
//   -EAGAIN  -- an osd with this uuid/id is being created in the pending
//               increment; caller should retry after it commits
//   -EEXIST  -- uuid is already bound to a different id
//   -EINVAL  -- id is in use by an osd with a different uuid
int OSDMonitor::validate_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const bool check_osd_exists,
    int32_t* existing_id,
    stringstream& ss)
{

  dout(10) << __func__ << " id " << id << " uuid " << uuid
           << " check_osd_exists " << check_osd_exists << dendl;

  ceph_assert(existing_id);

  if (id < 0 && uuid.is_zero()) {
    // we have nothing to validate
    *existing_id = -1;
    return 0;
  } else if (uuid.is_zero()) {
    // we have an id but we will ignore it - because that's what
    // `osd create` does.
    return 0;
  }

  /*
   * This function will be used to validate whether we are able to
   * create a new osd when the `uuid` is specified.
   *
   * It will be used by both `osd create` and `osd new`, as the checks
   * are basically the same when it pertains to osd id and uuid validation.
   * However, `osd create` presumes an `uuid` is optional, for legacy
   * reasons, while `osd new` requires the `uuid` to be provided. This
   * means that `osd create` will not be idempotent if an `uuid` is not
   * provided, but we will always guarantee the idempotency of `osd new`.
   */

  ceph_assert(!uuid.is_zero());
  if (pending_inc.identify_osd(uuid) >= 0) {
    // osd is about to exist
    return -EAGAIN;
  }

  int32_t i = osdmap.identify_osd(uuid);
  if (i >= 0) {
    // osd already exists
    if (id >= 0 && i != id) {
      ss << "uuid " << uuid << " already in use for different id " << i;
      return -EEXIST;
    }
    // return a positive errno to distinguish between a blocking error
    // and an error we consider to not be a problem (i.e., this would be
    // an idempotent operation).
    *existing_id = i;
    return EEXIST;
  }
  // i < 0
  if (id >= 0) {
    if (pending_inc.new_state.count(id)) {
      // osd is about to exist
      return -EAGAIN;
    }
    // we may not care if an osd exists if we are recreating a previously
    // destroyed osd.
    if (check_osd_exists && osdmap.exists(id)) {
      ss << "id " << id << " already in use and does not match uuid "
         << uuid;
      return -EINVAL;
    }
  }
  return 0;
}
9016
// Validation front-end for the legacy `osd create` command.  Recreating a
// destroyed osd is only supported via `osd new`, so a destroyed id is
// rejected outright; otherwise defers to validate_osd_create() with
// check_osd_exists=true (same return-value convention, including positive
// EEXIST for an idempotent replay).
int OSDMonitor::prepare_command_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    int32_t* existing_id,
    stringstream& ss)
{
  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
  ceph_assert(existing_id);
  if (osdmap.is_destroyed(id)) {
    ss << "ceph osd create has been deprecated. Please use ceph osd new "
       "instead.";
    return -EINVAL;
  }

  if (uuid.is_zero()) {
    dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
  }

  return validate_osd_create(id, uuid, true, existing_id, ss);
}
9037
// Handle `osd new`: create a brand-new osd or recreate a previously
// destroyed one, optionally registering cephx/lockbox secrets and a dm-crypt
// key with the authmon / config-key service in the same logical operation.
//
// Returns 0 on a state-changing success, positive EEXIST for an idempotent
// replay (output still written to f/ss), or a negative errno on failure.
// Requires paxos to be plugged so the multi-service updates commit together.
int OSDMonitor::prepare_command_osd_new(
    MonOpRequestRef op,
    const cmdmap_t& cmdmap,
    const map<string,string>& params,
    stringstream &ss,
    Formatter *f)
{
  uuid_d uuid;
  string uuidstr;
  int64_t id = -1;

  ceph_assert(paxos->is_plugged());

  dout(10) << __func__ << " " << op << dendl;

  /* validate command. abort now if something's wrong. */

  /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
   *
   * If `id` is not specified, we will identify any existing osd based
   * on `uuid`. Operation will be idempotent iff secrets match.
   *
   * If `id` is specified, we will identify any existing osd based on
   * `uuid` and match against `id`. If they match, operation will be
   * idempotent iff secrets match.
   *
   * `-i secrets.json` will be optional. If supplied, will be used
   * to check for idempotency when `id` and `uuid` match.
   *
   * If `id` is not specified, and `uuid` does not exist, an id will
   * be found or allocated for the osd.
   *
   * If `id` is specified, and the osd has been previously marked
   * as destroyed, then the `id` will be reused.
   */
  if (!cmd_getval(cmdmap, "uuid", uuidstr)) {
    ss << "requires the OSD's UUID to be specified.";
    return -EINVAL;
  } else if (!uuid.parse(uuidstr.c_str())) {
    ss << "invalid UUID value '" << uuidstr << "'.";
    return -EINVAL;
  }

  if (cmd_getval(cmdmap, "id", id) &&
      (id < 0)) {
    ss << "invalid OSD id; must be greater or equal than zero.";
    return -EINVAL;
  }

  // are we running an `osd create`-like command, or recreating
  // a previously destroyed osd?

  bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));

  // we will care about `id` to assess whether osd is `destroyed`, or
  // to create a new osd.
  // we will need an `id` by the time we reach auth.

  int32_t existing_id = -1;
  int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
                                &existing_id, ss);

  // positive EEXIST from validate_osd_create() means the osd already exists
  // with a matching uuid/id pair -- potentially an idempotent replay.
  bool may_be_idempotent = false;
  if (err == EEXIST) {
    // this is idempotent from the osdmon's point-of-view
    may_be_idempotent = true;
    ceph_assert(existing_id >= 0);
    id = existing_id;
  } else if (err < 0) {
    return err;
  }

  if (!may_be_idempotent) {
    // idempotency is out of the window. We are either creating a new
    // osd or recreating a destroyed osd.
    //
    // We now need to figure out if we have an `id` (and if it's valid),
    // of find an `id` if we don't have one.

    // NOTE: we need to consider the case where the `id` is specified for
    // `osd create`, and we must honor it. So this means checking if
    // the `id` is destroyed, and if so assume the destroy; otherwise,
    // check if it `exists` - in which case we complain about not being
    // `destroyed`. In the end, if nothing fails, we must allow the
    // creation, so that we are compatible with `create`.
    if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
      ss << "OSD " << id << " has not yet been destroyed";
      return -EINVAL;
    } else if (id < 0) {
      // find an `id`
      id = _allocate_osd_id(&existing_id);
      if (id < 0) {
        ceph_assert(existing_id >= 0);
        id = existing_id;
      }
      dout(10) << __func__ << " found id " << id << " to use" << dendl;
    } else if (id >= 0 && osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " recreating osd." << id << dendl;
    } else {
      dout(10) << __func__ << " creating new osd." << id << dendl;
    }
  } else {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.exists(id));
  }

  // we are now able to either create a brand new osd or reuse an existing
  // osd that has been previously destroyed.

  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;

  if (may_be_idempotent && params.empty()) {
    // nothing to do, really.
    dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }

  string device_class;
  auto p = params.find("crush_device_class");
  if (p != params.end()) {
    device_class = p->second;
    dout(20) << __func__ << " device_class will be " << device_class << dendl;
  }
  string cephx_secret, lockbox_secret, dmcrypt_key;
  bool has_lockbox = false;
  bool has_secrets = params.count("cephx_secret")
    || params.count("cephx_lockbox_secret")
    || params.count("dmcrypt_key");

  ConfigKeyService *svc = nullptr;
  AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;

  if (has_secrets) {
    // a cephx secret is mandatory; lockbox secret and dm-crypt key are an
    // all-or-nothing pair.
    if (params.count("cephx_secret") == 0) {
      ss << "requires a cephx secret.";
      return -EINVAL;
    }
    cephx_secret = params.at("cephx_secret");

    bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
    bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);

    dout(10) << __func__ << " has lockbox " << has_lockbox_secret
             << " dmcrypt " << has_dmcrypt_key << dendl;

    if (has_lockbox_secret && has_dmcrypt_key) {
      has_lockbox = true;
      lockbox_secret = params.at("cephx_lockbox_secret");
      dmcrypt_key = params.at("dmcrypt_key");
    } else if (!has_lockbox_secret != !has_dmcrypt_key) {
      ss << "requires both a cephx lockbox secret and a dm-crypt key.";
      return -EINVAL;
    }

    dout(10) << __func__ << " validate secrets using osd id " << id << dendl;

    err = mon->authmon()->validate_osd_new(id, uuid,
                                           cephx_secret,
                                           lockbox_secret,
                                           cephx_entity,
                                           lockbox_entity,
                                           ss);
    if (err < 0) {
      return err;
    } else if (may_be_idempotent && err != EEXIST) {
      // for this to be idempotent, `id` should already be >= 0; no need
      // to use validate_id.
      ceph_assert(id >= 0);
      ss << "osd." << id << " exists but secrets do not match";
      return -EEXIST;
    }

    if (has_lockbox) {
      svc = (ConfigKeyService*)mon->config_key_service;
      err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
      if (err < 0) {
        return err;
      } else if (may_be_idempotent && err != EEXIST) {
        ceph_assert(id >= 0);
        ss << "osd." << id << " exists but dm-crypt key does not match.";
        return -EEXIST;
      }
    }
  }
  ceph_assert(!has_secrets || !cephx_secret.empty());
  ceph_assert(!has_lockbox || !lockbox_secret.empty());

  if (may_be_idempotent) {
    // we have nothing to do for either the osdmon or the authmon,
    // and we have no lockbox - so the config key service will not be
    // touched. This is therefore an idempotent operation, and we can
    // just return right away.
    dout(10) << __func__ << " idempotent -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }
  ceph_assert(!may_be_idempotent);

  // perform updates.
  if (has_secrets) {
    ceph_assert(!cephx_secret.empty());
    ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
           (!lockbox_secret.empty() && !dmcrypt_key.empty()));

    err = mon->authmon()->do_osd_new(cephx_entity,
                                     lockbox_entity,
                                     has_lockbox);
    ceph_assert(0 == err);

    if (has_lockbox) {
      ceph_assert(nullptr != svc);
      svc->do_osd_new(uuid, dmcrypt_key);
    }
  }

  if (is_recreate_destroyed) {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.is_destroyed(id));
    pending_inc.new_weight[id] = CEPH_OSD_OUT;
    // NOTE(review): new_state bits appear to be applied as an XOR toggle,
    // so ORing in a bit that is currently set clears it on apply -- that is
    // how the CEPH_OSD_UP case below "clears" UP.  Confirm against
    // OSDMap::Incremental application logic before relying on this.
    pending_inc.new_state[id] |= CEPH_OSD_DESTROYED;
    if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
      pending_inc.new_state[id] |= CEPH_OSD_NEW;
    }
    if (osdmap.get_state(id) & CEPH_OSD_UP) {
      // due to http://tracker.ceph.com/issues/20751 some clusters may
      // have UP set for non-existent OSDs; make sure it is cleared
      // for a newly created osd.
      pending_inc.new_state[id] |= CEPH_OSD_UP;
    }
    pending_inc.new_uuid[id] = uuid;
  } else {
    ceph_assert(id >= 0);
    int32_t new_id = -1;
    do_osd_create(id, uuid, device_class, &new_id);
    ceph_assert(new_id >= 0);
    ceph_assert(id == new_id);
  }

  if (f) {
    f->open_object_section("created_osd");
    f->dump_int("osdid", id);
    f->close_section();
  } else {
    ss << id;
  }

  return 0;
}
9302
7c673cae
FG
// Paxos prepare-phase dispatcher for monitor commands routed to the OSD
// monitor.  Parses the JSON command vector and verifies the op has a
// session before handing off to prepare_command_impl().  Always returns
// true (the command is answered here on parse/session failure).
bool OSDMonitor::prepare_command(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MMonCommand>();
  stringstream ss;
  cmdmap_t cmdmap;
  if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
    string rs = ss.str();
    mon->reply_command(op, -EINVAL, rs, get_last_committed());
    return true;
  }

  MonSession *session = op->get_session();
  if (!session) {
    derr << __func__ << " no session" << dendl;
    mon->reply_command(op, -EACCES, "access denied", get_last_committed());
    return true;
  }

  return prepare_command_impl(op, cmdmap);
}
9324
9325static int parse_reweights(CephContext *cct,
11fdf7f2 9326 const cmdmap_t& cmdmap,
7c673cae
FG
9327 const OSDMap& osdmap,
9328 map<int32_t, uint32_t>* weights)
9329{
9330 string weights_str;
9f95a23c 9331 if (!cmd_getval(cmdmap, "weights", weights_str)) {
7c673cae
FG
9332 return -EINVAL;
9333 }
9334 std::replace(begin(weights_str), end(weights_str), '\'', '"');
9335 json_spirit::mValue json_value;
9336 if (!json_spirit::read(weights_str, json_value)) {
9337 return -EINVAL;
9338 }
9339 if (json_value.type() != json_spirit::obj_type) {
9340 return -EINVAL;
9341 }
9342 const auto obj = json_value.get_obj();
9343 try {
9344 for (auto& osd_weight : obj) {
9345 auto osd_id = std::stoi(osd_weight.first);
9346 if (!osdmap.exists(osd_id)) {
9347 return -ENOENT;
9348 }
9349 if (osd_weight.second.type() != json_spirit::str_type) {
9350 return -EINVAL;
9351 }
9352 auto weight = std::stoul(osd_weight.second.get_str());
9353 weights->insert({osd_id, weight});
9354 }
9355 } catch (const std::logic_error& e) {
9356 return -EINVAL;
9357 }
9358 return 0;
9359}
9360
31f18b77
FG
9361int OSDMonitor::prepare_command_osd_destroy(
9362 int32_t id,
9363 stringstream& ss)
9364{
11fdf7f2 9365 ceph_assert(paxos->is_plugged());
31f18b77
FG
9366
9367 // we check if the osd exists for the benefit of `osd purge`, which may
9368 // have previously removed the osd. If the osd does not exist, return
9369 // -ENOENT to convey this, and let the caller deal with it.
9370 //
9371 // we presume that all auth secrets and config keys were removed prior
9372 // to this command being called. if they exist by now, we also assume
9373 // they must have been created by some other command and do not pertain
9374 // to this non-existent osd.
9375 if (!osdmap.exists(id)) {
9376 dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
9377 return -ENOENT;
9378 }
9379
9380 uuid_d uuid = osdmap.get_uuid(id);
9381 dout(10) << __func__ << " destroying osd." << id
9382 << " uuid " << uuid << dendl;
9383
9384 // if it has been destroyed, we assume our work here is done.
9385 if (osdmap.is_destroyed(id)) {
9386 ss << "destroyed osd." << id;
9387 return 0;
9388 }
9389
9390 EntityName cephx_entity, lockbox_entity;
9391 bool idempotent_auth = false, idempotent_cks = false;
9392
9393 int err = mon->authmon()->validate_osd_destroy(id, uuid,
9394 cephx_entity,
9395 lockbox_entity,
9396 ss);
9397 if (err < 0) {
9398 if (err == -ENOENT) {
9399 idempotent_auth = true;
31f18b77
FG
9400 } else {
9401 return err;
9402 }
9403 }
9404
9405 ConfigKeyService *svc = (ConfigKeyService*)mon->config_key_service;
9406 err = svc->validate_osd_destroy(id, uuid);
9407 if (err < 0) {
11fdf7f2 9408 ceph_assert(err == -ENOENT);
31f18b77
FG
9409 err = 0;
9410 idempotent_cks = true;
9411 }
9412
9413 if (!idempotent_auth) {
9414 err = mon->authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
11fdf7f2 9415 ceph_assert(0 == err);
31f18b77
FG
9416 }
9417
9418 if (!idempotent_cks) {
9419 svc->do_osd_destroy(id, uuid);
9420 }
9421
9422 pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
9423 pending_inc.new_uuid[id] = uuid_d();
9424
9425 // we can only propose_pending() once per service, otherwise we'll be
9426 // defying PaxosService and all laws of nature. Therefore, as we may
9427 // be used during 'osd purge', let's keep the caller responsible for
9428 // proposing.
11fdf7f2 9429 ceph_assert(err == 0);
31f18b77
FG
9430 return 0;
9431}
9432
// Fully purge osd.<id>: remove it from crush, destroy it (auth/config-key
// scrub + DESTROYED flag), and remove it from the osdmap.  The osd must not
// be up, and paxos must be plugged -- the caller proposes the result.
// Returns 0 on success, -ENOENT if the purge was entirely idempotent (osd
// already gone), or an error from the crush/destroy validation steps.
int OSDMonitor::prepare_command_osd_purge(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos->is_plugged());
  dout(10) << __func__ << " purging osd." << id << dendl;

  ceph_assert(!osdmap.is_up(id));

  /*
   * This may look a bit weird, but this is what's going to happen:
   *
   *  1. we make sure that removing from crush works
   *  2. we call `prepare_command_osd_destroy()`. If it returns an
   *     error, then we abort the whole operation, as no updates
   *     have been made. However, this function will have
   *     side-effects, thus we need to make sure that all operations
   *     performed henceforth will *always* succeed.
   *  3. we call `prepare_command_osd_remove()`. Although this
   *     function can return an error, it currently only checks if the
   *     osd is up - and we have made sure that it is not so, so there
   *     is no conflict, and it is effectively an update.
   *  4. finally, we call `do_osd_crush_remove()`, which will perform
   *     the crush update we delayed from before.
   */

  CrushWrapper newcrush;
  _get_pending_crush(newcrush);

  bool may_be_idempotent = false;

  // step 1: perform the crush removal on a local copy of the pending crush
  // map; it is only committed later via do_osd_crush_remove(newcrush).
  // -ENOENT means the osd is already absent from crush, which keeps the
  // command idempotent.
  int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
  if (err == -ENOENT) {
    err = 0;
    may_be_idempotent = true;
  } else if (err < 0) {
    ss << "error removing osd." << id << " from crush";
    return err;
  }

  // no point destroying the osd again if it has already been marked destroyed
  if (!osdmap.is_destroyed(id)) {
    // step 2: destroy -- any error other than -ENOENT aborts the purge
    // before any state has been mutated.
    err = prepare_command_osd_destroy(id, ss);
    if (err < 0) {
      if (err == -ENOENT) {
	err = 0;
      } else {
	return err;
      }
    } else {
      // destroy actually did work, so the overall purge is not a no-op.
      may_be_idempotent = false;
    }
  }
  ceph_assert(0 == err);

  // if nothing above changed anything and the osd is already gone from the
  // osdmap, report idempotency to the caller with -ENOENT.
  if (may_be_idempotent && !osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist and "
             << "we are idempotent." << dendl;
    return -ENOENT;
  }

  // step 3: remove from the osdmap.
  err = prepare_command_osd_remove(id);
  // we should not be busy, as we should have made sure this id is not up.
  ceph_assert(0 == err);

  // step 4: commit the crush removal prepared in step 1.
  do_osd_crush_remove(newcrush);
  return 0;
}
9501
7c673cae 9502bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
11fdf7f2 9503 const cmdmap_t& cmdmap)
7c673cae
FG
9504{
9505 op->mark_osdmon_event(__func__);
9f95a23c 9506 auto m = op->get_req<MMonCommand>();
7c673cae
FG
9507 bool ret = false;
9508 stringstream ss;
9509 string rs;
9510 bufferlist rdata;
9511 int err = 0;
9512
9513 string format;
9f95a23c 9514 cmd_getval(cmdmap, "format", format, string("plain"));
7c673cae
FG
9515 boost::scoped_ptr<Formatter> f(Formatter::create(format));
9516
9517 string prefix;
9f95a23c 9518 cmd_getval(cmdmap, "prefix", prefix);
7c673cae
FG
9519
9520 int64_t osdid;
11fdf7f2 9521 string osd_name;
b32b8144
FG
9522 bool osdid_present = false;
9523 if (prefix != "osd pg-temp" &&
9524 prefix != "osd pg-upmap" &&
9525 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
9f95a23c 9526 osdid_present = cmd_getval(cmdmap, "id", osdid);
b32b8144 9527 }
7c673cae
FG
9528 if (osdid_present) {
9529 ostringstream oss;
9530 oss << "osd." << osdid;
11fdf7f2 9531 osd_name = oss.str();
7c673cae
FG
9532 }
9533
9534 // Even if there's a pending state with changes that could affect
9535 // a command, considering that said state isn't yet committed, we
9536 // just don't care about those changes if the command currently being
9537 // handled acts as a no-op against the current committed state.
9538 // In a nutshell, we assume this command happens *before*.
9539 //
9540 // Let me make this clearer:
9541 //
9542 // - If we have only one client, and that client issues some
9543 // operation that would conflict with this operation but is
9544 // still on the pending state, then we would be sure that said
9545 // operation wouldn't have returned yet, so the client wouldn't
9546 // issue this operation (unless the client didn't wait for the
9547 // operation to finish, and that would be the client's own fault).
9548 //
9549 // - If we have more than one client, each client will observe
9550 // whatever is the state at the moment of the commit. So, if we
9551 // have two clients, one issuing an unlink and another issuing a
9552 // link, and if the link happens while the unlink is still on the
9553 // pending state, from the link's point-of-view this is a no-op.
9554 // If different clients are issuing conflicting operations and
9555 // they care about that, then the clients should make sure they
9556 // enforce some kind of concurrency mechanism -- from our
9557 // perspective that's what Douglas Adams would call an SEP.
9558 //
9559 // This should be used as a general guideline for most commands handled
9560 // in this function. Adapt as you see fit, but please bear in mind that
9561 // this is the expected behavior.
9562
9563
9564 if (prefix == "osd setcrushmap" ||
9565 (prefix == "osd crush set" && !osdid_present)) {
31f18b77
FG
9566 if (pending_inc.crush.length()) {
9567 dout(10) << __func__ << " waiting for pending crush update " << dendl;
9568 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9569 return true;
9570 }
7c673cae
FG
9571 dout(10) << "prepare_command setting new crush map" << dendl;
9572 bufferlist data(m->get_data());
9573 CrushWrapper crush;
9574 try {
11fdf7f2 9575 auto bl = data.cbegin();
7c673cae
FG
9576 crush.decode(bl);
9577 }
9578 catch (const std::exception &e) {
9579 err = -EINVAL;
9580 ss << "Failed to parse crushmap: " << e.what();
9581 goto reply;
9582 }
31f18b77
FG
9583
9584 int64_t prior_version = 0;
9f95a23c 9585 if (cmd_getval(cmdmap, "prior_version", prior_version)) {
31f18b77
FG
9586 if (prior_version == osdmap.get_crush_version() - 1) {
9587 // see if we are a resend of the last update. this is imperfect
9588 // (multiple racing updaters may not both get reliable success)
9589 // but we expect crush updaters (via this interface) to be rare-ish.
9590 bufferlist current, proposed;
9591 osdmap.crush->encode(current, mon->get_quorum_con_features());
9592 crush.encode(proposed, mon->get_quorum_con_features());
9593 if (current.contents_equal(proposed)) {
9594 dout(10) << __func__
9595 << " proposed matches current and version equals previous"
9596 << dendl;
9597 err = 0;
9598 ss << osdmap.get_crush_version();
9599 goto reply;
9600 }
9601 }
9602 if (prior_version != osdmap.get_crush_version()) {
9603 err = -EPERM;
9604 ss << "prior_version " << prior_version << " != crush version "
9605 << osdmap.get_crush_version();
9606 goto reply;
9607 }
9608 }
7c673cae 9609
3efd9988 9610 if (crush.has_legacy_rule_ids()) {
31f18b77
FG
9611 err = -EINVAL;
9612 ss << "crush maps with ruleset != ruleid are no longer allowed";
9613 goto reply;
9614 }
7c673cae
FG
9615 if (!validate_crush_against_features(&crush, ss)) {
9616 err = -EINVAL;
9617 goto reply;
9618 }
31f18b77 9619
3efd9988
FG
9620 err = osdmap.validate_crush_rules(&crush, &ss);
9621 if (err < 0) {
9622 goto reply;
7c673cae
FG
9623 }
9624
11fdf7f2 9625 if (g_conf()->mon_osd_crush_smoke_test) {
224ce89b
WB
9626 // sanity check: test some inputs to make sure this map isn't
9627 // totally broken
9628 dout(10) << " testing map" << dendl;
9629 stringstream ess;
9630 CrushTester tester(crush, ess);
b5b8bbf5 9631 tester.set_min_x(0);
224ce89b 9632 tester.set_max_x(50);
b5b8bbf5 9633 auto start = ceph::coarse_mono_clock::now();
11fdf7f2 9634 int r = tester.test_with_fork(g_conf()->mon_lease);
b5b8bbf5 9635 auto duration = ceph::coarse_mono_clock::now() - start;
224ce89b
WB
9636 if (r < 0) {
9637 dout(10) << " tester.test_with_fork returns " << r
9638 << ": " << ess.str() << dendl;
9639 ss << "crush smoke test failed with " << r << ": " << ess.str();
9640 err = r;
9641 goto reply;
9642 }
b5b8bbf5
FG
9643 dout(10) << __func__ << " crush somke test duration: "
9644 << duration << ", result: " << ess.str() << dendl;
7c673cae
FG
9645 }
9646
7c673cae 9647 pending_inc.crush = data;
31f18b77 9648 ss << osdmap.get_crush_version() + 1;
7c673cae
FG
9649 goto update;
9650
3efd9988
FG
9651 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
9652 CrushWrapper newcrush;
9653 _get_pending_crush(newcrush);
9654 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
9655 int bid = -1 - b;
9656 if (newcrush.bucket_exists(bid) &&
11fdf7f2 9657 newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
3efd9988
FG
9658 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
9659 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
9660 }
9661 }
9662 if (!validate_crush_against_features(&newcrush, ss)) {
9663 err = -EINVAL;
9664 goto reply;
9665 }
9666 pending_inc.crush.clear();
9667 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9668 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9669 get_last_committed() + 1));
9670 return true;
7c673cae 9671 } else if (prefix == "osd crush set-device-class") {
7c673cae 9672 string device_class;
9f95a23c 9673 if (!cmd_getval(cmdmap, "class", device_class)) {
7c673cae
FG
9674 err = -EINVAL; // no value!
9675 goto reply;
9676 }
9677
224ce89b
WB
9678 bool stop = false;
9679 vector<string> idvec;
9f95a23c 9680 cmd_getval(cmdmap, "ids", idvec);
7c673cae
FG
9681 CrushWrapper newcrush;
9682 _get_pending_crush(newcrush);
224ce89b
WB
9683 set<int> updated;
9684 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9685 set<int> osds;
9686 // wildcard?
9687 if (j == 0 &&
9688 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9689 osdmap.get_all_osds(osds);
9690 stop = true;
9691 } else {
9692 // try traditional single osd way
9693 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9694 if (osd < 0) {
9695 // ss has reason for failure
9696 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9697 err = -EINVAL;
9698 continue;
9699 }
9700 osds.insert(osd);
9701 }
7c673cae 9702
224ce89b
WB
9703 for (auto &osd : osds) {
9704 if (!osdmap.exists(osd)) {
9705 ss << "osd." << osd << " does not exist. ";
9706 continue;
9707 }
7c673cae 9708
224ce89b
WB
9709 ostringstream oss;
9710 oss << "osd." << osd;
9711 string name = oss.str();
7c673cae 9712
3a9019d9
FG
9713 if (newcrush.get_max_devices() < osd + 1) {
9714 newcrush.set_max_devices(osd + 1);
9715 }
224ce89b
WB
9716 string action;
9717 if (newcrush.item_exists(osd)) {
9718 action = "updating";
9719 } else {
9720 action = "creating";
9721 newcrush.set_item_name(osd, name);
9722 }
7c673cae 9723
224ce89b
WB
9724 dout(5) << action << " crush item id " << osd << " name '" << name
9725 << "' device_class '" << device_class << "'"
9726 << dendl;
9727 err = newcrush.update_device_class(osd, device_class, name, &ss);
9728 if (err < 0) {
9729 goto reply;
9730 }
9731 if (err == 0 && !_have_pending_crush()) {
9732 if (!stop) {
9733 // for single osd only, wildcard makes too much noise
9734 ss << "set-device-class item id " << osd << " name '" << name
11fdf7f2 9735 << "' device_class '" << device_class << "': no change. ";
224ce89b
WB
9736 }
9737 } else {
9738 updated.insert(osd);
9739 }
9740 }
7c673cae
FG
9741 }
9742
224ce89b
WB
9743 if (!updated.empty()) {
9744 pending_inc.crush.clear();
9745 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9746 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
9747 getline(ss, rs);
9748 wait_for_finished_proposal(op,
9749 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
9750 return true;
9751 }
7c673cae 9752
c07f9fc5
FG
9753 } else if (prefix == "osd crush rm-device-class") {
9754 bool stop = false;
9755 vector<string> idvec;
9f95a23c 9756 cmd_getval(cmdmap, "ids", idvec);
c07f9fc5
FG
9757 CrushWrapper newcrush;
9758 _get_pending_crush(newcrush);
9759 set<int> updated;
9760
9761 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9762 set<int> osds;
9763
9764 // wildcard?
9765 if (j == 0 &&
9766 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9767 osdmap.get_all_osds(osds);
9768 stop = true;
9769 } else {
9770 // try traditional single osd way
9771 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9772 if (osd < 0) {
9773 // ss has reason for failure
9774 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9775 err = -EINVAL;
9776 goto reply;
9777 }
9778 osds.insert(osd);
9779 }
9780
9781 for (auto &osd : osds) {
9782 if (!osdmap.exists(osd)) {
9783 ss << "osd." << osd << " does not exist. ";
9784 continue;
9785 }
9786
9787 auto class_name = newcrush.get_item_class(osd);
c07f9fc5
FG
9788 if (!class_name) {
9789 ss << "osd." << osd << " belongs to no class, ";
9790 continue;
9791 }
9792 // note that we do not verify if class_is_in_use here
9793 // in case the device is misclassified and user wants
9794 // to overridely reset...
9795
11fdf7f2 9796 err = newcrush.remove_device_class(cct, osd, &ss);
c07f9fc5
FG
9797 if (err < 0) {
9798 // ss has reason for failure
9799 goto reply;
9800 }
9801 updated.insert(osd);
9802 }
9803 }
9804
9805 if (!updated.empty()) {
9806 pending_inc.crush.clear();
9807 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9808 ss << "done removing class of osd(s): " << updated;
9809 getline(ss, rs);
9810 wait_for_finished_proposal(op,
9811 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
9812 return true;
9813 }
11fdf7f2
TL
9814 } else if (prefix == "osd crush class create") {
9815 string device_class;
9f95a23c 9816 if (!cmd_getval(cmdmap, "class", device_class)) {
11fdf7f2
TL
9817 err = -EINVAL; // no value!
9818 goto reply;
9819 }
9f95a23c 9820 if (osdmap.require_osd_release < ceph_release_t::luminous) {
11fdf7f2
TL
9821 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
9822 << "luminous' before using crush device classes";
9823 err = -EPERM;
9824 goto reply;
9825 }
9826 if (!_have_pending_crush() &&
9827 _get_stable_crush().class_exists(device_class)) {
9828 ss << "class '" << device_class << "' already exists";
9829 goto reply;
9830 }
9831 CrushWrapper newcrush;
9832 _get_pending_crush(newcrush);
9833 if (newcrush.class_exists(device_class)) {
9834 ss << "class '" << device_class << "' already exists";
9835 goto update;
9836 }
9837 int class_id = newcrush.get_or_create_class_id(device_class);
9838 pending_inc.crush.clear();
9839 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9840 ss << "created class " << device_class << " with id " << class_id
9841 << " to crush map";
9842 goto update;
9843 } else if (prefix == "osd crush class rm") {
9844 string device_class;
9f95a23c 9845 if (!cmd_getval(cmdmap, "class", device_class)) {
11fdf7f2
TL
9846 err = -EINVAL; // no value!
9847 goto reply;
9848 }
9f95a23c 9849 if (osdmap.require_osd_release < ceph_release_t::luminous) {
11fdf7f2
TL
9850 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
9851 << "luminous' before using crush device classes";
9852 err = -EPERM;
9853 goto reply;
9854 }
9855
9856 if (!osdmap.crush->class_exists(device_class)) {
9857 err = 0;
9858 goto reply;
9859 }
9860
9861 CrushWrapper newcrush;
9862 _get_pending_crush(newcrush);
9863 if (!newcrush.class_exists(device_class)) {
9864 err = 0; // make command idempotent
9865 goto wait;
9866 }
9867 int class_id = newcrush.get_class_id(device_class);
9868 stringstream ts;
9869 if (newcrush.class_is_in_use(class_id, &ts)) {
9870 err = -EBUSY;
9871 ss << "class '" << device_class << "' " << ts.str();
9872 goto reply;
9873 }
9874
9875 // check if class is used by any erasure-code-profiles
9876 mempool::osdmap::map<string,map<string,string>> old_ec_profiles =
9877 osdmap.get_erasure_code_profiles();
9878 auto ec_profiles = pending_inc.get_erasure_code_profiles();
9879#ifdef HAVE_STDLIB_MAP_SPLICING
9880 ec_profiles.merge(old_ec_profiles);
9881#else
9882 ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)),
9883 make_move_iterator(end(old_ec_profiles)));
9884#endif
9885 list<string> referenced_by;
9886 for (auto &i: ec_profiles) {
9887 for (auto &j: i.second) {
9888 if ("crush-device-class" == j.first && device_class == j.second) {
9889 referenced_by.push_back(i.first);
9890 }
9891 }
9892 }
9893 if (!referenced_by.empty()) {
9894 err = -EBUSY;
9895 ss << "class '" << device_class
9896 << "' is still referenced by erasure-code-profile(s): " << referenced_by;
9897 goto reply;
9898 }
9899
9900 set<int> osds;
9901 newcrush.get_devices_by_class(device_class, &osds);
9902 for (auto& p: osds) {
9903 err = newcrush.remove_device_class(g_ceph_context, p, &ss);
9904 if (err < 0) {
9905 // ss has reason for failure
9906 goto reply;
9907 }
9908 }
9909
9910 if (osds.empty()) {
9911 // empty class, remove directly
9912 err = newcrush.remove_class_name(device_class);
9913 if (err < 0) {
9914 ss << "class '" << device_class << "' cannot be removed '"
9915 << cpp_strerror(err) << "'";
9916 goto reply;
9917 }
9918 }
9919
9920 pending_inc.crush.clear();
9921 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9922 ss << "removed class " << device_class << " with id " << class_id
9923 << " from crush map";
9924 goto update;
35e4c445
FG
9925 } else if (prefix == "osd crush class rename") {
9926 string srcname, dstname;
9f95a23c 9927 if (!cmd_getval(cmdmap, "srcname", srcname)) {
35e4c445
FG
9928 err = -EINVAL;
9929 goto reply;
9930 }
9f95a23c 9931 if (!cmd_getval(cmdmap, "dstname", dstname)) {
35e4c445
FG
9932 err = -EINVAL;
9933 goto reply;
9934 }
9935
9936 CrushWrapper newcrush;
9937 _get_pending_crush(newcrush);
181888fb
FG
9938 if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
9939 // suppose this is a replay and return success
9940 // so command is idempotent
9941 ss << "already renamed to '" << dstname << "'";
9942 err = 0;
35e4c445
FG
9943 goto reply;
9944 }
c07f9fc5 9945
35e4c445
FG
9946 err = newcrush.rename_class(srcname, dstname);
9947 if (err < 0) {
9948 ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
9949 << cpp_strerror(err);
9950 goto reply;
9951 }
9952
9953 pending_inc.crush.clear();
9954 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9955 ss << "rename class '" << srcname << "' to '" << dstname << "'";
9956 goto update;
7c673cae
FG
9957 } else if (prefix == "osd crush add-bucket") {
9958 // os crush add-bucket <name> <type>
9959 string name, typestr;
11fdf7f2 9960 vector<string> argvec;
9f95a23c
TL
9961 cmd_getval(cmdmap, "name", name);
9962 cmd_getval(cmdmap, "type", typestr);
9963 cmd_getval(cmdmap, "args", argvec);
11fdf7f2
TL
9964 map<string,string> loc;
9965 if (!argvec.empty()) {
9966 CrushWrapper::parse_loc_map(argvec, &loc);
9967 dout(0) << "will create and move bucket '" << name
9968 << "' to location " << loc << dendl;
9969 }
7c673cae
FG
9970
9971 if (!_have_pending_crush() &&
9972 _get_stable_crush().name_exists(name)) {
9973 ss << "bucket '" << name << "' already exists";
9974 goto reply;
9975 }
9976
9977 CrushWrapper newcrush;
9978 _get_pending_crush(newcrush);
9979
9980 if (newcrush.name_exists(name)) {
9981 ss << "bucket '" << name << "' already exists";
9982 goto update;
9983 }
9984 int type = newcrush.get_type_id(typestr);
9985 if (type < 0) {
9986 ss << "type '" << typestr << "' does not exist";
9987 err = -EINVAL;
9988 goto reply;
9989 }
9990 if (type == 0) {
9991 ss << "type '" << typestr << "' is for devices, not buckets";
9992 err = -EINVAL;
9993 goto reply;
9994 }
9995 int bucketno;
9996 err = newcrush.add_bucket(0, 0,
9997 CRUSH_HASH_DEFAULT, type, 0, NULL,
9998 NULL, &bucketno);
9999 if (err < 0) {
10000 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
10001 goto reply;
10002 }
10003 err = newcrush.set_item_name(bucketno, name);
10004 if (err < 0) {
10005 ss << "error setting bucket name to '" << name << "'";
10006 goto reply;
10007 }
10008
11fdf7f2
TL
10009 if (!loc.empty()) {
10010 if (!newcrush.check_item_loc(cct, bucketno, loc,
10011 (int *)NULL)) {
10012 err = newcrush.move_bucket(cct, bucketno, loc);
10013 if (err < 0) {
10014 ss << "error moving bucket '" << name << "' to location " << loc;
10015 goto reply;
10016 }
10017 } else {
10018 ss << "no need to move item id " << bucketno << " name '" << name
10019 << "' to location " << loc << " in crush map";
10020 }
10021 }
10022
7c673cae
FG
10023 pending_inc.crush.clear();
10024 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
11fdf7f2
TL
10025 if (loc.empty()) {
10026 ss << "added bucket " << name << " type " << typestr
10027 << " to crush map";
10028 } else {
10029 ss << "added bucket " << name << " type " << typestr
10030 << " to location " << loc;
10031 }
7c673cae
FG
10032 goto update;
10033 } else if (prefix == "osd crush rename-bucket") {
10034 string srcname, dstname;
9f95a23c
TL
10035 cmd_getval(cmdmap, "srcname", srcname);
10036 cmd_getval(cmdmap, "dstname", dstname);
7c673cae
FG
10037
10038 err = crush_rename_bucket(srcname, dstname, &ss);
10039 if (err == -EALREADY) // equivalent to success for idempotency
10040 err = 0;
10041 if (err)
10042 goto reply;
10043 else
10044 goto update;
c07f9fc5
FG
10045 } else if (prefix == "osd crush weight-set create" ||
10046 prefix == "osd crush weight-set create-compat") {
10047 CrushWrapper newcrush;
10048 _get_pending_crush(newcrush);
10049 int64_t pool;
10050 int positions;
10051 if (newcrush.has_non_straw2_buckets()) {
10052 ss << "crush map contains one or more bucket(s) that are not straw2";
224ce89b
WB
10053 err = -EPERM;
10054 goto reply;
10055 }
c07f9fc5 10056 if (prefix == "osd crush weight-set create") {
9f95a23c
TL
10057 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
10058 osdmap.require_min_compat_client < ceph_release_t::luminous) {
c07f9fc5 10059 ss << "require_min_compat_client "
9f95a23c 10060 << osdmap.require_min_compat_client
c07f9fc5
FG
10061 << " < luminous, which is required for per-pool weight-sets. "
10062 << "Try 'ceph osd set-require-min-compat-client luminous' "
10063 << "before using the new interface";
10064 err = -EPERM;
10065 goto reply;
10066 }
10067 string poolname, mode;
9f95a23c 10068 cmd_getval(cmdmap, "pool", poolname);
c07f9fc5
FG
10069 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10070 if (pool < 0) {
10071 ss << "pool '" << poolname << "' not found";
10072 err = -ENOENT;
10073 goto reply;
10074 }
9f95a23c 10075 cmd_getval(cmdmap, "mode", mode);
c07f9fc5
FG
10076 if (mode != "flat" && mode != "positional") {
10077 ss << "unrecognized weight-set mode '" << mode << "'";
10078 err = -EINVAL;
10079 goto reply;
10080 }
10081 positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
10082 } else {
10083 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10084 positions = 1;
224ce89b 10085 }
11fdf7f2
TL
10086 if (!newcrush.create_choose_args(pool, positions)) {
10087 if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
10088 ss << "compat weight-set already created";
10089 } else {
10090 ss << "weight-set for pool '" << osdmap.get_pool_name(pool)
10091 << "' already created";
10092 }
10093 goto reply;
10094 }
c07f9fc5
FG
10095 pending_inc.crush.clear();
10096 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10097 goto update;
224ce89b 10098
c07f9fc5
FG
10099 } else if (prefix == "osd crush weight-set rm" ||
10100 prefix == "osd crush weight-set rm-compat") {
224ce89b
WB
10101 CrushWrapper newcrush;
10102 _get_pending_crush(newcrush);
c07f9fc5
FG
10103 int64_t pool;
10104 if (prefix == "osd crush weight-set rm") {
10105 string poolname;
9f95a23c 10106 cmd_getval(cmdmap, "pool", poolname);
c07f9fc5
FG
10107 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10108 if (pool < 0) {
10109 ss << "pool '" << poolname << "' not found";
10110 err = -ENOENT;
10111 goto reply;
10112 }
10113 } else {
10114 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
224ce89b 10115 }
c07f9fc5
FG
10116 newcrush.rm_choose_args(pool);
10117 pending_inc.crush.clear();
10118 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10119 goto update;
224ce89b 10120
c07f9fc5
FG
10121 } else if (prefix == "osd crush weight-set reweight" ||
10122 prefix == "osd crush weight-set reweight-compat") {
10123 string poolname, item;
10124 vector<double> weight;
9f95a23c
TL
10125 cmd_getval(cmdmap, "pool", poolname);
10126 cmd_getval(cmdmap, "item", item);
10127 cmd_getval(cmdmap, "weight", weight);
c07f9fc5
FG
10128 CrushWrapper newcrush;
10129 _get_pending_crush(newcrush);
10130 int64_t pool;
10131 if (prefix == "osd crush weight-set reweight") {
10132 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10133 if (pool < 0) {
10134 ss << "pool '" << poolname << "' not found";
10135 err = -ENOENT;
10136 goto reply;
10137 }
10138 if (!newcrush.have_choose_args(pool)) {
10139 ss << "no weight-set for pool '" << poolname << "'";
10140 err = -ENOENT;
10141 goto reply;
10142 }
10143 auto arg_map = newcrush.choose_args_get(pool);
10144 int positions = newcrush.get_choose_args_positions(arg_map);
10145 if (weight.size() != (size_t)positions) {
10146 ss << "must specify exact " << positions << " weight values";
10147 err = -EINVAL;
10148 goto reply;
10149 }
10150 } else {
10151 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10152 if (!newcrush.have_choose_args(pool)) {
10153 ss << "no backward-compatible weight-set";
10154 err = -ENOENT;
10155 goto reply;
10156 }
224ce89b 10157 }
c07f9fc5
FG
10158 if (!newcrush.name_exists(item)) {
10159 ss << "item '" << item << "' does not exist";
10160 err = -ENOENT;
224ce89b
WB
10161 goto reply;
10162 }
c07f9fc5 10163 err = newcrush.choose_args_adjust_item_weightf(
11fdf7f2 10164 cct,
c07f9fc5
FG
10165 newcrush.choose_args_get(pool),
10166 newcrush.get_item_id(item),
10167 weight,
10168 &ss);
224ce89b 10169 if (err < 0) {
224ce89b
WB
10170 goto reply;
10171 }
c07f9fc5 10172 err = 0;
224ce89b
WB
10173 pending_inc.crush.clear();
10174 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
224ce89b 10175 goto update;
7c673cae
FG
10176 } else if (osdid_present &&
10177 (prefix == "osd crush set" || prefix == "osd crush add")) {
10178 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
10179 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
10180 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
10181
10182 if (!osdmap.exists(osdid)) {
10183 err = -ENOENT;
11fdf7f2
TL
10184 ss << osd_name
10185 << " does not exist. Create it before updating the crush map";
7c673cae
FG
10186 goto reply;
10187 }
10188
10189 double weight;
9f95a23c 10190 if (!cmd_getval(cmdmap, "weight", weight)) {
7c673cae 10191 ss << "unable to parse weight value '"
11fdf7f2 10192 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
10193 err = -EINVAL;
10194 goto reply;
10195 }
10196
10197 string args;
10198 vector<string> argvec;
9f95a23c 10199 cmd_getval(cmdmap, "args", argvec);
7c673cae
FG
10200 map<string,string> loc;
10201 CrushWrapper::parse_loc_map(argvec, &loc);
10202
10203 if (prefix == "osd crush set"
10204 && !_get_stable_crush().item_exists(osdid)) {
10205 err = -ENOENT;
11fdf7f2 10206 ss << "unable to set item id " << osdid << " name '" << osd_name
7c673cae
FG
10207 << "' weight " << weight << " at location " << loc
10208 << ": does not exist";
10209 goto reply;
10210 }
10211
10212 dout(5) << "adding/updating crush item id " << osdid << " name '"
11fdf7f2 10213 << osd_name << "' weight " << weight << " at location "
7c673cae
FG
10214 << loc << dendl;
10215 CrushWrapper newcrush;
10216 _get_pending_crush(newcrush);
10217
10218 string action;
10219 if (prefix == "osd crush set" ||
11fdf7f2 10220 newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) {
7c673cae 10221 action = "set";
11fdf7f2 10222 err = newcrush.update_item(cct, osdid, weight, osd_name, loc);
7c673cae
FG
10223 } else {
10224 action = "add";
11fdf7f2 10225 err = newcrush.insert_item(cct, osdid, weight, osd_name, loc);
7c673cae
FG
10226 if (err == 0)
10227 err = 1;
10228 }
10229
10230 if (err < 0)
10231 goto reply;
10232
10233 if (err == 0 && !_have_pending_crush()) {
11fdf7f2
TL
10234 ss << action << " item id " << osdid << " name '" << osd_name
10235 << "' weight " << weight << " at location " << loc << ": no change";
7c673cae
FG
10236 goto reply;
10237 }
10238
10239 pending_inc.crush.clear();
10240 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
11fdf7f2
TL
10241 ss << action << " item id " << osdid << " name '" << osd_name << "' weight "
10242 << weight << " at location " << loc << " to crush map";
7c673cae
FG
10243 getline(ss, rs);
10244 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10245 get_last_committed() + 1));
10246 return true;
10247
10248 } else if (prefix == "osd crush create-or-move") {
10249 do {
10250 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
10251 if (!osdmap.exists(osdid)) {
10252 err = -ENOENT;
11fdf7f2
TL
10253 ss << osd_name
10254 << " does not exist. create it before updating the crush map";
7c673cae
FG
10255 goto reply;
10256 }
10257
10258 double weight;
9f95a23c 10259 if (!cmd_getval(cmdmap, "weight", weight)) {
7c673cae 10260 ss << "unable to parse weight value '"
11fdf7f2 10261 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
10262 err = -EINVAL;
10263 goto reply;
10264 }
10265
10266 string args;
10267 vector<string> argvec;
9f95a23c 10268 cmd_getval(cmdmap, "args", argvec);
7c673cae
FG
10269 map<string,string> loc;
10270 CrushWrapper::parse_loc_map(argvec, &loc);
10271
11fdf7f2
TL
10272 dout(0) << "create-or-move crush item name '" << osd_name
10273 << "' initial_weight " << weight << " at location " << loc
10274 << dendl;
7c673cae
FG
10275
10276 CrushWrapper newcrush;
10277 _get_pending_crush(newcrush);
10278
11fdf7f2
TL
10279 err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc,
10280 g_conf()->osd_crush_update_weight_set);
7c673cae 10281 if (err == 0) {
11fdf7f2
TL
10282 ss << "create-or-move updated item name '" << osd_name
10283 << "' weight " << weight
7c673cae
FG
10284 << " at location " << loc << " to crush map";
10285 break;
10286 }
10287 if (err > 0) {
10288 pending_inc.crush.clear();
10289 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
11fdf7f2
TL
10290 ss << "create-or-move updating item name '" << osd_name
10291 << "' weight " << weight
7c673cae
FG
10292 << " at location " << loc << " to crush map";
10293 getline(ss, rs);
10294 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10295 get_last_committed() + 1));
10296 return true;
10297 }
10298 } while (false);
10299
10300 } else if (prefix == "osd crush move") {
10301 do {
10302 // osd crush move <name> <loc1> [<loc2> ...]
11fdf7f2 10303 string name;
7c673cae 10304 vector<string> argvec;
9f95a23c
TL
10305 cmd_getval(cmdmap, "name", name);
10306 cmd_getval(cmdmap, "args", argvec);
7c673cae
FG
10307 map<string,string> loc;
10308 CrushWrapper::parse_loc_map(argvec, &loc);
10309
10310 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
10311 CrushWrapper newcrush;
10312 _get_pending_crush(newcrush);
10313
10314 if (!newcrush.name_exists(name)) {
10315 err = -ENOENT;
10316 ss << "item " << name << " does not exist";
10317 break;
10318 }
10319 int id = newcrush.get_item_id(name);
10320
11fdf7f2 10321 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
7c673cae 10322 if (id >= 0) {
11fdf7f2
TL
10323 err = newcrush.create_or_move_item(
10324 cct, id, 0, name, loc,
10325 g_conf()->osd_crush_update_weight_set);
7c673cae 10326 } else {
11fdf7f2 10327 err = newcrush.move_bucket(cct, id, loc);
7c673cae
FG
10328 }
10329 if (err >= 0) {
10330 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10331 pending_inc.crush.clear();
10332 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10333 getline(ss, rs);
10334 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10335 get_last_committed() + 1));
10336 return true;
10337 }
10338 } else {
10339 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10340 err = 0;
10341 }
10342 } while (false);
31f18b77 10343 } else if (prefix == "osd crush swap-bucket") {
11fdf7f2 10344 string source, dest;
9f95a23c
TL
10345 cmd_getval(cmdmap, "source", source);
10346 cmd_getval(cmdmap, "dest", dest);
11fdf7f2
TL
10347
10348 bool force = false;
9f95a23c 10349 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
11fdf7f2 10350
31f18b77
FG
10351 CrushWrapper newcrush;
10352 _get_pending_crush(newcrush);
10353 if (!newcrush.name_exists(source)) {
10354 ss << "source item " << source << " does not exist";
10355 err = -ENOENT;
10356 goto reply;
10357 }
10358 if (!newcrush.name_exists(dest)) {
10359 ss << "dest item " << dest << " does not exist";
10360 err = -ENOENT;
10361 goto reply;
10362 }
10363 int sid = newcrush.get_item_id(source);
10364 int did = newcrush.get_item_id(dest);
10365 int sparent;
11fdf7f2 10366 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) {
31f18b77
FG
10367 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
10368 err = -EPERM;
10369 goto reply;
10370 }
10371 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
11fdf7f2 10372 !force) {
31f18b77
FG
10373 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
10374 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
10375 << "; pass --yes-i-really-mean-it to proceed anyway";
10376 err = -EPERM;
10377 goto reply;
10378 }
11fdf7f2 10379 int r = newcrush.swap_bucket(cct, sid, did);
31f18b77
FG
10380 if (r < 0) {
10381 ss << "failed to swap bucket contents: " << cpp_strerror(r);
224ce89b 10382 err = r;
31f18b77
FG
10383 goto reply;
10384 }
10385 ss << "swapped bucket of " << source << " to " << dest;
10386 pending_inc.crush.clear();
10387 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10388 wait_for_finished_proposal(op,
10389 new Monitor::C_Command(mon, op, err, ss.str(),
10390 get_last_committed() + 1));
10391 return true;
10392 } else if (prefix == "osd crush link") {
10393 // osd crush link <name> <loc1> [<loc2> ...]
10394 string name;
9f95a23c 10395 cmd_getval(cmdmap, "name", name);
31f18b77 10396 vector<string> argvec;
9f95a23c 10397 cmd_getval(cmdmap, "args", argvec);
31f18b77
FG
10398 map<string,string> loc;
10399 CrushWrapper::parse_loc_map(argvec, &loc);
10400
10401 // Need an explicit check for name_exists because get_item_id returns
10402 // 0 on unfound.
10403 int id = osdmap.crush->get_item_id(name);
7c673cae
FG
10404 if (!osdmap.crush->name_exists(name)) {
10405 err = -ENOENT;
10406 ss << "item " << name << " does not exist";
10407 goto reply;
10408 } else {
10409 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
10410 }
11fdf7f2 10411 if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) {
7c673cae
FG
10412 ss << "no need to move item id " << id << " name '" << name
10413 << "' to location " << loc << " in crush map";
10414 err = 0;
10415 goto reply;
10416 }
10417
10418 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
10419 CrushWrapper newcrush;
10420 _get_pending_crush(newcrush);
10421
10422 if (!newcrush.name_exists(name)) {
10423 err = -ENOENT;
10424 ss << "item " << name << " does not exist";
10425 goto reply;
10426 } else {
10427 int id = newcrush.get_item_id(name);
11fdf7f2
TL
10428 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
10429 err = newcrush.link_bucket(cct, id, loc);
7c673cae
FG
10430 if (err >= 0) {
10431 ss << "linked item id " << id << " name '" << name
10432 << "' to location " << loc << " in crush map";
10433 pending_inc.crush.clear();
10434 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10435 } else {
10436 ss << "cannot link item id " << id << " name '" << name
10437 << "' to location " << loc;
10438 goto reply;
10439 }
10440 } else {
10441 ss << "no need to move item id " << id << " name '" << name
10442 << "' to location " << loc << " in crush map";
10443 err = 0;
10444 }
10445 }
10446 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
10447 get_last_committed() + 1));
10448 return true;
10449 } else if (prefix == "osd crush rm" ||
10450 prefix == "osd crush remove" ||
10451 prefix == "osd crush unlink") {
10452 do {
10453 // osd crush rm <id> [ancestor]
10454 CrushWrapper newcrush;
10455 _get_pending_crush(newcrush);
10456
10457 string name;
9f95a23c 10458 cmd_getval(cmdmap, "name", name);
7c673cae
FG
10459
10460 if (!osdmap.crush->name_exists(name)) {
10461 err = 0;
10462 ss << "device '" << name << "' does not appear in the crush map";
10463 break;
10464 }
10465 if (!newcrush.name_exists(name)) {
10466 err = 0;
10467 ss << "device '" << name << "' does not appear in the crush map";
10468 getline(ss, rs);
10469 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10470 get_last_committed() + 1));
10471 return true;
10472 }
10473 int id = newcrush.get_item_id(name);
31f18b77
FG
10474 int ancestor = 0;
10475
7c673cae
FG
10476 bool unlink_only = prefix == "osd crush unlink";
10477 string ancestor_str;
9f95a23c 10478 if (cmd_getval(cmdmap, "ancestor", ancestor_str)) {
7c673cae
FG
10479 if (!newcrush.name_exists(ancestor_str)) {
10480 err = -ENOENT;
10481 ss << "ancestor item '" << ancestor_str
10482 << "' does not appear in the crush map";
10483 break;
10484 }
31f18b77 10485 ancestor = newcrush.get_item_id(ancestor_str);
7c673cae 10486 }
31f18b77
FG
10487
10488 err = prepare_command_osd_crush_remove(
10489 newcrush,
10490 id, ancestor,
10491 (ancestor < 0), unlink_only);
10492
7c673cae
FG
10493 if (err == -ENOENT) {
10494 ss << "item " << id << " does not appear in that position";
10495 err = 0;
10496 break;
10497 }
10498 if (err == 0) {
81eedcae
TL
10499 if (!unlink_only)
10500 pending_inc.new_crush_node_flags[id] = 0;
7c673cae
FG
10501 ss << "removed item id " << id << " name '" << name << "' from crush map";
10502 getline(ss, rs);
10503 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10504 get_last_committed() + 1));
10505 return true;
10506 }
10507 } while (false);
10508
10509 } else if (prefix == "osd crush reweight-all") {
7c673cae
FG
10510 CrushWrapper newcrush;
10511 _get_pending_crush(newcrush);
10512
11fdf7f2 10513 newcrush.reweight(cct);
7c673cae
FG
10514 pending_inc.crush.clear();
10515 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10516 ss << "reweighted crush hierarchy";
10517 getline(ss, rs);
10518 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10519 get_last_committed() + 1));
10520 return true;
10521 } else if (prefix == "osd crush reweight") {
10522 // osd crush reweight <name> <weight>
10523 CrushWrapper newcrush;
10524 _get_pending_crush(newcrush);
10525
10526 string name;
9f95a23c 10527 cmd_getval(cmdmap, "name", name);
7c673cae
FG
10528 if (!newcrush.name_exists(name)) {
10529 err = -ENOENT;
10530 ss << "device '" << name << "' does not appear in the crush map";
10531 goto reply;
10532 }
10533
10534 int id = newcrush.get_item_id(name);
10535 if (id < 0) {
10536 ss << "device '" << name << "' is not a leaf in the crush map";
10537 err = -EINVAL;
10538 goto reply;
10539 }
10540 double w;
9f95a23c 10541 if (!cmd_getval(cmdmap, "weight", w)) {
7c673cae 10542 ss << "unable to parse weight value '"
11fdf7f2 10543 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
10544 err = -EINVAL;
10545 goto reply;
10546 }
10547
11fdf7f2
TL
10548 err = newcrush.adjust_item_weightf(cct, id, w,
10549 g_conf()->osd_crush_update_weight_set);
7c673cae
FG
10550 if (err < 0)
10551 goto reply;
10552 pending_inc.crush.clear();
10553 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10554 ss << "reweighted item id " << id << " name '" << name << "' to " << w
10555 << " in crush map";
10556 getline(ss, rs);
10557 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10558 get_last_committed() + 1));
10559 return true;
10560 } else if (prefix == "osd crush reweight-subtree") {
10561 // osd crush reweight <name> <weight>
10562 CrushWrapper newcrush;
10563 _get_pending_crush(newcrush);
10564
10565 string name;
9f95a23c 10566 cmd_getval(cmdmap, "name", name);
7c673cae
FG
10567 if (!newcrush.name_exists(name)) {
10568 err = -ENOENT;
10569 ss << "device '" << name << "' does not appear in the crush map";
10570 goto reply;
10571 }
10572
10573 int id = newcrush.get_item_id(name);
10574 if (id >= 0) {
10575 ss << "device '" << name << "' is not a subtree in the crush map";
10576 err = -EINVAL;
10577 goto reply;
10578 }
10579 double w;
9f95a23c 10580 if (!cmd_getval(cmdmap, "weight", w)) {
7c673cae 10581 ss << "unable to parse weight value '"
11fdf7f2 10582 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
10583 err = -EINVAL;
10584 goto reply;
10585 }
10586
11fdf7f2
TL
10587 err = newcrush.adjust_subtree_weightf(cct, id, w,
10588 g_conf()->osd_crush_update_weight_set);
7c673cae
FG
10589 if (err < 0)
10590 goto reply;
10591 pending_inc.crush.clear();
10592 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10593 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
10594 << " in crush map";
10595 getline(ss, rs);
10596 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10597 get_last_committed() + 1));
10598 return true;
10599 } else if (prefix == "osd crush tunables") {
10600 CrushWrapper newcrush;
10601 _get_pending_crush(newcrush);
10602
10603 err = 0;
10604 string profile;
9f95a23c 10605 cmd_getval(cmdmap, "profile", profile);
7c673cae
FG
10606 if (profile == "legacy" || profile == "argonaut") {
10607 newcrush.set_tunables_legacy();
10608 } else if (profile == "bobtail") {
10609 newcrush.set_tunables_bobtail();
10610 } else if (profile == "firefly") {
10611 newcrush.set_tunables_firefly();
10612 } else if (profile == "hammer") {
10613 newcrush.set_tunables_hammer();
10614 } else if (profile == "jewel") {
10615 newcrush.set_tunables_jewel();
10616 } else if (profile == "optimal") {
10617 newcrush.set_tunables_optimal();
10618 } else if (profile == "default") {
10619 newcrush.set_tunables_default();
10620 } else {
10621 ss << "unrecognized profile '" << profile << "'";
10622 err = -EINVAL;
10623 goto reply;
10624 }
10625
10626 if (!validate_crush_against_features(&newcrush, ss)) {
10627 err = -EINVAL;
10628 goto reply;
10629 }
10630
10631 pending_inc.crush.clear();
10632 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10633 ss << "adjusted tunables profile to " << profile;
10634 getline(ss, rs);
10635 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10636 get_last_committed() + 1));
10637 return true;
10638 } else if (prefix == "osd crush set-tunable") {
10639 CrushWrapper newcrush;
10640 _get_pending_crush(newcrush);
10641
10642 err = 0;
10643 string tunable;
9f95a23c 10644 cmd_getval(cmdmap, "tunable", tunable);
7c673cae
FG
10645
10646 int64_t value = -1;
9f95a23c 10647 if (!cmd_getval(cmdmap, "value", value)) {
7c673cae 10648 err = -EINVAL;
11fdf7f2
TL
10649 ss << "failed to parse integer value "
10650 << cmd_vartype_stringify(cmdmap.at("value"));
7c673cae
FG
10651 goto reply;
10652 }
10653
10654 if (tunable == "straw_calc_version") {
224ce89b 10655 if (value != 0 && value != 1) {
7c673cae
FG
10656 ss << "value must be 0 or 1; got " << value;
10657 err = -EINVAL;
10658 goto reply;
10659 }
10660 newcrush.set_straw_calc_version(value);
10661 } else {
10662 ss << "unrecognized tunable '" << tunable << "'";
10663 err = -EINVAL;
10664 goto reply;
10665 }
10666
10667 if (!validate_crush_against_features(&newcrush, ss)) {
10668 err = -EINVAL;
10669 goto reply;
10670 }
10671
10672 pending_inc.crush.clear();
10673 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10674 ss << "adjusted tunable " << tunable << " to " << value;
10675 getline(ss, rs);
10676 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10677 get_last_committed() + 1));
10678 return true;
10679
10680 } else if (prefix == "osd crush rule create-simple") {
10681 string name, root, type, mode;
9f95a23c
TL
10682 cmd_getval(cmdmap, "name", name);
10683 cmd_getval(cmdmap, "root", root);
10684 cmd_getval(cmdmap, "type", type);
10685 cmd_getval(cmdmap, "mode", mode);
7c673cae
FG
10686 if (mode == "")
10687 mode = "firstn";
10688
10689 if (osdmap.crush->rule_exists(name)) {
31f18b77
FG
10690 // The name is uniquely associated to a ruleid and the rule it contains
10691 // From the user point of view, the rule is more meaningfull.
10692 ss << "rule " << name << " already exists";
7c673cae
FG
10693 err = 0;
10694 goto reply;
10695 }
10696
10697 CrushWrapper newcrush;
10698 _get_pending_crush(newcrush);
10699
10700 if (newcrush.rule_exists(name)) {
31f18b77
FG
10701 // The name is uniquely associated to a ruleid and the rule it contains
10702 // From the user point of view, the rule is more meaningfull.
10703 ss << "rule " << name << " already exists";
7c673cae
FG
10704 err = 0;
10705 } else {
224ce89b 10706 int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
7c673cae
FG
10707 pg_pool_t::TYPE_REPLICATED, &ss);
10708 if (ruleno < 0) {
10709 err = ruleno;
10710 goto reply;
10711 }
10712
10713 pending_inc.crush.clear();
10714 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10715 }
10716 getline(ss, rs);
10717 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10718 get_last_committed() + 1));
10719 return true;
10720
224ce89b
WB
10721 } else if (prefix == "osd crush rule create-replicated") {
10722 string name, root, type, device_class;
9f95a23c
TL
10723 cmd_getval(cmdmap, "name", name);
10724 cmd_getval(cmdmap, "root", root);
10725 cmd_getval(cmdmap, "type", type);
10726 cmd_getval(cmdmap, "class", device_class);
224ce89b
WB
10727
10728 if (osdmap.crush->rule_exists(name)) {
10729 // The name is uniquely associated to a ruleid and the rule it contains
10730 // From the user point of view, the rule is more meaningfull.
10731 ss << "rule " << name << " already exists";
10732 err = 0;
10733 goto reply;
10734 }
10735
10736 CrushWrapper newcrush;
10737 _get_pending_crush(newcrush);
10738
10739 if (newcrush.rule_exists(name)) {
10740 // The name is uniquely associated to a ruleid and the rule it contains
10741 // From the user point of view, the rule is more meaningfull.
10742 ss << "rule " << name << " already exists";
10743 err = 0;
10744 } else {
10745 int ruleno = newcrush.add_simple_rule(
10746 name, root, type, device_class,
10747 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
10748 if (ruleno < 0) {
10749 err = ruleno;
10750 goto reply;
10751 }
10752
10753 pending_inc.crush.clear();
10754 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10755 }
10756 getline(ss, rs);
10757 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10758 get_last_committed() + 1));
10759 return true;
10760
7c673cae
FG
10761 } else if (prefix == "osd erasure-code-profile rm") {
10762 string name;
9f95a23c 10763 cmd_getval(cmdmap, "name", name);
7c673cae
FG
10764
10765 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
10766 goto wait;
10767
10768 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
10769 err = -EBUSY;
10770 goto reply;
10771 }
10772
10773 if (osdmap.has_erasure_code_profile(name) ||
10774 pending_inc.new_erasure_code_profiles.count(name)) {
10775 if (osdmap.has_erasure_code_profile(name)) {
10776 pending_inc.old_erasure_code_profiles.push_back(name);
10777 } else {
10778 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
10779 pending_inc.new_erasure_code_profiles.erase(name);
10780 }
10781
10782 getline(ss, rs);
10783 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10784 get_last_committed() + 1));
10785 return true;
10786 } else {
10787 ss << "erasure-code-profile " << name << " does not exist";
10788 err = 0;
10789 goto reply;
10790 }
10791
10792 } else if (prefix == "osd erasure-code-profile set") {
10793 string name;
9f95a23c 10794 cmd_getval(cmdmap, "name", name);
7c673cae 10795 vector<string> profile;
9f95a23c 10796 cmd_getval(cmdmap, "profile", profile);
11fdf7f2
TL
10797
10798 bool force = false;
9f95a23c 10799 cmd_getval(cmdmap, "force", force);
11fdf7f2 10800
7c673cae
FG
10801 map<string,string> profile_map;
10802 err = parse_erasure_code_profile(profile, &profile_map, &ss);
10803 if (err)
10804 goto reply;
10805 if (profile_map.find("plugin") == profile_map.end()) {
10806 ss << "erasure-code-profile " << profile_map
10807 << " must contain a plugin entry" << std::endl;
10808 err = -EINVAL;
10809 goto reply;
10810 }
10811 string plugin = profile_map["plugin"];
10812
10813 if (pending_inc.has_erasure_code_profile(name)) {
10814 dout(20) << "erasure code profile " << name << " try again" << dendl;
10815 goto wait;
10816 } else {
7c673cae
FG
10817 err = normalize_profile(name, profile_map, force, &ss);
10818 if (err)
10819 goto reply;
10820
10821 if (osdmap.has_erasure_code_profile(name)) {
10822 ErasureCodeProfile existing_profile_map =
10823 osdmap.get_erasure_code_profile(name);
10824 err = normalize_profile(name, existing_profile_map, force, &ss);
10825 if (err)
10826 goto reply;
10827
10828 if (existing_profile_map == profile_map) {
10829 err = 0;
10830 goto reply;
10831 }
10832 if (!force) {
10833 err = -EPERM;
10834 ss << "will not override erasure code profile " << name
10835 << " because the existing profile "
10836 << existing_profile_map
10837 << " is different from the proposed profile "
10838 << profile_map;
10839 goto reply;
10840 }
10841 }
10842
10843 dout(20) << "erasure code profile set " << name << "="
10844 << profile_map << dendl;
10845 pending_inc.set_erasure_code_profile(name, profile_map);
10846 }
10847
10848 getline(ss, rs);
10849 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10850 get_last_committed() + 1));
10851 return true;
10852
10853 } else if (prefix == "osd crush rule create-erasure") {
10854 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
10855 if (err == -EAGAIN)
10856 goto wait;
10857 if (err)
10858 goto reply;
10859 string name, poolstr;
9f95a23c 10860 cmd_getval(cmdmap, "name", name);
7c673cae 10861 string profile;
9f95a23c 10862 cmd_getval(cmdmap, "profile", profile);
7c673cae
FG
10863 if (profile == "")
10864 profile = "default";
10865 if (profile == "default") {
10866 if (!osdmap.has_erasure_code_profile(profile)) {
10867 if (pending_inc.has_erasure_code_profile(profile)) {
10868 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
10869 goto wait;
10870 }
10871
10872 map<string,string> profile_map;
11fdf7f2 10873 err = osdmap.get_erasure_code_profile_default(cct,
7c673cae
FG
10874 profile_map,
10875 &ss);
10876 if (err)
10877 goto reply;
10878 err = normalize_profile(name, profile_map, true, &ss);
10879 if (err)
10880 goto reply;
10881 dout(20) << "erasure code profile set " << profile << "="
10882 << profile_map << dendl;
10883 pending_inc.set_erasure_code_profile(profile, profile_map);
10884 goto wait;
10885 }
10886 }
10887
31f18b77
FG
10888 int rule;
10889 err = crush_rule_create_erasure(name, profile, &rule, &ss);
7c673cae
FG
10890 if (err < 0) {
10891 switch(err) {
10892 case -EEXIST: // return immediately
10893 ss << "rule " << name << " already exists";
10894 err = 0;
10895 goto reply;
10896 break;
10897 case -EALREADY: // wait for pending to be proposed
10898 ss << "rule " << name << " already exists";
10899 err = 0;
10900 break;
10901 default: // non recoverable error
10902 goto reply;
10903 break;
10904 }
10905 } else {
31f18b77 10906 ss << "created rule " << name << " at " << rule;
7c673cae
FG
10907 }
10908
10909 getline(ss, rs);
10910 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10911 get_last_committed() + 1));
10912 return true;
10913
10914 } else if (prefix == "osd crush rule rm") {
10915 string name;
9f95a23c 10916 cmd_getval(cmdmap, "name", name);
7c673cae
FG
10917
10918 if (!osdmap.crush->rule_exists(name)) {
10919 ss << "rule " << name << " does not exist";
10920 err = 0;
10921 goto reply;
10922 }
10923
10924 CrushWrapper newcrush;
10925 _get_pending_crush(newcrush);
10926
10927 if (!newcrush.rule_exists(name)) {
10928 ss << "rule " << name << " does not exist";
10929 err = 0;
10930 } else {
10931 int ruleno = newcrush.get_rule_id(name);
11fdf7f2 10932 ceph_assert(ruleno >= 0);
7c673cae
FG
10933
10934 // make sure it is not in use.
10935 // FIXME: this is ok in some situations, but let's not bother with that
10936 // complexity now.
10937 int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
3efd9988 10938 if (osdmap.crush_rule_in_use(ruleset)) {
7c673cae
FG
10939 ss << "crush ruleset " << name << " " << ruleset << " is in use";
10940 err = -EBUSY;
10941 goto reply;
10942 }
10943
10944 err = newcrush.remove_rule(ruleno);
10945 if (err < 0) {
10946 goto reply;
10947 }
10948
10949 pending_inc.crush.clear();
10950 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10951 }
10952 getline(ss, rs);
10953 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10954 get_last_committed() + 1));
10955 return true;
10956
b5b8bbf5
FG
10957 } else if (prefix == "osd crush rule rename") {
10958 string srcname;
10959 string dstname;
9f95a23c
TL
10960 cmd_getval(cmdmap, "srcname", srcname);
10961 cmd_getval(cmdmap, "dstname", dstname);
b5b8bbf5
FG
10962 if (srcname.empty() || dstname.empty()) {
10963 ss << "must specify both source rule name and destination rule name";
10964 err = -EINVAL;
10965 goto reply;
10966 }
10967 if (srcname == dstname) {
10968 ss << "destination rule name is equal to source rule name";
10969 err = 0;
10970 goto reply;
10971 }
10972
10973 CrushWrapper newcrush;
10974 _get_pending_crush(newcrush);
181888fb
FG
10975 if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
10976 // srcname does not exist and dstname already exists
10977 // suppose this is a replay and return success
10978 // (so this command is idempotent)
10979 ss << "already renamed to '" << dstname << "'";
10980 err = 0;
10981 goto reply;
10982 }
10983
b5b8bbf5
FG
10984 err = newcrush.rename_rule(srcname, dstname, &ss);
10985 if (err < 0) {
10986 // ss has reason for failure
10987 goto reply;
10988 }
10989 pending_inc.crush.clear();
10990 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10991 getline(ss, rs);
10992 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10993 get_last_committed() + 1));
10994 return true;
10995
7c673cae
FG
10996 } else if (prefix == "osd setmaxosd") {
10997 int64_t newmax;
9f95a23c 10998 if (!cmd_getval(cmdmap, "newmax", newmax)) {
7c673cae 10999 ss << "unable to parse 'newmax' value '"
11fdf7f2 11000 << cmd_vartype_stringify(cmdmap.at("newmax")) << "'";
7c673cae
FG
11001 err = -EINVAL;
11002 goto reply;
11003 }
11004
11fdf7f2 11005 if (newmax > g_conf()->mon_max_osd) {
7c673cae
FG
11006 err = -ERANGE;
11007 ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
11fdf7f2 11008 << g_conf()->mon_max_osd << ")";
7c673cae
FG
11009 goto reply;
11010 }
11011
11012 // Don't allow shrinking OSD number as this will cause data loss
11013 // and may cause kernel crashes.
11014 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
11015 if (newmax < osdmap.get_max_osd()) {
11016 // Check if the OSDs exist between current max and new value.
11017 // If there are any OSDs exist, then don't allow shrinking number
11018 // of OSDs.
11019 for (int i = newmax; i < osdmap.get_max_osd(); i++) {
11020 if (osdmap.exists(i)) {
11021 err = -EBUSY;
11022 ss << "cannot shrink max_osd to " << newmax
11023 << " because osd." << i << " (and possibly others) still in use";
11024 goto reply;
11025 }
11026 }
11027 }
11028
11029 pending_inc.new_max_osd = newmax;
11030 ss << "set new max_osd = " << pending_inc.new_max_osd;
11031 getline(ss, rs);
11032 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11033 get_last_committed() + 1));
11034 return true;
11035
11036 } else if (prefix == "osd set-full-ratio" ||
11037 prefix == "osd set-backfillfull-ratio" ||
11038 prefix == "osd set-nearfull-ratio") {
7c673cae 11039 double n;
9f95a23c 11040 if (!cmd_getval(cmdmap, "ratio", n)) {
7c673cae 11041 ss << "unable to parse 'ratio' value '"
11fdf7f2 11042 << cmd_vartype_stringify(cmdmap.at("ratio")) << "'";
7c673cae
FG
11043 err = -EINVAL;
11044 goto reply;
11045 }
11046 if (prefix == "osd set-full-ratio")
11047 pending_inc.new_full_ratio = n;
11048 else if (prefix == "osd set-backfillfull-ratio")
11049 pending_inc.new_backfillfull_ratio = n;
11050 else if (prefix == "osd set-nearfull-ratio")
11051 pending_inc.new_nearfull_ratio = n;
11052 ss << prefix << " " << n;
11053 getline(ss, rs);
11054 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11055 get_last_committed() + 1));
11056 return true;
11057 } else if (prefix == "osd set-require-min-compat-client") {
7c673cae 11058 string v;
9f95a23c
TL
11059 cmd_getval(cmdmap, "version", v);
11060 ceph_release_t vno = ceph_release_from_name(v);
11061 if (!vno) {
7c673cae
FG
11062 ss << "version " << v << " is not recognized";
11063 err = -EINVAL;
11064 goto reply;
11065 }
11066 OSDMap newmap;
11067 newmap.deepish_copy_from(osdmap);
11068 newmap.apply_incremental(pending_inc);
31f18b77
FG
11069 newmap.require_min_compat_client = vno;
11070 auto mvno = newmap.get_min_compat_client();
11071 if (vno < mvno) {
9f95a23c
TL
11072 ss << "osdmap current utilizes features that require " << mvno
11073 << "; cannot set require_min_compat_client below that to " << vno;
7c673cae
FG
11074 err = -EPERM;
11075 goto reply;
11076 }
11fdf7f2 11077 bool sure = false;
9f95a23c 11078 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11fdf7f2 11079 if (!sure) {
31f18b77
FG
11080 FeatureMap m;
11081 mon->get_combined_feature_map(&m);
9f95a23c 11082 uint64_t features = ceph_release_features(ceph::to_integer<int>(vno));
31f18b77
FG
11083 bool first = true;
11084 bool ok = true;
11085 for (int type : {
11086 CEPH_ENTITY_TYPE_CLIENT,
11087 CEPH_ENTITY_TYPE_MDS,
11088 CEPH_ENTITY_TYPE_MGR }) {
11089 auto p = m.m.find(type);
11090 if (p == m.m.end()) {
11091 continue;
11092 }
11093 for (auto& q : p->second) {
11094 uint64_t missing = ~q.first & features;
11095 if (missing) {
11096 if (first) {
11097 ss << "cannot set require_min_compat_client to " << v << ": ";
11098 } else {
11099 ss << "; ";
11100 }
11101 first = false;
11102 ss << q.second << " connected " << ceph_entity_type_name(type)
11103 << "(s) look like " << ceph_release_name(
11104 ceph_release_from_features(q.first))
11105 << " (missing 0x" << std::hex << missing << std::dec << ")";
11106 ok = false;
11107 }
11108 }
11109 }
11110 if (!ok) {
11111 ss << "; add --yes-i-really-mean-it to do it anyway";
11112 err = -EPERM;
11113 goto reply;
11114 }
11115 }
9f95a23c 11116 ss << "set require_min_compat_client to " << vno;
31f18b77 11117 pending_inc.new_require_min_compat_client = vno;
7c673cae
FG
11118 getline(ss, rs);
11119 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11120 get_last_committed() + 1));
11121 return true;
11122 } else if (prefix == "osd pause") {
11123 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11124
11125 } else if (prefix == "osd unpause") {
11126 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11127
11128 } else if (prefix == "osd set") {
11fdf7f2 11129 bool sure = false;
9f95a23c 11130 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11fdf7f2 11131
7c673cae 11132 string key;
9f95a23c
TL
11133 cmd_getval(cmdmap, "key", key);
11134 if (key == "pause")
7c673cae
FG
11135 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11136 else if (key == "noup")
11137 return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
11138 else if (key == "nodown")
11139 return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
11140 else if (key == "noout")
11141 return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
11142 else if (key == "noin")
11143 return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
11144 else if (key == "nobackfill")
11145 return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
11146 else if (key == "norebalance")
11147 return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
11148 else if (key == "norecover")
11149 return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
11150 else if (key == "noscrub")
11151 return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
11152 else if (key == "nodeep-scrub")
11153 return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11154 else if (key == "notieragent")
11155 return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11fdf7f2
TL
11156 else if (key == "nosnaptrim")
11157 return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
11158 else if (key == "pglog_hardlimit") {
11159 if (!osdmap.get_num_up_osds() && !sure) {
f64942e4
AA
11160 ss << "Not advisable to continue since no OSDs are up. Pass "
11161 << "--yes-i-really-mean-it if you really wish to continue.";
11162 err = -EPERM;
11163 goto reply;
11164 }
11165 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
11166 // we are reusing a jewel feature bit that was retired in luminous.
9f95a23c 11167 if (osdmap.require_osd_release >= ceph_release_t::luminous &&
f64942e4 11168 (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
11fdf7f2 11169 || sure)) {
f64942e4
AA
11170 return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
11171 } else {
11172 ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
11173 err = -EPERM;
11174 goto reply;
11175 }
7c673cae
FG
11176 } else {
11177 ss << "unrecognized flag '" << key << "'";
11178 err = -EINVAL;
11179 }
11180
11181 } else if (prefix == "osd unset") {
11182 string key;
9f95a23c
TL
11183 cmd_getval(cmdmap, "key", key);
11184 if (key == "pause")
7c673cae
FG
11185 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11186 else if (key == "noup")
11187 return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
11188 else if (key == "nodown")
11189 return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
11190 else if (key == "noout")
11191 return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
11192 else if (key == "noin")
11193 return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
11194 else if (key == "nobackfill")
11195 return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
11196 else if (key == "norebalance")
11197 return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
11198 else if (key == "norecover")
11199 return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
11200 else if (key == "noscrub")
11201 return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
11202 else if (key == "nodeep-scrub")
11203 return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11204 else if (key == "notieragent")
11205 return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11fdf7f2
TL
11206 else if (key == "nosnaptrim")
11207 return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
224ce89b 11208 else {
7c673cae
FG
11209 ss << "unrecognized flag '" << key << "'";
11210 err = -EINVAL;
11211 }
11212
31f18b77
FG
11213 } else if (prefix == "osd require-osd-release") {
11214 string release;
9f95a23c 11215 cmd_getval(cmdmap, "release", release);
11fdf7f2 11216 bool sure = false;
9f95a23c
TL
11217 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11218 ceph_release_t rel = ceph_release_from_name(release.c_str());
11219 if (!rel) {
31f18b77
FG
11220 ss << "unrecognized release " << release;
11221 err = -EINVAL;
11222 goto reply;
11223 }
d2e6a577
FG
11224 if (rel == osdmap.require_osd_release) {
11225 // idempotent
11226 err = 0;
11227 goto reply;
11228 }
9f95a23c 11229 ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);
11fdf7f2
TL
11230 if (!osdmap.get_num_up_osds() && !sure) {
11231 ss << "Not advisable to continue since no OSDs are up. Pass "
11232 << "--yes-i-really-mean-it if you really wish to continue.";
11233 err = -EPERM;
11234 goto reply;
11235 }
9f95a23c 11236 if (rel == ceph_release_t::mimic) {
11fdf7f2
TL
11237 if (!mon->monmap->get_required_features().contains_all(
11238 ceph::features::mon::FEATURE_MIMIC)) {
11239 ss << "not all mons are mimic";
11240 err = -EPERM;
11241 goto reply;
11242 }
11243 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_MIMIC))
11244 && !sure) {
11245 ss << "not all up OSDs have CEPH_FEATURE_SERVER_MIMIC feature";
11246 err = -EPERM;
11247 goto reply;
11248 }
9f95a23c 11249 } else if (rel == ceph_release_t::nautilus) {
11fdf7f2
TL
11250 if (!mon->monmap->get_required_features().contains_all(
11251 ceph::features::mon::FEATURE_NAUTILUS)) {
11252 ss << "not all mons are nautilus";
11253 err = -EPERM;
11254 goto reply;
11255 }
11256 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_NAUTILUS))
11257 && !sure) {
11258 ss << "not all up OSDs have CEPH_FEATURE_SERVER_NAUTILUS feature";
31f18b77
FG
11259 err = -EPERM;
11260 goto reply;
11261 }
9f95a23c
TL
11262 } else if (rel == ceph_release_t::octopus) {
11263 if (!mon->monmap->get_required_features().contains_all(
11264 ceph::features::mon::FEATURE_OCTOPUS)) {
11265 ss << "not all mons are octopus";
11266 err = -EPERM;
11267 goto reply;
11268 }
11269 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_OCTOPUS))
11270 && !sure) {
11271 ss << "not all up OSDs have CEPH_FEATURE_SERVER_OCTOPUS feature";
11272 err = -EPERM;
11273 goto reply;
11274 }
31f18b77
FG
11275 } else {
11276 ss << "not supported for this release yet";
11277 err = -EPERM;
11278 goto reply;
11279 }
11280 if (rel < osdmap.require_osd_release) {
11281 ss << "require_osd_release cannot be lowered once it has been set";
11282 err = -EPERM;
11283 goto reply;
11284 }
11285 pending_inc.new_require_osd_release = rel;
11286 goto update;
7c673cae 11287 } else if (prefix == "osd down" ||
9f95a23c
TL
11288 prefix == "osd out" ||
11289 prefix == "osd in" ||
11290 prefix == "osd rm" ||
11291 prefix == "osd stop") {
7c673cae
FG
11292
11293 bool any = false;
31f18b77
FG
11294 bool stop = false;
11295 bool verbose = true;
9f95a23c 11296 bool definitely_dead = false;
7c673cae
FG
11297
11298 vector<string> idvec;
9f95a23c
TL
11299 cmd_getval(cmdmap, "ids", idvec);
11300 cmd_getval(cmdmap, "definitely_dead", definitely_dead);
11301 derr << "definitely_dead " << (int)definitely_dead << dendl;
31f18b77
FG
11302 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
11303 set<int> osds;
11304
11305 // wildcard?
11306 if (j == 0 &&
11307 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
11308 if (prefix == "osd in") {
11309 // touch out osds only
81eedcae 11310 osdmap.get_out_existing_osds(osds);
31f18b77
FG
11311 } else {
11312 osdmap.get_all_osds(osds);
11313 }
11314 stop = true;
11315 verbose = false; // so the output is less noisy.
11316 } else {
11317 long osd = parse_osd_id(idvec[j].c_str(), &ss);
11318 if (osd < 0) {
11319 ss << "invalid osd id" << osd;
11320 err = -EINVAL;
11321 continue;
11322 } else if (!osdmap.exists(osd)) {
11323 ss << "osd." << osd << " does not exist. ";
11324 continue;
11325 }
11326
11327 osds.insert(osd);
7c673cae 11328 }
31f18b77
FG
11329
11330 for (auto &osd : osds) {
11331 if (prefix == "osd down") {
11332 if (osdmap.is_down(osd)) {
11333 if (verbose)
11334 ss << "osd." << osd << " is already down. ";
11335 } else {
11336 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
11337 ss << "marked down osd." << osd << ". ";
11338 any = true;
11339 }
9f95a23c
TL
11340 if (definitely_dead) {
11341 if (!pending_inc.new_xinfo.count(osd)) {
11342 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11343 }
11344 if (pending_inc.new_xinfo[osd].dead_epoch < pending_inc.epoch) {
11345 any = true;
11346 }
11347 pending_inc.new_xinfo[osd].dead_epoch = pending_inc.epoch;
11348 }
31f18b77
FG
11349 } else if (prefix == "osd out") {
11350 if (osdmap.is_out(osd)) {
11351 if (verbose)
11352 ss << "osd." << osd << " is already out. ";
11353 } else {
11354 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
11355 if (osdmap.osd_weight[osd]) {
11356 if (pending_inc.new_xinfo.count(osd) == 0) {
11357 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11358 }
11359 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
7c673cae 11360 }
31f18b77 11361 ss << "marked out osd." << osd << ". ";
224ce89b
WB
11362 std::ostringstream msg;
11363 msg << "Client " << op->get_session()->entity_name
11364 << " marked osd." << osd << " out";
11365 if (osdmap.is_up(osd)) {
11366 msg << ", while it was still marked up";
11367 } else {
3efd9988
FG
11368 auto period = ceph_clock_now() - down_pending_out[osd];
11369 msg << ", after it was down for " << int(period.sec())
224ce89b
WB
11370 << " seconds";
11371 }
11372
11373 mon->clog->info() << msg.str();
31f18b77 11374 any = true;
7c673cae 11375 }
31f18b77
FG
11376 } else if (prefix == "osd in") {
11377 if (osdmap.is_in(osd)) {
11378 if (verbose)
11379 ss << "osd." << osd << " is already in. ";
11380 } else {
11381 if (osdmap.osd_xinfo[osd].old_weight > 0) {
11382 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
11383 if (pending_inc.new_xinfo.count(osd) == 0) {
11384 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11385 }
11386 pending_inc.new_xinfo[osd].old_weight = 0;
11387 } else {
11388 pending_inc.new_weight[osd] = CEPH_OSD_IN;
7c673cae 11389 }
31f18b77
FG
11390 ss << "marked in osd." << osd << ". ";
11391 any = true;
11392 }
11393 } else if (prefix == "osd rm") {
11394 err = prepare_command_osd_remove(osd);
11395
11396 if (err == -EBUSY) {
11397 if (any)
11398 ss << ", ";
11399 ss << "osd." << osd << " is still up; must be down before removal. ";
7c673cae 11400 } else {
11fdf7f2 11401 ceph_assert(err == 0);
31f18b77
FG
11402 if (any) {
11403 ss << ", osd." << osd;
11404 } else {
11405 ss << "removed osd." << osd;
11406 }
11407 any = true;
7c673cae 11408 }
9f95a23c
TL
11409 } else if (prefix == "osd stop") {
11410 if (osdmap.is_stop(osd)) {
11411 if (verbose)
11412 ss << "osd." << osd << " is already stopped. ";
11413 } else if (osdmap.is_down(osd)) {
11414 pending_inc.pending_osd_state_set(osd, CEPH_OSD_STOP);
11415 ss << "stop down osd." << osd << ". ";
11416 any = true;
11417 } else {
11418 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP | CEPH_OSD_STOP);
11419 ss << "stop osd." << osd << ". ";
11420 any = true;
11421 }
31f18b77
FG
11422 }
11423 }
11424 }
11425 if (any) {
11426 getline(ss, rs);
11427 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11428 get_last_committed() + 1));
11429 return true;
11430 }
81eedcae
TL
11431 } else if (prefix == "osd set-group" ||
11432 prefix == "osd unset-group" ||
11433 prefix == "osd add-noup" ||
31f18b77
FG
11434 prefix == "osd add-nodown" ||
11435 prefix == "osd add-noin" ||
81eedcae
TL
11436 prefix == "osd add-noout" ||
11437 prefix == "osd rm-noup" ||
11438 prefix == "osd rm-nodown" ||
11439 prefix == "osd rm-noin" ||
11440 prefix == "osd rm-noout") {
11441 bool do_set = prefix == "osd set-group" ||
11442 prefix.find("add") != string::npos;
11443 string flag_str;
11444 unsigned flags = 0;
11445 vector<string> who;
11446 if (prefix == "osd set-group" || prefix == "osd unset-group") {
9f95a23c
TL
11447 cmd_getval(cmdmap, "flags", flag_str);
11448 cmd_getval(cmdmap, "who", who);
81eedcae
TL
11449 vector<string> raw_flags;
11450 boost::split(raw_flags, flag_str, boost::is_any_of(","));
11451 for (auto& f : raw_flags) {
11452 if (f == "noup")
11453 flags |= CEPH_OSD_NOUP;
11454 else if (f == "nodown")
11455 flags |= CEPH_OSD_NODOWN;
11456 else if (f == "noin")
11457 flags |= CEPH_OSD_NOIN;
11458 else if (f == "noout")
11459 flags |= CEPH_OSD_NOOUT;
11460 else {
11461 ss << "unrecognized flag '" << f << "', must be one of "
11462 << "{noup,nodown,noin,noout}";
11463 err = -EINVAL;
11464 goto reply;
11465 }
11466 }
31f18b77 11467 } else {
9f95a23c 11468 cmd_getval(cmdmap, "ids", who);
81eedcae
TL
11469 if (prefix.find("noup") != string::npos)
11470 flags = CEPH_OSD_NOUP;
11471 else if (prefix.find("nodown") != string::npos)
11472 flags = CEPH_OSD_NODOWN;
11473 else if (prefix.find("noin") != string::npos)
11474 flags = CEPH_OSD_NOIN;
11475 else if (prefix.find("noout") != string::npos)
11476 flags = CEPH_OSD_NOOUT;
11477 else
11478 ceph_assert(0 == "Unreachable!");
31f18b77 11479 }
81eedcae
TL
11480 if (flags == 0) {
11481 ss << "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11482 err = -EINVAL;
11483 goto reply;
11484 }
11485 if (who.empty()) {
11486 ss << "must specify at least one or more targets to set/unset";
11487 err = -EINVAL;
11488 goto reply;
11489 }
11490 set<int> osds;
11491 set<int> crush_nodes;
11492 set<int> device_classes;
11493 for (auto& w : who) {
11494 if (w == "any" || w == "all" || w == "*") {
31f18b77 11495 osdmap.get_all_osds(osds);
81eedcae 11496 break;
31f18b77 11497 }
81eedcae
TL
11498 std::stringstream ts;
11499 if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) {
11500 osds.insert(osd);
11501 } else if (osdmap.crush->name_exists(w)) {
11502 crush_nodes.insert(osdmap.crush->get_item_id(w));
11503 } else if (osdmap.crush->class_exists(w)) {
11504 device_classes.insert(osdmap.crush->get_class_id(w));
11505 } else {
11506 ss << "unable to parse osd id or crush node or device class: "
11507 << "\"" << w << "\". ";
7c673cae
FG
11508 }
11509 }
81eedcae
TL
11510 if (osds.empty() && crush_nodes.empty() && device_classes.empty()) {
11511 // ss has reason for failure
11512 err = -EINVAL;
11513 goto reply;
31f18b77 11514 }
31f18b77 11515 bool any = false;
81eedcae
TL
11516 for (auto osd : osds) {
11517 if (!osdmap.exists(osd)) {
11518 ss << "osd." << osd << " does not exist. ";
11519 continue;
11520 }
11521 if (do_set) {
11522 if (flags & CEPH_OSD_NOUP) {
11523 any |= osdmap.is_noup_by_osd(osd) ?
11524 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) :
11525 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
31f18b77 11526 }
81eedcae
TL
11527 if (flags & CEPH_OSD_NODOWN) {
11528 any |= osdmap.is_nodown_by_osd(osd) ?
11529 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) :
11530 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
11531 }
11532 if (flags & CEPH_OSD_NOIN) {
11533 any |= osdmap.is_noin_by_osd(osd) ?
11534 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) :
11535 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
11536 }
11537 if (flags & CEPH_OSD_NOOUT) {
11538 any |= osdmap.is_noout_by_osd(osd) ?
11539 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) :
11540 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
31f18b77 11541 }
31f18b77 11542 } else {
81eedcae
TL
11543 if (flags & CEPH_OSD_NOUP) {
11544 any |= osdmap.is_noup_by_osd(osd) ?
11545 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) :
11546 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP);
31f18b77 11547 }
81eedcae
TL
11548 if (flags & CEPH_OSD_NODOWN) {
11549 any |= osdmap.is_nodown_by_osd(osd) ?
11550 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) :
11551 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
31f18b77 11552 }
81eedcae
TL
11553 if (flags & CEPH_OSD_NOIN) {
11554 any |= osdmap.is_noin_by_osd(osd) ?
11555 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) :
11556 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN);
11557 }
11558 if (flags & CEPH_OSD_NOOUT) {
11559 any |= osdmap.is_noout_by_osd(osd) ?
11560 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) :
11561 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
31f18b77
FG
11562 }
11563 }
11564 }
81eedcae
TL
11565 for (auto& id : crush_nodes) {
11566 auto old_flags = osdmap.get_crush_node_flags(id);
11567 auto& pending_flags = pending_inc.new_crush_node_flags[id];
11568 pending_flags |= old_flags; // adopt existing flags first!
11569 if (do_set) {
11570 pending_flags |= flags;
11571 } else {
11572 pending_flags &= ~flags;
11573 }
11574 any = true;
11575 }
11576 for (auto& id : device_classes) {
11577 auto old_flags = osdmap.get_device_class_flags(id);
11578 auto& pending_flags = pending_inc.new_device_class_flags[id];
11579 pending_flags |= old_flags;
11580 if (do_set) {
11581 pending_flags |= flags;
11582 } else {
11583 pending_flags &= ~flags;
11584 }
11585 any = true;
11586 }
31f18b77
FG
11587 if (any) {
11588 getline(ss, rs);
11589 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11590 get_last_committed() + 1));
7c673cae
FG
11591 return true;
11592 }
11593 } else if (prefix == "osd pg-temp") {
11594 string pgidstr;
9f95a23c 11595 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
7c673cae 11596 ss << "unable to parse 'pgid' value '"
11fdf7f2 11597 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
7c673cae
FG
11598 err = -EINVAL;
11599 goto reply;
11600 }
11601 pg_t pgid;
11602 if (!pgid.parse(pgidstr.c_str())) {
11603 ss << "invalid pgid '" << pgidstr << "'";
11604 err = -EINVAL;
11605 goto reply;
11606 }
11607 if (!osdmap.pg_exists(pgid)) {
11608 ss << "pg " << pgid << " does not exist";
11609 err = -ENOENT;
11610 goto reply;
11611 }
11612 if (pending_inc.new_pg_temp.count(pgid)) {
11613 dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
11614 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11615 return true;
11616 }
11617
11618 vector<int64_t> id_vec;
11619 vector<int32_t> new_pg_temp;
9f95a23c 11620 cmd_getval(cmdmap, "id", id_vec);
11fdf7f2
TL
11621 if (id_vec.empty()) {
11622 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>();
11623 ss << "done cleaning up pg_temp of " << pgid;
11624 goto update;
7c673cae
FG
11625 }
11626 for (auto osd : id_vec) {
11627 if (!osdmap.exists(osd)) {
11628 ss << "osd." << osd << " does not exist";
11629 err = -ENOENT;
11630 goto reply;
11631 }
11632 new_pg_temp.push_back(osd);
11633 }
11634
224ce89b
WB
11635 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
11636 if ((int)new_pg_temp.size() < pool_min_size) {
11637 ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
11638 << pool_min_size << ")";
11639 err = -EINVAL;
11640 goto reply;
11641 }
11642
11643 int pool_size = osdmap.get_pg_pool_size(pgid);
11644 if ((int)new_pg_temp.size() > pool_size) {
11645 ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
11646 << pool_size << ")";
11647 err = -EINVAL;
11648 goto reply;
11649 }
11650
7c673cae
FG
11651 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
11652 new_pg_temp.begin(), new_pg_temp.end());
11653 ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
11654 goto update;
11655 } else if (prefix == "osd primary-temp") {
11656 string pgidstr;
9f95a23c 11657 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
7c673cae 11658 ss << "unable to parse 'pgid' value '"
11fdf7f2 11659 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
7c673cae
FG
11660 err = -EINVAL;
11661 goto reply;
11662 }
11663 pg_t pgid;
11664 if (!pgid.parse(pgidstr.c_str())) {
11665 ss << "invalid pgid '" << pgidstr << "'";
11666 err = -EINVAL;
11667 goto reply;
11668 }
11669 if (!osdmap.pg_exists(pgid)) {
11670 ss << "pg " << pgid << " does not exist";
11671 err = -ENOENT;
11672 goto reply;
11673 }
11674
11675 int64_t osd;
9f95a23c 11676 if (!cmd_getval(cmdmap, "id", osd)) {
7c673cae 11677 ss << "unable to parse 'id' value '"
11fdf7f2 11678 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
7c673cae
FG
11679 err = -EINVAL;
11680 goto reply;
11681 }
11682 if (osd != -1 && !osdmap.exists(osd)) {
11683 ss << "osd." << osd << " does not exist";
11684 err = -ENOENT;
11685 goto reply;
11686 }
11687
9f95a23c
TL
11688 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
11689 osdmap.require_min_compat_client < ceph_release_t::firefly) {
31f18b77 11690 ss << "require_min_compat_client "
9f95a23c 11691 << osdmap.require_min_compat_client
7c673cae
FG
11692 << " < firefly, which is required for primary-temp";
11693 err = -EPERM;
11694 goto reply;
7c673cae
FG
11695 }
11696
11697 pending_inc.new_primary_temp[pgid] = osd;
11698 ss << "set " << pgid << " primary_temp mapping to " << osd;
11699 goto update;
11fdf7f2
TL
11700 } else if (prefix == "pg repeer") {
11701 pg_t pgid;
11702 string pgidstr;
9f95a23c 11703 cmd_getval(cmdmap, "pgid", pgidstr);
11fdf7f2
TL
11704 if (!pgid.parse(pgidstr.c_str())) {
11705 ss << "invalid pgid '" << pgidstr << "'";
11706 err = -EINVAL;
11707 goto reply;
11708 }
11709 if (!osdmap.pg_exists(pgid)) {
11710 ss << "pg '" << pgidstr << "' does not exist";
11711 err = -ENOENT;
11712 goto reply;
11713 }
11714 vector<int> acting;
11715 int primary;
11716 osdmap.pg_to_acting_osds(pgid, &acting, &primary);
11717 if (primary < 0) {
11718 err = -EAGAIN;
11719 ss << "pg currently has no primary";
11720 goto reply;
11721 }
11722 if (acting.size() > 1) {
11723 // map to just primary; it will map back to what it wants
11724 pending_inc.new_pg_temp[pgid] = { primary };
11725 } else {
11726 // hmm, pick another arbitrary osd to induce a change. Note
11727 // that this won't work if there is only one suitable OSD in the cluster.
11728 int i;
11729 bool done = false;
11730 for (i = 0; i < osdmap.get_max_osd(); ++i) {
11731 if (i == primary || !osdmap.is_up(i) || !osdmap.exists(i)) {
11732 continue;
11733 }
11734 pending_inc.new_pg_temp[pgid] = { primary, i };
11735 done = true;
11736 break;
11737 }
11738 if (!done) {
11739 err = -EAGAIN;
11740 ss << "not enough up OSDs in the cluster to force repeer";
11741 goto reply;
11742 }
11743 }
11744 goto update;
224ce89b
WB
11745 } else if (prefix == "osd pg-upmap" ||
11746 prefix == "osd rm-pg-upmap" ||
11747 prefix == "osd pg-upmap-items" ||
11748 prefix == "osd rm-pg-upmap-items") {
9f95a23c 11749 if (osdmap.require_min_compat_client < ceph_release_t::luminous) {
31f18b77 11750 ss << "min_compat_client "
9f95a23c 11751 << osdmap.require_min_compat_client
224ce89b
WB
11752 << " < luminous, which is required for pg-upmap. "
11753 << "Try 'ceph osd set-require-min-compat-client luminous' "
11754 << "before using the new interface";
7c673cae
FG
11755 err = -EPERM;
11756 goto reply;
11757 }
11758 err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
11759 if (err == -EAGAIN)
11760 goto wait;
11761 if (err < 0)
11762 goto reply;
11763 string pgidstr;
9f95a23c 11764 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
7c673cae 11765 ss << "unable to parse 'pgid' value '"
11fdf7f2 11766 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
7c673cae
FG
11767 err = -EINVAL;
11768 goto reply;
11769 }
11770 pg_t pgid;
11771 if (!pgid.parse(pgidstr.c_str())) {
11772 ss << "invalid pgid '" << pgidstr << "'";
11773 err = -EINVAL;
11774 goto reply;
11775 }
11776 if (!osdmap.pg_exists(pgid)) {
11777 ss << "pg " << pgid << " does not exist";
11778 err = -ENOENT;
11779 goto reply;
11780 }
94b18763
FG
11781 if (pending_inc.old_pools.count(pgid.pool())) {
11782 ss << "pool of " << pgid << " is pending removal";
11783 err = -ENOENT;
11784 getline(ss, rs);
11785 wait_for_finished_proposal(op,
11786 new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
11787 return true;
11788 }
224ce89b
WB
11789
11790 enum {
11791 OP_PG_UPMAP,
11792 OP_RM_PG_UPMAP,
11793 OP_PG_UPMAP_ITEMS,
11794 OP_RM_PG_UPMAP_ITEMS,
11795 } option;
11796
11797 if (prefix == "osd pg-upmap") {
11798 option = OP_PG_UPMAP;
11799 } else if (prefix == "osd rm-pg-upmap") {
11800 option = OP_RM_PG_UPMAP;
11801 } else if (prefix == "osd pg-upmap-items") {
11802 option = OP_PG_UPMAP_ITEMS;
11803 } else {
11804 option = OP_RM_PG_UPMAP_ITEMS;
7c673cae 11805 }
224ce89b
WB
11806
11807 // check pending upmap changes
11808 switch (option) {
11809 case OP_PG_UPMAP: // fall through
11810 case OP_RM_PG_UPMAP:
11811 if (pending_inc.new_pg_upmap.count(pgid) ||
11812 pending_inc.old_pg_upmap.count(pgid)) {
11813 dout(10) << __func__ << " waiting for pending update on "
11814 << pgid << dendl;
11815 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11816 return true;
7c673cae 11817 }
224ce89b 11818 break;
7c673cae 11819
224ce89b
WB
11820 case OP_PG_UPMAP_ITEMS: // fall through
11821 case OP_RM_PG_UPMAP_ITEMS:
11822 if (pending_inc.new_pg_upmap_items.count(pgid) ||
11823 pending_inc.old_pg_upmap_items.count(pgid)) {
11824 dout(10) << __func__ << " waiting for pending update on "
11825 << pgid << dendl;
11826 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11827 return true;
11828 }
11829 break;
7c673cae 11830
224ce89b 11831 default:
11fdf7f2 11832 ceph_abort_msg("invalid option");
7c673cae 11833 }
224ce89b
WB
11834
11835 switch (option) {
11836 case OP_PG_UPMAP:
11837 {
11838 vector<int64_t> id_vec;
9f95a23c 11839 if (!cmd_getval(cmdmap, "id", id_vec)) {
224ce89b 11840 ss << "unable to parse 'id' value(s) '"
11fdf7f2 11841 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
224ce89b
WB
11842 err = -EINVAL;
11843 goto reply;
11844 }
11845
11846 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
11847 if ((int)id_vec.size() < pool_min_size) {
11848 ss << "num of osds (" << id_vec.size() <<") < pool min size ("
11849 << pool_min_size << ")";
11850 err = -EINVAL;
11851 goto reply;
11852 }
11853
11854 int pool_size = osdmap.get_pg_pool_size(pgid);
11855 if ((int)id_vec.size() > pool_size) {
11856 ss << "num of osds (" << id_vec.size() <<") > pool size ("
11857 << pool_size << ")";
11858 err = -EINVAL;
11859 goto reply;
11860 }
11861
11862 vector<int32_t> new_pg_upmap;
11863 for (auto osd : id_vec) {
11864 if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
11865 ss << "osd." << osd << " does not exist";
11866 err = -ENOENT;
11867 goto reply;
11868 }
11869 auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
11870 if (it != new_pg_upmap.end()) {
11871 ss << "osd." << osd << " already exists, ";
11872 continue;
11873 }
11874 new_pg_upmap.push_back(osd);
11875 }
11876
11877 if (new_pg_upmap.empty()) {
11878 ss << "no valid upmap items(pairs) is specified";
11879 err = -EINVAL;
11880 goto reply;
11881 }
11882
11883 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
11884 new_pg_upmap.begin(), new_pg_upmap.end());
11885 ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
7c673cae 11886 }
224ce89b
WB
11887 break;
11888
11889 case OP_RM_PG_UPMAP:
11890 {
11891 pending_inc.old_pg_upmap.insert(pgid);
11892 ss << "clear " << pgid << " pg_upmap mapping";
7c673cae 11893 }
224ce89b 11894 break;
7c673cae 11895
224ce89b
WB
11896 case OP_PG_UPMAP_ITEMS:
11897 {
11898 vector<int64_t> id_vec;
9f95a23c 11899 if (!cmd_getval(cmdmap, "id", id_vec)) {
224ce89b 11900 ss << "unable to parse 'id' value(s) '"
11fdf7f2 11901 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
224ce89b
WB
11902 err = -EINVAL;
11903 goto reply;
11904 }
11905
11906 if (id_vec.size() % 2) {
11907 ss << "you must specify pairs of osd ids to be remapped";
11908 err = -EINVAL;
11909 goto reply;
11910 }
11911
11912 int pool_size = osdmap.get_pg_pool_size(pgid);
11913 if ((int)(id_vec.size() / 2) > pool_size) {
11914 ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
11915 << pool_size << ")";
11916 err = -EINVAL;
11917 goto reply;
11918 }
11919
11920 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
11921 ostringstream items;
11922 items << "[";
11923 for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
11924 int from = *p++;
11925 int to = *p;
11926 if (from == to) {
11927 ss << "from osd." << from << " == to osd." << to << ", ";
11928 continue;
11929 }
11930 if (!osdmap.exists(from)) {
11931 ss << "osd." << from << " does not exist";
11932 err = -ENOENT;
11933 goto reply;
11934 }
11935 if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
11936 ss << "osd." << to << " does not exist";
11937 err = -ENOENT;
11938 goto reply;
11939 }
c07f9fc5
FG
11940 pair<int32_t,int32_t> entry = make_pair(from, to);
11941 auto it = std::find(new_pg_upmap_items.begin(),
11942 new_pg_upmap_items.end(), entry);
11943 if (it != new_pg_upmap_items.end()) {
11944 ss << "osd." << from << " -> osd." << to << " already exists, ";
11945 continue;
11946 }
11947 new_pg_upmap_items.push_back(entry);
224ce89b
WB
11948 items << from << "->" << to << ",";
11949 }
11950 string out(items.str());
11951 out.resize(out.size() - 1); // drop last ','
11952 out += "]";
11953
11954 if (new_pg_upmap_items.empty()) {
11955 ss << "no valid upmap items(pairs) is specified";
11956 err = -EINVAL;
11957 goto reply;
11958 }
11959
11960 pending_inc.new_pg_upmap_items[pgid] =
11961 mempool::osdmap::vector<pair<int32_t,int32_t>>(
11962 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
11963 ss << "set " << pgid << " pg_upmap_items mapping to " << out;
11964 }
11965 break;
11966
11967 case OP_RM_PG_UPMAP_ITEMS:
11968 {
11969 pending_inc.old_pg_upmap_items.insert(pgid);
11970 ss << "clear " << pgid << " pg_upmap_items mapping";
11971 }
11972 break;
11973
11974 default:
11fdf7f2 11975 ceph_abort_msg("invalid option");
7c673cae
FG
11976 }
11977
7c673cae
FG
11978 goto update;
11979 } else if (prefix == "osd primary-affinity") {
11980 int64_t id;
9f95a23c 11981 if (!cmd_getval(cmdmap, "id", id)) {
7c673cae 11982 ss << "invalid osd id value '"
11fdf7f2 11983 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
7c673cae
FG
11984 err = -EINVAL;
11985 goto reply;
11986 }
11987 double w;
9f95a23c 11988 if (!cmd_getval(cmdmap, "weight", w)) {
7c673cae 11989 ss << "unable to parse 'weight' value '"
11fdf7f2 11990 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
11991 err = -EINVAL;
11992 goto reply;
11993 }
11994 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
11995 if (ww < 0L) {
11996 ss << "weight must be >= 0";
11997 err = -EINVAL;
11998 goto reply;
11999 }
9f95a23c
TL
12000 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
12001 osdmap.require_min_compat_client < ceph_release_t::firefly) {
31f18b77 12002 ss << "require_min_compat_client "
9f95a23c 12003 << osdmap.require_min_compat_client
7c673cae
FG
12004 << " < firefly, which is required for primary-affinity";
12005 err = -EPERM;
12006 goto reply;
7c673cae 12007 }
7c673cae
FG
12008 if (osdmap.exists(id)) {
12009 pending_inc.new_primary_affinity[id] = ww;
12010 ss << "set osd." << id << " primary-affinity to " << w << " (" << ios::hex << ww << ios::dec << ")";
12011 getline(ss, rs);
12012 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12013 get_last_committed() + 1));
12014 return true;
12015 } else {
12016 ss << "osd." << id << " does not exist";
12017 err = -ENOENT;
12018 goto reply;
12019 }
12020 } else if (prefix == "osd reweight") {
12021 int64_t id;
9f95a23c 12022 if (!cmd_getval(cmdmap, "id", id)) {
7c673cae 12023 ss << "unable to parse osd id value '"
11fdf7f2 12024 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
7c673cae
FG
12025 err = -EINVAL;
12026 goto reply;
12027 }
12028 double w;
9f95a23c 12029 if (!cmd_getval(cmdmap, "weight", w)) {
7c673cae 12030 ss << "unable to parse weight value '"
11fdf7f2 12031 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
12032 err = -EINVAL;
12033 goto reply;
12034 }
12035 long ww = (int)((double)CEPH_OSD_IN*w);
12036 if (ww < 0L) {
12037 ss << "weight must be >= 0";
12038 err = -EINVAL;
12039 goto reply;
12040 }
12041 if (osdmap.exists(id)) {
12042 pending_inc.new_weight[id] = ww;
12043 ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
12044 getline(ss, rs);
12045 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12046 get_last_committed() + 1));
12047 return true;
12048 } else {
12049 ss << "osd." << id << " does not exist";
12050 err = -ENOENT;
12051 goto reply;
12052 }
12053 } else if (prefix == "osd reweightn") {
12054 map<int32_t, uint32_t> weights;
11fdf7f2 12055 err = parse_reweights(cct, cmdmap, osdmap, &weights);
7c673cae
FG
12056 if (err) {
12057 ss << "unable to parse 'weights' value '"
11fdf7f2 12058 << cmd_vartype_stringify(cmdmap.at("weights")) << "'";
7c673cae
FG
12059 goto reply;
12060 }
12061 pending_inc.new_weight.insert(weights.begin(), weights.end());
12062 wait_for_finished_proposal(
12063 op,
12064 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
224ce89b 12065 return true;
7c673cae
FG
12066 } else if (prefix == "osd lost") {
12067 int64_t id;
9f95a23c 12068 if (!cmd_getval(cmdmap, "id", id)) {
7c673cae 12069 ss << "unable to parse osd id value '"
11fdf7f2 12070 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
7c673cae
FG
12071 err = -EINVAL;
12072 goto reply;
12073 }
11fdf7f2 12074 bool sure = false;
9f95a23c 12075 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11fdf7f2 12076 if (!sure) {
7c673cae
FG
12077 ss << "are you SURE? this might mean real, permanent data loss. pass "
12078 "--yes-i-really-mean-it if you really do.";
12079 err = -EPERM;
12080 goto reply;
12081 } else if (!osdmap.exists(id)) {
12082 ss << "osd." << id << " does not exist";
12083 err = -ENOENT;
12084 goto reply;
12085 } else if (!osdmap.is_down(id)) {
12086 ss << "osd." << id << " is not down";
12087 err = -EBUSY;
12088 goto reply;
12089 } else {
12090 epoch_t e = osdmap.get_info(id).down_at;
12091 pending_inc.new_lost[id] = e;
12092 ss << "marked osd lost in epoch " << e;
12093 getline(ss, rs);
12094 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12095 get_last_committed() + 1));
12096 return true;
12097 }
12098
11fdf7f2
TL
12099 } else if (prefix == "osd destroy-actual" ||
12100 prefix == "osd purge-actual" ||
12101 prefix == "osd purge-new") {
31f18b77
FG
12102 /* Destroying an OSD means that we don't expect to further make use of
12103 * the OSDs data (which may even become unreadable after this operation),
12104 * and that we are okay with scrubbing all its cephx keys and config-key
12105 * data (which may include lockbox keys, thus rendering the osd's data
12106 * unreadable).
12107 *
12108 * The OSD will not be removed. Instead, we will mark it as destroyed,
12109 * such that a subsequent call to `create` will not reuse the osd id.
12110 * This will play into being able to recreate the OSD, at the same
12111 * crush location, with minimal data movement.
12112 */
12113
12114 // make sure authmon is writeable.
12115 if (!mon->authmon()->is_writeable()) {
12116 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12117 << "osd destroy" << dendl;
12118 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12119 return false;
12120 }
12121
12122 int64_t id;
9f95a23c 12123 if (!cmd_getval(cmdmap, "id", id)) {
11fdf7f2
TL
12124 auto p = cmdmap.find("id");
12125 if (p == cmdmap.end()) {
12126 ss << "no osd id specified";
12127 } else {
12128 ss << "unable to parse osd id value '"
12129 << cmd_vartype_stringify(cmdmap.at("id")) << "";
12130 }
31f18b77
FG
12131 err = -EINVAL;
12132 goto reply;
12133 }
12134
11fdf7f2 12135 bool is_destroy = (prefix == "osd destroy-actual");
31f18b77 12136 if (!is_destroy) {
11fdf7f2
TL
12137 ceph_assert("osd purge-actual" == prefix ||
12138 "osd purge-new" == prefix);
31f18b77
FG
12139 }
12140
11fdf7f2 12141 bool sure = false;
9f95a23c 12142 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11fdf7f2
TL
12143 if (!sure) {
12144 ss << "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
12145 << "This will mean real, permanent data loss, as well "
12146 << "as deletion of cephx and lockbox keys. "
12147 << "Pass --yes-i-really-mean-it if you really do.";
31f18b77
FG
12148 err = -EPERM;
12149 goto reply;
d2e6a577 12150 } else if (!osdmap.exists(id)) {
31f18b77 12151 ss << "osd." << id << " does not exist";
d2e6a577 12152 err = 0; // idempotent
31f18b77
FG
12153 goto reply;
12154 } else if (osdmap.is_up(id)) {
12155 ss << "osd." << id << " is not `down`.";
12156 err = -EBUSY;
12157 goto reply;
12158 } else if (is_destroy && osdmap.is_destroyed(id)) {
12159 ss << "destroyed osd." << id;
12160 err = 0;
12161 goto reply;
12162 }
12163
11fdf7f2
TL
12164 if (prefix == "osd purge-new" &&
12165 (osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
12166 ss << "osd." << id << " is not new";
12167 err = -EPERM;
12168 goto reply;
12169 }
12170
31f18b77
FG
12171 bool goto_reply = false;
12172
12173 paxos->plug();
12174 if (is_destroy) {
12175 err = prepare_command_osd_destroy(id, ss);
12176 // we checked above that it should exist.
11fdf7f2 12177 ceph_assert(err != -ENOENT);
31f18b77
FG
12178 } else {
12179 err = prepare_command_osd_purge(id, ss);
12180 if (err == -ENOENT) {
12181 err = 0;
12182 ss << "osd." << id << " does not exist.";
12183 goto_reply = true;
12184 }
12185 }
12186 paxos->unplug();
12187
12188 if (err < 0 || goto_reply) {
12189 goto reply;
12190 }
12191
12192 if (is_destroy) {
12193 ss << "destroyed osd." << id;
12194 } else {
12195 ss << "purged osd." << id;
12196 }
12197
12198 getline(ss, rs);
12199 wait_for_finished_proposal(op,
12200 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
12201 force_immediate_propose();
12202 return true;
12203
12204 } else if (prefix == "osd new") {
12205
12206 // make sure authmon is writeable.
12207 if (!mon->authmon()->is_writeable()) {
12208 dout(10) << __func__ << " waiting for auth mon to be writeable for "
224ce89b 12209 << "osd new" << dendl;
31f18b77
FG
12210 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12211 return false;
12212 }
12213
3a9019d9 12214 map<string,string> param_map;
31f18b77
FG
12215
12216 bufferlist bl = m->get_data();
3a9019d9
FG
12217 string param_json = bl.to_str();
12218 dout(20) << __func__ << " osd new json = " << param_json << dendl;
31f18b77 12219
3a9019d9 12220 err = get_json_str_map(param_json, ss, &param_map);
31f18b77
FG
12221 if (err < 0)
12222 goto reply;
12223
3a9019d9 12224 dout(20) << __func__ << " osd new params " << param_map << dendl;
31f18b77
FG
12225
12226 paxos->plug();
3a9019d9 12227 err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
31f18b77
FG
12228 paxos->unplug();
12229
12230 if (err < 0) {
12231 goto reply;
12232 }
12233
12234 if (f) {
12235 f->flush(rdata);
12236 } else {
12237 rdata.append(ss);
12238 }
12239
12240 if (err == EEXIST) {
12241 // idempotent operation
12242 err = 0;
12243 goto reply;
12244 }
12245
12246 wait_for_finished_proposal(op,
12247 new Monitor::C_Command(mon, op, 0, rs, rdata,
12248 get_last_committed() + 1));
12249 force_immediate_propose();
12250 return true;
12251
7c673cae 12252 } else if (prefix == "osd create") {
7c673cae
FG
12253
12254 // optional id provided?
31f18b77 12255 int64_t id = -1, cmd_id = -1;
9f95a23c 12256 if (cmd_getval(cmdmap, "id", cmd_id)) {
31f18b77
FG
12257 if (cmd_id < 0) {
12258 ss << "invalid osd id value '" << cmd_id << "'";
7c673cae
FG
12259 err = -EINVAL;
12260 goto reply;
12261 }
31f18b77 12262 dout(10) << " osd create got id " << cmd_id << dendl;
7c673cae
FG
12263 }
12264
7c673cae
FG
12265 uuid_d uuid;
12266 string uuidstr;
9f95a23c 12267 if (cmd_getval(cmdmap, "uuid", uuidstr)) {
7c673cae 12268 if (!uuid.parse(uuidstr.c_str())) {
31f18b77
FG
12269 ss << "invalid uuid value '" << uuidstr << "'";
12270 err = -EINVAL;
12271 goto reply;
7c673cae 12272 }
31f18b77
FG
12273 // we only care about the id if we also have the uuid, to
12274 // ensure the operation's idempotency.
12275 id = cmd_id;
7c673cae
FG
12276 }
12277
31f18b77
FG
12278 int32_t new_id = -1;
12279 err = prepare_command_osd_create(id, uuid, &new_id, ss);
12280 if (err < 0) {
12281 if (err == -EAGAIN) {
12282 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12283 return true;
12284 }
12285 // a check has failed; reply to the user.
12286 goto reply;
12287
12288 } else if (err == EEXIST) {
12289 // this is an idempotent operation; we can go ahead and reply.
12290 if (f) {
12291 f->open_object_section("created_osd");
12292 f->dump_int("osdid", new_id);
12293 f->close_section();
12294 f->flush(rdata);
12295 } else {
12296 ss << new_id;
12297 rdata.append(ss);
7c673cae 12298 }
31f18b77
FG
12299 err = 0;
12300 goto reply;
7c673cae
FG
12301 }
12302
3a9019d9
FG
12303 string empty_device_class;
12304 do_osd_create(id, uuid, empty_device_class, &new_id);
31f18b77 12305
7c673cae
FG
12306 if (f) {
12307 f->open_object_section("created_osd");
31f18b77 12308 f->dump_int("osdid", new_id);
7c673cae
FG
12309 f->close_section();
12310 f->flush(rdata);
12311 } else {
31f18b77 12312 ss << new_id;
7c673cae
FG
12313 rdata.append(ss);
12314 }
31f18b77
FG
12315 wait_for_finished_proposal(op,
12316 new Monitor::C_Command(mon, op, 0, rs, rdata,
12317 get_last_committed() + 1));
7c673cae
FG
12318 return true;
12319
12320 } else if (prefix == "osd blacklist clear") {
12321 pending_inc.new_blacklist.clear();
12322 std::list<std::pair<entity_addr_t,utime_t > > blacklist;
12323 osdmap.get_blacklist(&blacklist);
12324 for (const auto &entry : blacklist) {
12325 pending_inc.old_blacklist.push_back(entry.first);
12326 }
12327 ss << " removed all blacklist entries";
12328 getline(ss, rs);
12329 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12330 get_last_committed() + 1));
12331 return true;
12332 } else if (prefix == "osd blacklist") {
12333 string addrstr;
9f95a23c 12334 cmd_getval(cmdmap, "addr", addrstr);
7c673cae
FG
12335 entity_addr_t addr;
12336 if (!addr.parse(addrstr.c_str(), 0)) {
12337 ss << "unable to parse address " << addrstr;
12338 err = -EINVAL;
12339 goto reply;
12340 }
12341 else {
9f95a23c 12342 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
11fdf7f2
TL
12343 // always blacklist type ANY
12344 addr.set_type(entity_addr_t::TYPE_ANY);
12345 } else {
12346 addr.set_type(entity_addr_t::TYPE_LEGACY);
12347 }
12348
7c673cae 12349 string blacklistop;
9f95a23c 12350 cmd_getval(cmdmap, "blacklistop", blacklistop);
7c673cae
FG
12351 if (blacklistop == "add") {
12352 utime_t expires = ceph_clock_now();
12353 double d;
12354 // default one hour
9f95a23c 12355 cmd_getval(cmdmap, "expire", d,
11fdf7f2 12356 g_conf()->mon_osd_blacklist_default_expire);
7c673cae
FG
12357 expires += d;
12358
12359 pending_inc.new_blacklist[addr] = expires;
224ce89b
WB
12360
12361 {
12362 // cancel any pending un-blacklisting request too
12363 auto it = std::find(pending_inc.old_blacklist.begin(),
12364 pending_inc.old_blacklist.end(), addr);
12365 if (it != pending_inc.old_blacklist.end()) {
12366 pending_inc.old_blacklist.erase(it);
12367 }
12368 }
12369
7c673cae
FG
12370 ss << "blacklisting " << addr << " until " << expires << " (" << d << " sec)";
12371 getline(ss, rs);
12372 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12373 get_last_committed() + 1));
12374 return true;
12375 } else if (blacklistop == "rm") {
12376 if (osdmap.is_blacklisted(addr) ||
12377 pending_inc.new_blacklist.count(addr)) {
12378 if (osdmap.is_blacklisted(addr))
12379 pending_inc.old_blacklist.push_back(addr);
12380 else
12381 pending_inc.new_blacklist.erase(addr);
12382 ss << "un-blacklisting " << addr;
12383 getline(ss, rs);
12384 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12385 get_last_committed() + 1));
12386 return true;
12387 }
12388 ss << addr << " isn't blacklisted";
12389 err = 0;
12390 goto reply;
12391 }
12392 }
12393 } else if (prefix == "osd pool mksnap") {
12394 string poolstr;
9f95a23c 12395 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
12396 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12397 if (pool < 0) {
12398 ss << "unrecognized pool '" << poolstr << "'";
12399 err = -ENOENT;
12400 goto reply;
12401 }
12402 string snapname;
9f95a23c 12403 cmd_getval(cmdmap, "snap", snapname);
7c673cae
FG
12404 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12405 if (p->is_unmanaged_snaps_mode()) {
12406 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12407 err = -EINVAL;
12408 goto reply;
12409 } else if (p->snap_exists(snapname.c_str())) {
12410 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12411 err = 0;
12412 goto reply;
12413 } else if (p->is_tier()) {
12414 ss << "pool " << poolstr << " is a cache tier";
12415 err = -EINVAL;
12416 goto reply;
12417 }
12418 pg_pool_t *pp = 0;
12419 if (pending_inc.new_pools.count(pool))
12420 pp = &pending_inc.new_pools[pool];
12421 if (!pp) {
12422 pp = &pending_inc.new_pools[pool];
12423 *pp = *p;
12424 }
12425 if (pp->snap_exists(snapname.c_str())) {
12426 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12427 } else {
12428 pp->add_snap(snapname.c_str(), ceph_clock_now());
12429 pp->set_snap_epoch(pending_inc.epoch);
12430 ss << "created pool " << poolstr << " snap " << snapname;
12431 }
12432 getline(ss, rs);
12433 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12434 get_last_committed() + 1));
12435 return true;
12436 } else if (prefix == "osd pool rmsnap") {
12437 string poolstr;
9f95a23c 12438 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
12439 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12440 if (pool < 0) {
12441 ss << "unrecognized pool '" << poolstr << "'";
12442 err = -ENOENT;
12443 goto reply;
12444 }
12445 string snapname;
9f95a23c 12446 cmd_getval(cmdmap, "snap", snapname);
7c673cae
FG
12447 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12448 if (p->is_unmanaged_snaps_mode()) {
12449 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12450 err = -EINVAL;
12451 goto reply;
12452 } else if (!p->snap_exists(snapname.c_str())) {
12453 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
12454 err = 0;
12455 goto reply;
12456 }
12457 pg_pool_t *pp = 0;
12458 if (pending_inc.new_pools.count(pool))
12459 pp = &pending_inc.new_pools[pool];
12460 if (!pp) {
12461 pp = &pending_inc.new_pools[pool];
12462 *pp = *p;
12463 }
12464 snapid_t sn = pp->snap_exists(snapname.c_str());
12465 if (sn) {
12466 pp->remove_snap(sn);
12467 pp->set_snap_epoch(pending_inc.epoch);
12468 ss << "removed pool " << poolstr << " snap " << snapname;
12469 } else {
12470 ss << "already removed pool " << poolstr << " snap " << snapname;
12471 }
12472 getline(ss, rs);
12473 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12474 get_last_committed() + 1));
12475 return true;
12476 } else if (prefix == "osd pool create") {
11fdf7f2 12477 int64_t pg_num, pg_num_min;
7c673cae 12478 int64_t pgp_num;
9f95a23c
TL
12479 cmd_getval(cmdmap, "pg_num", pg_num, int64_t(0));
12480 cmd_getval(cmdmap, "pgp_num", pgp_num, pg_num);
12481 cmd_getval(cmdmap, "pg_num_min", pg_num_min, int64_t(0));
7c673cae
FG
12482
12483 string pool_type_str;
9f95a23c 12484 cmd_getval(cmdmap, "pool_type", pool_type_str);
7c673cae 12485 if (pool_type_str.empty())
11fdf7f2 12486 pool_type_str = g_conf().get_val<string>("osd_pool_default_type");
7c673cae
FG
12487
12488 string poolstr;
9f95a23c 12489 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
12490 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12491 if (pool_id >= 0) {
12492 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12493 if (pool_type_str != p->get_type_name()) {
12494 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
12495 err = -EINVAL;
12496 } else {
12497 ss << "pool '" << poolstr << "' already exists";
12498 err = 0;
12499 }
12500 goto reply;
12501 }
12502
12503 int pool_type;
12504 if (pool_type_str == "replicated") {
12505 pool_type = pg_pool_t::TYPE_REPLICATED;
12506 } else if (pool_type_str == "erasure") {
7c673cae
FG
12507 pool_type = pg_pool_t::TYPE_ERASURE;
12508 } else {
12509 ss << "unknown pool type '" << pool_type_str << "'";
12510 err = -EINVAL;
12511 goto reply;
12512 }
12513
31f18b77 12514 bool implicit_rule_creation = false;
94b18763 12515 int64_t expected_num_objects = 0;
31f18b77 12516 string rule_name;
9f95a23c 12517 cmd_getval(cmdmap, "rule", rule_name);
7c673cae 12518 string erasure_code_profile;
9f95a23c 12519 cmd_getval(cmdmap, "erasure_code_profile", erasure_code_profile);
7c673cae
FG
12520
12521 if (pool_type == pg_pool_t::TYPE_ERASURE) {
12522 if (erasure_code_profile == "")
12523 erasure_code_profile = "default";
12524 //handle the erasure code profile
12525 if (erasure_code_profile == "default") {
12526 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
12527 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
12528 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
12529 goto wait;
12530 }
12531
12532 map<string,string> profile_map;
11fdf7f2 12533 err = osdmap.get_erasure_code_profile_default(cct,
7c673cae
FG
12534 profile_map,
12535 &ss);
12536 if (err)
12537 goto reply;
12538 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
12539 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
12540 goto wait;
12541 }
12542 }
31f18b77
FG
12543 if (rule_name == "") {
12544 implicit_rule_creation = true;
7c673cae 12545 if (erasure_code_profile == "default") {
31f18b77 12546 rule_name = "erasure-code";
7c673cae 12547 } else {
31f18b77 12548 dout(1) << "implicitly use rule named after the pool: "
7c673cae 12549 << poolstr << dendl;
31f18b77 12550 rule_name = poolstr;
7c673cae
FG
12551 }
12552 }
9f95a23c 12553 cmd_getval(cmdmap, "expected_num_objects",
94b18763 12554 expected_num_objects, int64_t(0));
7c673cae 12555 } else {
31f18b77 12556 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
94b18763
FG
12557 // and put expected_num_objects to rule field
12558 if (erasure_code_profile != "") { // cmd is from CLI
12559 if (rule_name != "") {
12560 string interr;
12561 expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
12562 if (interr.length()) {
12563 ss << "error parsing integer value '" << rule_name << "': " << interr;
12564 err = -EINVAL;
12565 goto reply;
12566 }
12567 }
12568 rule_name = erasure_code_profile;
12569 } else { // cmd is well-formed
9f95a23c 12570 cmd_getval(cmdmap, "expected_num_objects",
94b18763
FG
12571 expected_num_objects, int64_t(0));
12572 }
7c673cae
FG
12573 }
12574
31f18b77
FG
12575 if (!implicit_rule_creation && rule_name != "") {
12576 int rule;
12577 err = get_crush_rule(rule_name, &rule, &ss);
7c673cae
FG
12578 if (err == -EAGAIN) {
12579 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12580 return true;
12581 }
12582 if (err)
12583 goto reply;
12584 }
12585
7c673cae
FG
12586 if (expected_num_objects < 0) {
12587 ss << "'expected_num_objects' must be non-negative";
12588 err = -EINVAL;
12589 goto reply;
12590 }
12591
f6b5b4d7
TL
12592 set<int32_t> osds;
12593 osdmap.get_all_osds(osds);
12594 bool has_filestore_osd = std::any_of(osds.begin(), osds.end(), [this](int osd) {
12595 string type;
12596 if (!get_osd_objectstore_type(osd, &type)) {
12597 return type == "filestore";
12598 } else {
12599 return false;
12600 }
12601 });
12602
12603 if (has_filestore_osd &&
12604 expected_num_objects > 0 &&
12605 cct->_conf->filestore_merge_threshold > 0) {
91327a77
AA
12606 ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
12607 err = -EINVAL;
12608 goto reply;
12609 }
12610
f6b5b4d7
TL
12611 if (has_filestore_osd &&
12612 expected_num_objects == 0 &&
12613 cct->_conf->filestore_merge_threshold < 0) {
91327a77 12614 int osds = osdmap.get_num_osds();
f6b5b4d7
TL
12615 bool sure = false;
12616 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12617 if (!sure && osds && (pg_num >= 1024 || pg_num / osds >= 100)) {
91327a77 12618 ss << "For better initial performance on pools expected to store a "
f6b5b4d7
TL
12619 << "large number of objects, consider supplying the "
12620 << "expected_num_objects parameter when creating the pool."
12621 << " Pass --yes-i-really-mean-it to ignore it";
12622 err = -EPERM;
12623 goto reply;
91327a77
AA
12624 }
12625 }
12626
7c673cae 12627 int64_t fast_read_param;
9f95a23c 12628 cmd_getval(cmdmap, "fast_read", fast_read_param, int64_t(-1));
7c673cae
FG
12629 FastReadType fast_read = FAST_READ_DEFAULT;
12630 if (fast_read_param == 0)
12631 fast_read = FAST_READ_OFF;
12632 else if (fast_read_param > 0)
12633 fast_read = FAST_READ_ON;
11fdf7f2
TL
12634
12635 int64_t repl_size = 0;
9f95a23c 12636 cmd_getval(cmdmap, "size", repl_size);
11fdf7f2
TL
12637 int64_t target_size_bytes = 0;
12638 double target_size_ratio = 0.0;
9f95a23c
TL
12639 cmd_getval(cmdmap, "target_size_bytes", target_size_bytes);
12640 cmd_getval(cmdmap, "target_size_ratio", target_size_ratio);
12641
12642 string pg_autoscale_mode;
12643 cmd_getval(cmdmap, "autoscale_mode", pg_autoscale_mode);
11fdf7f2
TL
12644
12645 err = prepare_new_pool(poolstr,
7c673cae 12646 -1, // default crush rule
31f18b77 12647 rule_name,
11fdf7f2
TL
12648 pg_num, pgp_num, pg_num_min,
12649 repl_size, target_size_bytes, target_size_ratio,
7c673cae
FG
12650 erasure_code_profile, pool_type,
12651 (uint64_t)expected_num_objects,
12652 fast_read,
9f95a23c 12653 pg_autoscale_mode,
7c673cae
FG
12654 &ss);
12655 if (err < 0) {
12656 switch(err) {
12657 case -EEXIST:
12658 ss << "pool '" << poolstr << "' already exists";
12659 break;
12660 case -EAGAIN:
12661 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12662 return true;
12663 case -ERANGE:
12664 goto reply;
12665 default:
12666 goto reply;
12667 break;
12668 }
12669 } else {
12670 ss << "pool '" << poolstr << "' created";
12671 }
12672 getline(ss, rs);
12673 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12674 get_last_committed() + 1));
12675 return true;
12676
12677 } else if (prefix == "osd pool delete" ||
12678 prefix == "osd pool rm") {
12679 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
12680 string poolstr, poolstr2, sure;
9f95a23c
TL
12681 cmd_getval(cmdmap, "pool", poolstr);
12682 cmd_getval(cmdmap, "pool2", poolstr2);
7c673cae
FG
12683 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12684 if (pool < 0) {
12685 ss << "pool '" << poolstr << "' does not exist";
12686 err = 0;
12687 goto reply;
12688 }
12689
11fdf7f2 12690 bool force_no_fake = false;
9f95a23c 12691 cmd_getval(cmdmap, "yes_i_really_really_mean_it", force_no_fake);
11fdf7f2 12692 bool force = false;
9f95a23c 12693 cmd_getval(cmdmap, "yes_i_really_really_mean_it_not_faking", force);
7c673cae 12694 if (poolstr2 != poolstr ||
11fdf7f2 12695 (!force && !force_no_fake)) {
7c673cae
FG
12696 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
12697 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
12698 << "followed by --yes-i-really-really-mean-it.";
12699 err = -EPERM;
12700 goto reply;
12701 }
12702 err = _prepare_remove_pool(pool, &ss, force_no_fake);
12703 if (err == -EAGAIN) {
12704 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12705 return true;
12706 }
12707 if (err < 0)
12708 goto reply;
12709 goto update;
12710 } else if (prefix == "osd pool rename") {
12711 string srcpoolstr, destpoolstr;
9f95a23c
TL
12712 cmd_getval(cmdmap, "srcpool", srcpoolstr);
12713 cmd_getval(cmdmap, "destpool", destpoolstr);
7c673cae
FG
12714 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
12715 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
12716
12717 if (pool_src < 0) {
12718 if (pool_dst >= 0) {
12719 // src pool doesn't exist, dst pool does exist: to ensure idempotency
12720 // of operations, assume this rename succeeded, as it is not changing
12721 // the current state. Make sure we output something understandable
12722 // for whoever is issuing the command, if they are paying attention,
12723 // in case it was not intentional; or to avoid a "wtf?" and a bug
12724 // report in case it was intentional, while expecting a failure.
12725 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
12726 << destpoolstr << "' does -- assuming successful rename";
12727 err = 0;
12728 } else {
12729 ss << "unrecognized pool '" << srcpoolstr << "'";
12730 err = -ENOENT;
12731 }
12732 goto reply;
12733 } else if (pool_dst >= 0) {
12734 // source pool exists and so does the destination pool
12735 ss << "pool '" << destpoolstr << "' already exists";
12736 err = -EEXIST;
12737 goto reply;
12738 }
12739
12740 int ret = _prepare_rename_pool(pool_src, destpoolstr);
12741 if (ret == 0) {
12742 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
12743 } else {
12744 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
12745 << cpp_strerror(ret);
12746 }
12747 getline(ss, rs);
12748 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
12749 get_last_committed() + 1));
12750 return true;
12751
12752 } else if (prefix == "osd pool set") {
12753 err = prepare_command_pool_set(cmdmap, ss);
12754 if (err == -EAGAIN)
12755 goto wait;
12756 if (err < 0)
12757 goto reply;
12758
12759 getline(ss, rs);
12760 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12761 get_last_committed() + 1));
12762 return true;
12763 } else if (prefix == "osd tier add") {
12764 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12765 if (err == -EAGAIN)
12766 goto wait;
12767 if (err)
12768 goto reply;
12769 string poolstr;
9f95a23c 12770 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
12771 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12772 if (pool_id < 0) {
12773 ss << "unrecognized pool '" << poolstr << "'";
12774 err = -ENOENT;
12775 goto reply;
12776 }
12777 string tierpoolstr;
9f95a23c 12778 cmd_getval(cmdmap, "tierpool", tierpoolstr);
7c673cae
FG
12779 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
12780 if (tierpool_id < 0) {
12781 ss << "unrecognized pool '" << tierpoolstr << "'";
12782 err = -ENOENT;
12783 goto reply;
12784 }
12785 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 12786 ceph_assert(p);
7c673cae 12787 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11fdf7f2 12788 ceph_assert(tp);
7c673cae
FG
12789
12790 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
12791 goto reply;
12792 }
12793
12794 // make sure new tier is empty
12795 string force_nonempty;
9f95a23c 12796 cmd_getval(cmdmap, "force_nonempty", force_nonempty);
11fdf7f2 12797 const pool_stat_t *pstats = mon->mgrstatmon()->get_pool_stat(tierpool_id);
31f18b77 12798 if (pstats && pstats->stats.sum.num_objects != 0 &&
7c673cae
FG
12799 force_nonempty != "--force-nonempty") {
12800 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
12801 err = -ENOTEMPTY;
12802 goto reply;
12803 }
11fdf7f2 12804 if (tp->is_erasure()) {
7c673cae
FG
12805 ss << "tier pool '" << tierpoolstr
12806 << "' is an ec pool, which cannot be a tier";
12807 err = -ENOTSUP;
12808 goto reply;
12809 }
12810 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
12811 ((force_nonempty != "--force-nonempty") ||
11fdf7f2 12812 (!g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps))) {
7c673cae
FG
12813 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
12814 err = -ENOTEMPTY;
12815 goto reply;
12816 }
12817 // go
12818 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12819 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
12820 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
12821 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12822 return true;
12823 }
12824 np->tiers.insert(tierpool_id);
12825 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
12826 ntp->tier_of = pool_id;
12827 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
12828 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12829 get_last_committed() + 1));
12830 return true;
12831 } else if (prefix == "osd tier remove" ||
12832 prefix == "osd tier rm") {
12833 string poolstr;
9f95a23c 12834 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
12835 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12836 if (pool_id < 0) {
12837 ss << "unrecognized pool '" << poolstr << "'";
12838 err = -ENOENT;
12839 goto reply;
12840 }
12841 string tierpoolstr;
9f95a23c 12842 cmd_getval(cmdmap, "tierpool", tierpoolstr);
7c673cae
FG
12843 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
12844 if (tierpool_id < 0) {
12845 ss << "unrecognized pool '" << tierpoolstr << "'";
12846 err = -ENOENT;
12847 goto reply;
12848 }
12849 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 12850 ceph_assert(p);
7c673cae 12851 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11fdf7f2 12852 ceph_assert(tp);
7c673cae
FG
12853
12854 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
12855 goto reply;
12856 }
12857
12858 if (p->tiers.count(tierpool_id) == 0) {
12859 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
12860 err = 0;
12861 goto reply;
12862 }
12863 if (tp->tier_of != pool_id) {
12864 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
12865 << osdmap.get_pool_name(tp->tier_of) << "': "
12866 // be scary about it; this is an inconsistency and bells must go off
12867 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
12868 err = -EINVAL;
12869 goto reply;
12870 }
12871 if (p->read_tier == tierpool_id) {
12872 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
12873 err = -EBUSY;
12874 goto reply;
12875 }
12876 // go
12877 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12878 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
12879 if (np->tiers.count(tierpool_id) == 0 ||
12880 ntp->tier_of != pool_id ||
12881 np->read_tier == tierpool_id) {
12882 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12883 return true;
12884 }
12885 np->tiers.erase(tierpool_id);
12886 ntp->clear_tier();
12887 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
12888 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12889 get_last_committed() + 1));
12890 return true;
12891 } else if (prefix == "osd tier set-overlay") {
12892 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12893 if (err == -EAGAIN)
12894 goto wait;
12895 if (err)
12896 goto reply;
12897 string poolstr;
9f95a23c 12898 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
12899 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12900 if (pool_id < 0) {
12901 ss << "unrecognized pool '" << poolstr << "'";
12902 err = -ENOENT;
12903 goto reply;
12904 }
12905 string overlaypoolstr;
9f95a23c 12906 cmd_getval(cmdmap, "overlaypool", overlaypoolstr);
7c673cae
FG
12907 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
12908 if (overlaypool_id < 0) {
12909 ss << "unrecognized pool '" << overlaypoolstr << "'";
12910 err = -ENOENT;
12911 goto reply;
12912 }
12913 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 12914 ceph_assert(p);
7c673cae 12915 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
11fdf7f2 12916 ceph_assert(overlay_p);
7c673cae
FG
12917 if (p->tiers.count(overlaypool_id) == 0) {
12918 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
12919 err = -EINVAL;
12920 goto reply;
12921 }
12922 if (p->read_tier == overlaypool_id) {
12923 err = 0;
12924 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
12925 goto reply;
12926 }
12927 if (p->has_read_tier()) {
12928 ss << "pool '" << poolstr << "' has overlay '"
12929 << osdmap.get_pool_name(p->read_tier)
12930 << "'; please remove-overlay first";
12931 err = -EINVAL;
12932 goto reply;
12933 }
12934
12935 // go
12936 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12937 np->read_tier = overlaypool_id;
12938 np->write_tier = overlaypool_id;
12939 np->set_last_force_op_resend(pending_inc.epoch);
12940 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
12941 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
12942 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
12943 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
12944 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
12945 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12946 get_last_committed() + 1));
12947 return true;
12948 } else if (prefix == "osd tier remove-overlay" ||
12949 prefix == "osd tier rm-overlay") {
12950 string poolstr;
9f95a23c 12951 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
12952 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12953 if (pool_id < 0) {
12954 ss << "unrecognized pool '" << poolstr << "'";
12955 err = -ENOENT;
12956 goto reply;
12957 }
12958 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 12959 ceph_assert(p);
7c673cae
FG
12960 if (!p->has_read_tier()) {
12961 err = 0;
12962 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
12963 goto reply;
12964 }
12965
12966 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
12967 goto reply;
12968 }
12969
12970 // go
12971 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12972 if (np->has_read_tier()) {
12973 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
12974 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
12975 nop->set_last_force_op_resend(pending_inc.epoch);
12976 }
12977 if (np->has_write_tier()) {
12978 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
12979 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
12980 nop->set_last_force_op_resend(pending_inc.epoch);
12981 }
12982 np->clear_read_tier();
12983 np->clear_write_tier();
12984 np->set_last_force_op_resend(pending_inc.epoch);
12985 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
12986 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12987 get_last_committed() + 1));
12988 return true;
12989 } else if (prefix == "osd tier cache-mode") {
12990 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12991 if (err == -EAGAIN)
12992 goto wait;
12993 if (err)
12994 goto reply;
12995 string poolstr;
9f95a23c 12996 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
12997 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12998 if (pool_id < 0) {
12999 ss << "unrecognized pool '" << poolstr << "'";
13000 err = -ENOENT;
13001 goto reply;
13002 }
13003 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 13004 ceph_assert(p);
7c673cae
FG
13005 if (!p->is_tier()) {
13006 ss << "pool '" << poolstr << "' is not a tier";
13007 err = -EINVAL;
13008 goto reply;
13009 }
13010 string modestr;
9f95a23c 13011 cmd_getval(cmdmap, "mode", modestr);
7c673cae 13012 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
9f95a23c 13013 if (int(mode) < 0) {
7c673cae
FG
13014 ss << "'" << modestr << "' is not a valid cache mode";
13015 err = -EINVAL;
13016 goto reply;
13017 }
13018
11fdf7f2 13019 bool sure = false;
9f95a23c 13020 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11fdf7f2 13021
9f95a23c
TL
13022 if (mode == pg_pool_t::CACHEMODE_FORWARD ||
13023 mode == pg_pool_t::CACHEMODE_READFORWARD) {
13024 ss << "'" << modestr << "' is no longer a supported cache mode";
13025 err = -EPERM;
13026 goto reply;
13027 }
7c673cae
FG
13028 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13029 mode != pg_pool_t::CACHEMODE_NONE &&
13030 mode != pg_pool_t::CACHEMODE_PROXY &&
13031 mode != pg_pool_t::CACHEMODE_READPROXY) &&
11fdf7f2 13032 !sure) {
7c673cae
FG
13033 ss << "'" << modestr << "' is not a well-supported cache mode and may "
13034 << "corrupt your data. pass --yes-i-really-mean-it to force.";
13035 err = -EPERM;
13036 goto reply;
13037 }
13038
13039 // pool already has this cache-mode set and there are no pending changes
13040 if (p->cache_mode == mode &&
13041 (pending_inc.new_pools.count(pool_id) == 0 ||
13042 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
13043 ss << "set cache-mode for pool '" << poolstr << "'"
13044 << " to " << pg_pool_t::get_cache_mode_name(mode);
13045 err = 0;
13046 goto reply;
13047 }
13048
13049 /* Mode description:
13050 *
13051 * none: No cache-mode defined
9f95a23c 13052 * forward: Forward all reads and writes to base pool [removed]
7c673cae
FG
13053 * writeback: Cache writes, promote reads from base pool
13054 * readonly: Forward writes to base pool
9f95a23c 13055 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
7c673cae
FG
13056 * proxy: Proxy all reads and writes to base pool
13057 * readproxy: Writes are in writeback mode, Reads are in proxy mode
13058 *
13059 * Hence, these are the allowed transitions:
13060 *
13061 * none -> any
13062 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
9f95a23c 13063 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
7c673cae 13064 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
9f95a23c
TL
13065 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
13066 * writeback -> readproxy || proxy
7c673cae
FG
13067 * readonly -> any
13068 */
13069
13070 // We check if the transition is valid against the current pool mode, as
13071 // it is the only committed state thus far. We will blantly squash
13072 // whatever mode is on the pending state.
13073
13074 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
9f95a23c 13075 (mode != pg_pool_t::CACHEMODE_PROXY &&
7c673cae
FG
13076 mode != pg_pool_t::CACHEMODE_READPROXY)) {
13077 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
13078 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
13079 << "' pool; only '"
7c673cae
FG
13080 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
13081 << "' allowed.";
13082 err = -EINVAL;
13083 goto reply;
13084 }
13085 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
13086 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
7c673cae
FG
13087 mode != pg_pool_t::CACHEMODE_PROXY &&
13088 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13089
13090 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
13091 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
7c673cae
FG
13092 mode != pg_pool_t::CACHEMODE_PROXY)) ||
13093
13094 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
13095 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
7c673cae
FG
13096 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13097
13098 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
13099 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
7c673cae
FG
13100 mode != pg_pool_t::CACHEMODE_PROXY &&
13101 mode != pg_pool_t::CACHEMODE_READPROXY))) {
13102
31f18b77 13103 const pool_stat_t* pstats =
11fdf7f2 13104 mon->mgrstatmon()->get_pool_stat(pool_id);
7c673cae 13105
31f18b77 13106 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
7c673cae
FG
13107 ss << "unable to set cache-mode '"
13108 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
13109 << "': dirty objects found";
13110 err = -EBUSY;
13111 goto reply;
13112 }
13113 }
13114 // go
13115 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13116 np->cache_mode = mode;
13117 // set this both when moving to and from cache_mode NONE. this is to
13118 // capture legacy pools that were set up before this flag existed.
13119 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
13120 ss << "set cache-mode for pool '" << poolstr
13121 << "' to " << pg_pool_t::get_cache_mode_name(mode);
13122 if (mode == pg_pool_t::CACHEMODE_NONE) {
13123 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
11fdf7f2 13124 ceph_assert(base_pool);
7c673cae
FG
13125 if (base_pool->read_tier == pool_id ||
13126 base_pool->write_tier == pool_id)
13127 ss <<" (WARNING: pool is still configured as read or write tier)";
13128 }
13129 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13130 get_last_committed() + 1));
13131 return true;
13132 } else if (prefix == "osd tier add-cache") {
13133 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13134 if (err == -EAGAIN)
13135 goto wait;
13136 if (err)
13137 goto reply;
13138 string poolstr;
9f95a23c 13139 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
13140 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13141 if (pool_id < 0) {
13142 ss << "unrecognized pool '" << poolstr << "'";
13143 err = -ENOENT;
13144 goto reply;
13145 }
13146 string tierpoolstr;
9f95a23c 13147 cmd_getval(cmdmap, "tierpool", tierpoolstr);
7c673cae
FG
13148 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13149 if (tierpool_id < 0) {
13150 ss << "unrecognized pool '" << tierpoolstr << "'";
13151 err = -ENOENT;
13152 goto reply;
13153 }
13154 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 13155 ceph_assert(p);
7c673cae 13156 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11fdf7f2 13157 ceph_assert(tp);
7c673cae
FG
13158
13159 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13160 goto reply;
13161 }
13162
13163 int64_t size = 0;
9f95a23c 13164 if (!cmd_getval(cmdmap, "size", size)) {
7c673cae 13165 ss << "unable to parse 'size' value '"
11fdf7f2 13166 << cmd_vartype_stringify(cmdmap.at("size")) << "'";
7c673cae
FG
13167 err = -EINVAL;
13168 goto reply;
13169 }
13170 // make sure new tier is empty
31f18b77 13171 const pool_stat_t *pstats =
11fdf7f2 13172 mon->mgrstatmon()->get_pool_stat(tierpool_id);
31f18b77 13173 if (pstats && pstats->stats.sum.num_objects != 0) {
7c673cae
FG
13174 ss << "tier pool '" << tierpoolstr << "' is not empty";
13175 err = -ENOTEMPTY;
13176 goto reply;
13177 }
11fdf7f2 13178 auto& modestr = g_conf().get_val<string>("osd_tier_default_cache_mode");
7c673cae 13179 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
9f95a23c 13180 if (int(mode) < 0) {
7c673cae
FG
13181 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
13182 err = -EINVAL;
13183 goto reply;
13184 }
13185 HitSet::Params hsp;
11fdf7f2
TL
13186 auto& cache_hit_set_type =
13187 g_conf().get_val<string>("osd_tier_default_cache_hit_set_type");
13188 if (cache_hit_set_type == "bloom") {
7c673cae 13189 BloomHitSet::Params *bsp = new BloomHitSet::Params;
11fdf7f2 13190 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
7c673cae 13191 hsp = HitSet::Params(bsp);
11fdf7f2 13192 } else if (cache_hit_set_type == "explicit_hash") {
7c673cae 13193 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
11fdf7f2 13194 } else if (cache_hit_set_type == "explicit_object") {
7c673cae
FG
13195 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
13196 } else {
11fdf7f2
TL
13197 ss << "osd tier cache default hit set type '"
13198 << cache_hit_set_type << "' is not a known type";
7c673cae
FG
13199 err = -EINVAL;
13200 goto reply;
13201 }
13202 // go
13203 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13204 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13205 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13206 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13207 return true;
13208 }
13209 np->tiers.insert(tierpool_id);
13210 np->read_tier = np->write_tier = tierpool_id;
13211 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13212 np->set_last_force_op_resend(pending_inc.epoch);
13213 ntp->set_last_force_op_resend(pending_inc.epoch);
13214 ntp->tier_of = pool_id;
13215 ntp->cache_mode = mode;
11fdf7f2
TL
13216 ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count");
13217 ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period");
13218 ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13219 ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13220 ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13221 ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
7c673cae
FG
13222 ntp->hit_set_params = hsp;
13223 ntp->target_max_bytes = size;
13224 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
13225 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13226 get_last_committed() + 1));
13227 return true;
13228 } else if (prefix == "osd pool set-quota") {
13229 string poolstr;
9f95a23c 13230 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
13231 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13232 if (pool_id < 0) {
13233 ss << "unrecognized pool '" << poolstr << "'";
13234 err = -ENOENT;
13235 goto reply;
13236 }
13237
13238 string field;
9f95a23c 13239 cmd_getval(cmdmap, "field", field);
7c673cae
FG
13240 if (field != "max_objects" && field != "max_bytes") {
13241 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
13242 err = -EINVAL;
13243 goto reply;
13244 }
13245
13246 // val could contain unit designations, so we treat as a string
13247 string val;
9f95a23c 13248 cmd_getval(cmdmap, "val", val);
1adf2230
AA
13249 string tss;
13250 int64_t value;
13251 if (field == "max_objects") {
13252 value = strict_sistrtoll(val.c_str(), &tss);
13253 } else if (field == "max_bytes") {
13254 value = strict_iecstrtoll(val.c_str(), &tss);
13255 } else {
11fdf7f2 13256 ceph_abort_msg("unrecognized option");
1adf2230
AA
13257 }
13258 if (!tss.empty()) {
13259 ss << "error parsing value '" << val << "': " << tss;
13260 err = -EINVAL;
7c673cae
FG
13261 goto reply;
13262 }
13263
13264 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
13265 if (field == "max_objects") {
13266 pi->quota_max_objects = value;
13267 } else if (field == "max_bytes") {
13268 pi->quota_max_bytes = value;
13269 } else {
11fdf7f2 13270 ceph_abort_msg("unrecognized option");
7c673cae
FG
13271 }
13272 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
13273 rs = ss.str();
13274 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13275 get_last_committed() + 1));
13276 return true;
c07f9fc5
FG
13277 } else if (prefix == "osd pool application enable" ||
13278 prefix == "osd pool application disable" ||
13279 prefix == "osd pool application set" ||
13280 prefix == "osd pool application rm") {
13281 err = prepare_command_pool_application(prefix, cmdmap, ss);
11fdf7f2 13282 if (err == -EAGAIN) {
c07f9fc5 13283 goto wait;
11fdf7f2 13284 } else if (err < 0) {
7c673cae 13285 goto reply;
7c673cae 13286 } else {
11fdf7f2 13287 goto update;
7c673cae 13288 }
c07f9fc5
FG
13289 } else if (prefix == "osd force-create-pg") {
13290 pg_t pgid;
13291 string pgidstr;
9f95a23c 13292 cmd_getval(cmdmap, "pgid", pgidstr);
c07f9fc5
FG
13293 if (!pgid.parse(pgidstr.c_str())) {
13294 ss << "invalid pgid '" << pgidstr << "'";
13295 err = -EINVAL;
13296 goto reply;
13297 }
94b18763
FG
13298 if (!osdmap.pg_exists(pgid)) {
13299 ss << "pg " << pgid << " should not exist";
13300 err = -ENOENT;
13301 goto reply;
13302 }
11fdf7f2 13303 bool sure = false;
9f95a23c 13304 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11fdf7f2
TL
13305 if (!sure) {
13306 ss << "This command will recreate a lost (as in data lost) PG with data in it, such "
13307 << "that the cluster will give up ever trying to recover the lost data. Do this "
13308 << "only if you are certain that all copies of the PG are in fact lost and you are "
13309 << "willing to accept that the data is permanently destroyed. Pass "
13310 << "--yes-i-really-mean-it to proceed.";
13311 err = -EPERM;
13312 goto reply;
13313 }
c07f9fc5
FG
13314 bool creating_now;
13315 {
13316 std::lock_guard<std::mutex> l(creating_pgs_lock);
9f95a23c
TL
13317 auto emplaced = creating_pgs.pgs.emplace(
13318 pgid,
13319 creating_pgs_t::pg_create_info(osdmap.get_epoch(),
13320 ceph_clock_now()));
c07f9fc5
FG
13321 creating_now = emplaced.second;
13322 }
13323 if (creating_now) {
13324 ss << "pg " << pgidstr << " now creating, ok";
11fdf7f2
TL
13325 // set the pool's CREATING flag so that (1) the osd won't ignore our
13326 // create message and (2) we won't propose any future pg_num changes
13327 // until after the PG has been instantiated.
13328 if (pending_inc.new_pools.count(pgid.pool()) == 0) {
13329 pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool());
13330 }
13331 pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING;
c07f9fc5
FG
13332 err = 0;
13333 goto update;
13334 } else {
13335 ss << "pg " << pgid << " already creating";
13336 err = 0;
13337 goto reply;
13338 }
7c673cae
FG
13339 } else {
13340 err = -EINVAL;
13341 }
13342
13343 reply:
13344 getline(ss, rs);
13345 if (err < 0 && rs.length() == 0)
13346 rs = cpp_strerror(err);
13347 mon->reply_command(op, err, rs, rdata, get_last_committed());
13348 return ret;
13349
13350 update:
13351 getline(ss, rs);
13352 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13353 get_last_committed() + 1));
13354 return true;
13355
13356 wait:
13357 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13358 return true;
13359}
13360
/**
 * Enforce monitor capability checks for an incoming MPoolOp.
 *
 * Unmanaged-snap create/delete ops get a fine-grained, pool-aware
 * permission check (is_unmanaged_snap_op_permitted); every other pool op
 * only requires the generic "osd" write cap (MON_CAP_W).
 *
 * @return true if the op was rejected (an -EPERM reply has already been
 *         sent to the client), false if the caller may keep processing it.
 */
bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);

  auto m = op->get_req<MPoolOp>();
  MonSession *session = op->get_session();
  if (!session) {
    // no session: we cannot evaluate caps at all, so reject
    _pool_op_reply(op, -EPERM, osdmap.get_epoch());
    return true;
  }

  switch (m->op) {
  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    {
      // resolve the pool name only if the pool still exists; otherwise a
      // null name is passed through to the permission check
      const std::string* pool_name = nullptr;
      const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
      if (pg_pool != nullptr) {
        pool_name = &osdmap.get_pool_name(m->pool);
      }

      if (!is_unmanaged_snap_op_permitted(cct, mon->key_server,
                                          session->entity_name, session->caps,
                                          session->get_peer_socket_addr(),
                                          pool_name)) {
        dout(0) << "got unmanaged-snap pool op from entity with insufficient "
                << "privileges. message: " << *m << std::endl
                << "caps: " << session->caps << dendl;
        _pool_op_reply(op, -EPERM, osdmap.get_epoch());
        return true;
      }
    }
    break;
  default:
    // all remaining pool ops just need write access to the osd map
    if (!session->is_capable("osd", MON_CAP_W)) {
      dout(0) << "got pool op from entity with insufficient privileges. "
              << "message: " << *m << std::endl
              << "caps: " << session->caps << dendl;
      _pool_op_reply(op, -EPERM, osdmap.get_epoch());
      return true;
    }
    break;
  }

  return false;
}
13407
/**
 * Read-only ("preprocess") phase for an MPoolOp.
 *
 * Answers requests that can be satisfied (or rejected) from the committed
 * osdmap alone, without proposing a map change.
 *
 * @return true if a reply has been sent here; false if the op must go on
 *         to prepare_pool_op() to mutate pending state.
 */
bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();

  if (enforce_pool_op_caps(op)) {
    // caps check failed; a reply was already sent
    return true;
  }

  if (m->fsid != mon->monmap->fsid) {
    // message is for a different cluster; reject
    dout(0) << __func__ << " drop message on fsid " << m->fsid
            << " != " << mon->monmap->fsid << " for " << *m << dendl;
    _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
    return true;
  }

  if (m->op == POOL_OP_CREATE)
    return preprocess_pool_op_create(op);

  const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
  if (p == nullptr) {
    dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
    if (m->op == POOL_OP_DELETE) {
      // deleting an already-gone pool is an idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
    } else {
      _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    }
    return true;
  }

  // check if the snap and snapname exist
  bool snap_exists = false;
  if (p->snap_exists(m->name.c_str()))
    snap_exists = true;

  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // pool snaps are incompatible with unmanaged-snaps mode and with tiers
    if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (snap_exists) {
      // already present: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // unmanaged snaps are incompatible with pool-snaps mode
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_SNAP:
    if (p->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (!snap_exists) {
      // already gone: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (_is_removed_snap(m->pool, m->snapid)) {
      // already removed/purged in the committed map: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE:
    // NOTE(review): replies success while a pool with this name still
    // exists in the committed map; double-check this condition against
    // prepare_pool_op_delete() semantics.
    if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_AUID_CHANGE:
    // rejected (-EOPNOTSUPP) in the prepare phase
    return false;
  default:
    ceph_abort();
    break;
  }

  return false;
}
13495
9f95a23c
TL
/**
 * Check whether a snapshot is already removed in the *committed* state.
 *
 * Returns true when (a) the pool itself no longer exists, (b) the snap is
 * queued in the osdmap's removed_snaps_queue, or (c) the snap has already
 * been recorded as purged (lookup_purged_snap succeeds).  Pending
 * (uncommitted) removals are covered separately by
 * _is_pending_removed_snap().
 */
bool OSDMonitor::_is_removed_snap(int64_t pool, snapid_t snap)
{
  if (!osdmap.have_pg_pool(pool)) {
    // pool is gone, so every snap of it is implicitly removed
    dout(10) << __func__ << " pool " << pool << " snap " << snap
             << " - pool dne" << dendl;
    return true;
  }
  if (osdmap.in_removed_snaps_queue(pool, snap)) {
    dout(10) << __func__ << " pool " << pool << " snap " << snap
             << " - in osdmap removed_snaps_queue" << dendl;
    return true;
  }
  snapid_t begin, end;
  int r = lookup_purged_snap(pool, snap, &begin, &end);
  if (r == 0) {
    // snap falls inside an already-purged interval [begin, end)
    dout(10) << __func__ << " pool " << pool << " snap " << snap
             << " - purged, [" << begin << "," << end << ")" << dendl;
    return true;
  }
  return false;
}
13517
13518bool OSDMonitor::_is_pending_removed_snap(int64_t pool, snapid_t snap)
13519{
13520 if (pending_inc.old_pools.count(pool)) {
13521 dout(10) << __func__ << " pool " << pool << " snap " << snap
13522 << " - pool pending deletion" << dendl;
13523 return true;
13524 }
13525 if (pending_inc.in_new_removed_snaps(pool, snap)) {
13526 dout(10) << __func__ << " pool " << pool << " snap " << snap
13527 << " - in pending new_removed_snaps" << dendl;
13528 return true;
13529 }
13530 return false;
13531}
13532
7c673cae
FG
13533bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
13534{
13535 op->mark_osdmon_event(__func__);
9f95a23c 13536 auto m = op->get_req<MPoolOp>();
7c673cae
FG
13537 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
13538 if (pool >= 0) {
13539 _pool_op_reply(op, 0, osdmap.get_epoch());
13540 return true;
13541 }
13542
13543 return false;
13544}
13545
/**
 * Write ("prepare") phase for an MPoolOp.
 *
 * POOL_OP_CREATE / POOL_OP_DELETE are delegated to dedicated handlers.
 * The snapshot ops are first validated against the committed pool, then
 * applied to a projected copy ("pp") of the pool that is written back into
 * pending_inc only if something actually changed.
 *
 * @return true if a proposal was scheduled (reply deferred until the
 *         pending increment commits), false if a reply was already sent.
 */
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();
  dout(10) << "prepare_pool_op " << *m << dendl;
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  // validate against the *committed* pool state; idempotent no-ops and
  // mode violations are answered immediately without a proposal
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (pool->is_tier()) {
      ret = -EINVAL;
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;
    }  // else, fall through
  case POOL_OP_DELETE_SNAP:
    if (!pool->is_unmanaged_snaps_mode()) {
      bool snap_exists = pool->snap_exists(m->name.c_str());
      // creating an existing snap or deleting a missing one is a no-op
      if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
          || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
        ret = 0;
      } else {
        break;
      }
    } else {
      ret = -EINVAL;
    }
    _pool_op_reply(op, ret, osdmap.get_epoch());
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // we won't allow removal of an unmanaged snapshot from a pool
    // not in unmanaged snaps mode.
    if (!pool->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
      return false;
    }
    /* fall-thru */
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // but we will allow creating an unmanaged snapshot on any pool
    // as long as it is not in 'pool' snaps mode.
    if (pool->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return false;
    }
  }

  // projected pool info: start from any already-pending version of the
  // pool so multiple ops in one epoch compose correctly
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // pool snaps vs unmanaged snaps are mutually exclusive; re-check against
  // the projected state too
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // apply the op to the projected pool
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name
               << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
        pp.remove_snap(s);
        pending_inc.new_removed_snaps[m->pool].insert(s);
        changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      // the newly allocated snapid is returned to the client in reply_data
      uint64_t snapid = pp.add_unmanaged_snap(
        osdmap.require_osd_release < ceph_release_t::octopus);
      encode(snapid, reply_data);
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (!_is_removed_snap(m->pool, m->snapid) &&
        !_is_pending_removed_snap(m->pool, m->snapid)) {
      if (m->snapid > pp.get_snap_seq()) {
        // snapid was never allocated for this pool
        _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
        return false;
      }
      pp.remove_unmanaged_snap(
        m->snapid,
        osdmap.require_osd_release < ceph_release_t::octopus);
      pending_inc.new_removed_snaps[m->pool].insert(m->snapid);
      // also record the new seq as purged: this avoids a discontinuity
      // after all of the snaps have been purged, since the seq assigned
      // during removal lives in the same namespace as the actual snaps.
      pending_pseudo_purged_snaps[m->pool].insert(pp.get_snap_seq());
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    // auids were removed; this op is no longer supported
    _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
    return false;

  default:
    ceph_abort();
    break;
  }

  if (changed) {
    // commit the projected pool into the pending increment
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
13700
13701bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
13702{
13703 op->mark_osdmon_event(__func__);
13704 int err = prepare_new_pool(op);
13705 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
13706 return true;
13707}
13708
13709int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
13710 ostream *ss)
13711{
13712 const string& poolstr = osdmap.get_pool_name(pool_id);
13713
13714 // If the Pool is in use by CephFS, refuse to delete it
28e407b8 13715 FSMap const &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
7c673cae
FG
13716 if (pending_fsmap.pool_in_use(pool_id)) {
13717 *ss << "pool '" << poolstr << "' is in use by CephFS";
13718 return -EBUSY;
13719 }
13720
13721 if (pool.tier_of >= 0) {
13722 *ss << "pool '" << poolstr << "' is a tier of '"
13723 << osdmap.get_pool_name(pool.tier_of) << "'";
13724 return -EBUSY;
13725 }
13726 if (!pool.tiers.empty()) {
13727 *ss << "pool '" << poolstr << "' has tiers";
13728 for(auto tier : pool.tiers) {
13729 *ss << " " << osdmap.get_pool_name(tier);
13730 }
13731 return -EBUSY;
13732 }
13733
11fdf7f2 13734 if (!g_conf()->mon_allow_pool_delete) {
7c673cae
FG
13735 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
13736 return -EPERM;
13737 }
13738
13739 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
13740 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
13741 return -EPERM;
13742 }
13743
13744 *ss << "pool '" << poolstr << "' removed";
13745 return 0;
13746}
13747
13748/**
13749 * Check if it is safe to add a tier to a base pool
13750 *
13751 * @return
13752 * True if the operation should proceed, false if we should abort here
13753 * (abort doesn't necessarily mean error, could be idempotency)
13754 */
13755bool OSDMonitor::_check_become_tier(
13756 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
13757 const int64_t base_pool_id, const pg_pool_t *base_pool,
13758 int *err,
13759 ostream *ss) const
13760{
13761 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
13762 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
13763
28e407b8 13764 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
7c673cae
FG
13765 if (pending_fsmap.pool_in_use(tier_pool_id)) {
13766 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
13767 *err = -EBUSY;
13768 return false;
13769 }
13770
13771 if (base_pool->tiers.count(tier_pool_id)) {
11fdf7f2 13772 ceph_assert(tier_pool->tier_of == base_pool_id);
7c673cae
FG
13773 *err = 0;
13774 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
13775 << base_pool_name << "'";
13776 return false;
13777 }
13778
13779 if (base_pool->is_tier()) {
13780 *ss << "pool '" << base_pool_name << "' is already a tier of '"
13781 << osdmap.get_pool_name(base_pool->tier_of) << "', "
13782 << "multiple tiers are not yet supported.";
13783 *err = -EINVAL;
13784 return false;
13785 }
13786
13787 if (tier_pool->has_tiers()) {
13788 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
13789 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
13790 it != tier_pool->tiers.end(); ++it)
13791 *ss << "'" << osdmap.get_pool_name(*it) << "',";
13792 *ss << " multiple tiers are not yet supported.";
13793 *err = -EINVAL;
13794 return false;
13795 }
13796
13797 if (tier_pool->is_tier()) {
13798 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
13799 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
13800 *err = -EINVAL;
13801 return false;
13802 }
13803
13804 *err = 0;
13805 return true;
13806}
13807
13808
13809/**
13810 * Check if it is safe to remove a tier from this base pool
13811 *
13812 * @return
13813 * True if the operation should proceed, false if we should abort here
13814 * (abort doesn't necessarily mean error, could be idempotency)
13815 */
13816bool OSDMonitor::_check_remove_tier(
13817 const int64_t base_pool_id, const pg_pool_t *base_pool,
13818 const pg_pool_t *tier_pool,
13819 int *err, ostream *ss) const
13820{
13821 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
13822
13823 // Apply CephFS-specific checks
28e407b8 13824 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
7c673cae 13825 if (pending_fsmap.pool_in_use(base_pool_id)) {
94b18763
FG
13826 if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
13827 // If the underlying pool is erasure coded and does not allow EC
13828 // overwrites, we can't permit the removal of the replicated tier that
13829 // CephFS relies on to access it
13830 *ss << "pool '" << base_pool_name <<
13831 "' does not allow EC overwrites and is in use by CephFS"
13832 " via its tier";
7c673cae
FG
13833 *err = -EBUSY;
13834 return false;
13835 }
13836
13837 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
13838 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
13839 "tier is still in use as a writeback cache. Change the cache "
13840 "mode and flush the cache before removing it";
13841 *err = -EBUSY;
13842 return false;
13843 }
13844 }
13845
13846 *err = 0;
13847 return true;
13848}
13849
13850int OSDMonitor::_prepare_remove_pool(
13851 int64_t pool, ostream *ss, bool no_fake)
13852{
224ce89b 13853 dout(10) << __func__ << " " << pool << dendl;
7c673cae
FG
13854 const pg_pool_t *p = osdmap.get_pg_pool(pool);
13855 int r = _check_remove_pool(pool, *p, ss);
13856 if (r < 0)
13857 return r;
13858
13859 auto new_pool = pending_inc.new_pools.find(pool);
13860 if (new_pool != pending_inc.new_pools.end()) {
13861 // if there is a problem with the pending info, wait and retry
13862 // this op.
13863 const auto& p = new_pool->second;
13864 int r = _check_remove_pool(pool, p, ss);
13865 if (r < 0)
13866 return -EAGAIN;
13867 }
13868
13869 if (pending_inc.old_pools.count(pool)) {
224ce89b 13870 dout(10) << __func__ << " " << pool << " already pending removal"
7c673cae
FG
13871 << dendl;
13872 return 0;
13873 }
13874
11fdf7f2 13875 if (g_conf()->mon_fake_pool_delete && !no_fake) {
7c673cae
FG
13876 string old_name = osdmap.get_pool_name(pool);
13877 string new_name = old_name + "." + stringify(pool) + ".DELETED";
13878 dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
13879 << old_name << " -> " << new_name << dendl;
13880 pending_inc.new_pool_names[pool] = new_name;
13881 return 0;
13882 }
13883
13884 // remove
13885 pending_inc.old_pools.insert(pool);
13886
224ce89b 13887 // remove any pg_temp mappings for this pool
7c673cae
FG
13888 for (auto p = osdmap.pg_temp->begin();
13889 p != osdmap.pg_temp->end();
13890 ++p) {
11fdf7f2 13891 if (p->first.pool() == pool) {
224ce89b 13892 dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
7c673cae
FG
13893 << p->first << dendl;
13894 pending_inc.new_pg_temp[p->first].clear();
13895 }
13896 }
224ce89b 13897 // remove any primary_temp mappings for this pool
7c673cae
FG
13898 for (auto p = osdmap.primary_temp->begin();
13899 p != osdmap.primary_temp->end();
13900 ++p) {
11fdf7f2 13901 if (p->first.pool() == pool) {
224ce89b 13902 dout(10) << __func__ << " " << pool
7c673cae
FG
13903 << " removing obsolete primary_temp" << p->first << dendl;
13904 pending_inc.new_primary_temp[p->first] = -1;
13905 }
13906 }
224ce89b
WB
13907 // remove any pg_upmap mappings for this pool
13908 for (auto& p : osdmap.pg_upmap) {
11fdf7f2 13909 if (p.first.pool() == pool) {
224ce89b
WB
13910 dout(10) << __func__ << " " << pool
13911 << " removing obsolete pg_upmap "
13912 << p.first << dendl;
13913 pending_inc.old_pg_upmap.insert(p.first);
13914 }
13915 }
94b18763
FG
13916 // remove any pending pg_upmap mappings for this pool
13917 {
13918 auto it = pending_inc.new_pg_upmap.begin();
13919 while (it != pending_inc.new_pg_upmap.end()) {
11fdf7f2 13920 if (it->first.pool() == pool) {
94b18763
FG
13921 dout(10) << __func__ << " " << pool
13922 << " removing pending pg_upmap "
13923 << it->first << dendl;
13924 it = pending_inc.new_pg_upmap.erase(it);
13925 } else {
13926 it++;
13927 }
13928 }
13929 }
224ce89b
WB
13930 // remove any pg_upmap_items mappings for this pool
13931 for (auto& p : osdmap.pg_upmap_items) {
11fdf7f2 13932 if (p.first.pool() == pool) {
224ce89b
WB
13933 dout(10) << __func__ << " " << pool
13934 << " removing obsolete pg_upmap_items " << p.first
13935 << dendl;
13936 pending_inc.old_pg_upmap_items.insert(p.first);
13937 }
13938 }
94b18763
FG
13939 // remove any pending pg_upmap mappings for this pool
13940 {
13941 auto it = pending_inc.new_pg_upmap_items.begin();
13942 while (it != pending_inc.new_pg_upmap_items.end()) {
11fdf7f2 13943 if (it->first.pool() == pool) {
94b18763
FG
13944 dout(10) << __func__ << " " << pool
13945 << " removing pending pg_upmap_items "
13946 << it->first << dendl;
13947 it = pending_inc.new_pg_upmap_items.erase(it);
13948 } else {
13949 it++;
13950 }
13951 }
13952 }
35e4c445
FG
13953
13954 // remove any choose_args for this pool
13955 CrushWrapper newcrush;
13956 _get_pending_crush(newcrush);
13957 if (newcrush.have_choose_args(pool)) {
13958 dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
13959 newcrush.rm_choose_args(pool);
13960 pending_inc.crush.clear();
13961 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
13962 }
7c673cae
FG
13963 return 0;
13964}
13965
13966int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
13967{
13968 dout(10) << "_prepare_rename_pool " << pool << dendl;
13969 if (pending_inc.old_pools.count(pool)) {
13970 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
13971 return -ENOENT;
13972 }
13973 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
13974 p != pending_inc.new_pool_names.end();
13975 ++p) {
13976 if (p->second == newname && p->first != pool) {
13977 return -EEXIST;
13978 }
13979 }
13980
13981 pending_inc.new_pool_names[pool] = newname;
13982 return 0;
13983}
13984
13985bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
13986{
13987 op->mark_osdmon_event(__func__);
9f95a23c 13988 auto m = op->get_req<MPoolOp>();
7c673cae
FG
13989 ostringstream ss;
13990 int ret = _prepare_remove_pool(m->pool, &ss, false);
13991 if (ret == -EAGAIN) {
13992 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13993 return true;
13994 }
13995 if (ret < 0)
13996 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
13997 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
13998 pending_inc.epoch));
13999 return true;
14000}
14001
14002void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
14003 int ret, epoch_t epoch, bufferlist *blp)
14004{
14005 op->mark_osdmon_event(__func__);
9f95a23c 14006 auto m = op->get_req<MPoolOp>();
7c673cae
FG
14007 dout(20) << "_pool_op_reply " << ret << dendl;
14008 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
14009 ret, epoch, get_last_committed(), blp);
14010 mon->send_reply(op, reply);
14011}
81eedcae
TL
14012
14013void OSDMonitor::convert_pool_priorities(void)
14014{
14015 pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key;
14016 int64_t max_prio = 0;
14017 int64_t min_prio = 0;
14018 for (const auto &i : osdmap.get_pools()) {
14019 const auto &pool = i.second;
14020
14021 if (pool.opts.is_set(key)) {
9f95a23c 14022 int64_t prio = 0;
81eedcae
TL
14023 pool.opts.get(key, &prio);
14024 if (prio > max_prio)
14025 max_prio = prio;
14026 if (prio < min_prio)
14027 min_prio = prio;
14028 }
14029 }
14030 if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) {
14031 dout(20) << __func__ << " nothing to fix" << dendl;
14032 return;
14033 }
14034 // Current pool priorities exceeds new maximum
14035 for (const auto &i : osdmap.get_pools()) {
14036 const auto pool_id = i.first;
14037 pg_pool_t pool = i.second;
14038
14039 int64_t prio = 0;
14040 pool.opts.get(key, &prio);
14041 int64_t n;
14042
14043 if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario
14044 // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
14045 n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX;
14046 } else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) {
14047 // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
14048 n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN;
14049 } else {
14050 continue;
14051 }
14052 if (n == 0) {
14053 pool.opts.unset(key);
14054 } else {
14055 pool.opts.set(key, static_cast<int64_t>(n));
14056 }
14057 dout(10) << __func__ << " pool " << pool_id
14058 << " recovery_priority adjusted "
14059 << prio << " to " << n << dendl;
14060 pool.last_change = pending_inc.epoch;
14061 pending_inc.new_pools[pool_id] = pool;
14062 }
14063}