]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/OSDMonitor.cc
import ceph 14.2.5
[ceph.git] / ceph / src / mon / OSDMonitor.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19#include <algorithm>
224ce89b 20#include <boost/algorithm/string.hpp>
11fdf7f2 21#include <experimental/iterator>
224ce89b 22#include <locale>
7c673cae
FG
23#include <sstream>
24
31f18b77
FG
25#include "mon/OSDMonitor.h"
26#include "mon/Monitor.h"
27#include "mon/MDSMonitor.h"
31f18b77
FG
28#include "mon/MgrStatMonitor.h"
29#include "mon/AuthMonitor.h"
30#include "mon/ConfigKeyService.h"
7c673cae 31
31f18b77
FG
32#include "mon/MonitorDBStore.h"
33#include "mon/Session.h"
7c673cae
FG
34
35#include "crush/CrushWrapper.h"
36#include "crush/CrushTester.h"
37#include "crush/CrushTreeDumper.h"
38
39#include "messages/MOSDBeacon.h"
40#include "messages/MOSDFailure.h"
41#include "messages/MOSDMarkMeDown.h"
42#include "messages/MOSDFull.h"
43#include "messages/MOSDMap.h"
44#include "messages/MMonGetOSDMap.h"
45#include "messages/MOSDBoot.h"
46#include "messages/MOSDAlive.h"
47#include "messages/MPoolOp.h"
48#include "messages/MPoolOpReply.h"
49#include "messages/MOSDPGCreate.h"
11fdf7f2 50#include "messages/MOSDPGCreate2.h"
7c673cae
FG
51#include "messages/MOSDPGCreated.h"
52#include "messages/MOSDPGTemp.h"
11fdf7f2 53#include "messages/MOSDPGReadyToMerge.h"
7c673cae
FG
54#include "messages/MMonCommand.h"
55#include "messages/MRemoveSnaps.h"
56#include "messages/MOSDScrub.h"
57#include "messages/MRoute.h"
58
59#include "common/TextTable.h"
60#include "common/Timer.h"
61#include "common/ceph_argparse.h"
62#include "common/perf_counters.h"
eafe8130 63#include "common/PriorityCache.h"
7c673cae 64#include "common/strtol.h"
11fdf7f2 65#include "common/numa.h"
7c673cae
FG
66
67#include "common/config.h"
68#include "common/errno.h"
69
70#include "erasure-code/ErasureCodePlugin.h"
71#include "compressor/Compressor.h"
72#include "common/Checksummer.h"
73
74#include "include/compat.h"
11fdf7f2 75#include "include/ceph_assert.h"
7c673cae
FG
76#include "include/stringify.h"
77#include "include/util.h"
78#include "common/cmdparse.h"
79#include "include/str_list.h"
80#include "include/str_map.h"
224ce89b 81#include "include/scope_guard.h"
eafe8130 82#include "perfglue/heap_profiler.h"
7c673cae 83
28e407b8
AA
84#include "auth/cephx/CephxKeyServer.h"
85#include "osd/OSDCap.h"
86
7c673cae
FG
87#include "json_spirit/json_spirit_reader.h"
88
c07f9fc5
FG
89#include <boost/algorithm/string/predicate.hpp>
90
#define dout_subsys ceph_subsys_mon

// kv-store key prefixes owned by the OSDMonitor service, advertised to the
// MonitorDBStore via get_store_prefixes() for sync/trim purposes.
static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
static const string OSD_METADATA_PREFIX("osd_metadata");
static const string OSD_SNAP_PREFIX("osd_snap");
c07f9fc5
FG
96namespace {
97
eafe8130
TL
// Adapter exposing an OSDMonitor-owned LRU osdmap cache to the
// PriorityCache manager (pcm).  Tracks the bytes assigned per priority
// level, the size last committed by the manager, and this cache's share
// ratio.  Concrete subclasses report the bytes actually in use.
struct OSDMemCache : public PriorityCache::PriCache {
  OSDMonitor *osdmon;  // back-pointer to the owning monitor service
  int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};  // bytes assigned per priority
  int64_t committed_bytes = 0;  // total bytes last committed by the manager
  double cache_ratio = 0;       // fraction of the shared cache granted to us

  OSDMemCache(OSDMonitor *m) : osdmon(m) {};

  // Bytes currently consumed by the underlying LRU cache.
  virtual uint64_t _get_used_bytes() const = 0;

  // Request additional bytes at priority 'pri': the gap between what is
  // actually used and what has already been assigned.  Only PRI1 is
  // supported; other priorities yield -EOPNOTSUPP.
  virtual int64_t request_cache_bytes(
      PriorityCache::Priority pri, uint64_t total_cache) const {
    int64_t assigned = get_cache_bytes(pri);

    switch (pri) {
    // All cache items are currently set to have PRI1 priority
    case PriorityCache::Priority::PRI1:
      {
        int64_t request = _get_used_bytes();
        return (request > assigned) ? request - assigned : 0;
      }
    default:
      break;
    }
    return -EOPNOTSUPP;
  }

  // Bytes assigned at a single priority level.
  virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
    return cache_bytes[pri];
  }

  // Sum of bytes assigned across all priority levels.
  virtual int64_t get_cache_bytes() const {
    int64_t total = 0;

    for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
      PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
      total += get_cache_bytes(pri);
    }
    return total;
  }

  virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] = bytes;
  }
  virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] += bytes;
  }
  // Round the assigned total to a chunk of 'total_cache' and remember it
  // as the committed size.
  virtual int64_t commit_cache_size(uint64_t total_cache) {
    committed_bytes = PriorityCache::get_chunk(
        get_cache_bytes(), total_cache);
    return committed_bytes;
  }
  virtual int64_t get_committed_size() const {
    return committed_bytes;
  }
  virtual double get_cache_ratio() const {
    return cache_ratio;
  }
  virtual void set_cache_ratio(double ratio) {
    cache_ratio = ratio;
  }
  // Human-readable name used in pcm logging.
  virtual string get_cache_name() const = 0;
};
161
162struct IncCache : public OSDMemCache {
163 IncCache(OSDMonitor *m) : OSDMemCache(m) {};
164
165 virtual uint64_t _get_used_bytes() const {
166 return osdmon->inc_osd_cache.get_bytes();
167 }
168
169 virtual string get_cache_name() const {
170 return "OSDMap Inc Cache";
171 }
172
173 uint64_t _get_num_osdmaps() const {
174 return osdmon->inc_osd_cache.get_size();
175 }
176};
177
178struct FullCache : public OSDMemCache {
179 FullCache(OSDMonitor *m) : OSDMemCache(m) {};
180
181 virtual uint64_t _get_used_bytes() const {
182 return osdmon->full_osd_cache.get_bytes();
183 }
184
185 virtual string get_cache_name() const {
186 return "OSDMap Full Cache";
187 }
188
189 uint64_t _get_num_osdmaps() const {
190 return osdmon->full_osd_cache.get_size();
191 }
192};
193
// Shared cache adapters; created by the OSDMonitor constructor and later
// registered with the priority cache manager.
std::shared_ptr<IncCache> inc_cache;
std::shared_ptr<FullCache> full_cache;

// Hard limits on per-pool application metadata.
const uint32_t MAX_POOL_APPLICATIONS = 4;
const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
200
28e407b8
AA
201bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
202 // Note: this doesn't include support for the application tag match
203 if ((grant.spec.allow & OSD_CAP_W) != 0) {
204 auto& match = grant.match;
205 if (match.is_match_all()) {
206 return true;
11fdf7f2 207 } else if (pool_name != nullptr &&
28e407b8
AA
208 !match.pool_namespace.pool_name.empty() &&
209 match.pool_namespace.pool_name == *pool_name) {
210 return true;
211 }
212 }
213 return false;
214}
215
// Decide whether 'entity_name' may perform unmanaged-snapshot pool ops.
// Allowed either via an explicit mon cap for the "osd pool op
// unmanaged-snap" command (restricted to 'pool_name' when one is given;
// an unrestricted cap is required when the pool does not exist), or —
// failing that — via OSD caps that allow writes to the pool or to all
// pools.
bool is_unmanaged_snap_op_permitted(CephContext* cct,
                                    const KeyServer& key_server,
                                    const EntityName& entity_name,
                                    const MonCap& mon_caps,
                                    const entity_addr_t& peer_socket_addr,
                                    const std::string* pool_name)
{
  typedef std::map<std::string, std::string> CommandArgs;

  if (mon_caps.is_capable(
	cct, CEPH_ENTITY_TYPE_MON,
	entity_name, "osd",
	"osd pool op unmanaged-snap",
	(pool_name == nullptr ?
	 CommandArgs{} /* pool DNE, require unrestricted cap */ :
	 CommandArgs{{"poolname", *pool_name}}),
	false, true, false,
	peer_socket_addr)) {
    return true;
  }

  // No mon cap: fall back to the entity's OSD caps from the auth db.
  AuthCapsInfo caps_info;
  if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
                                   caps_info)) {
    dout(10) << "unable to locate OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  string caps_str;
  if (caps_info.caps.length() > 0) {
    auto p = caps_info.caps.cbegin();
    try {
      decode(caps_str, p);
    } catch (const buffer::error &err) {
      derr << "corrupt OSD cap data for " << entity_name << " in auth db"
           << dendl;
      return false;
    }
  }

  OSDCap osd_cap;
  if (!osd_cap.parse(caps_str, nullptr)) {
    dout(10) << "unable to parse OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  // if the entity has write permissions in one or all pools, permit
  // usage of unmanaged-snapshots
  if (osd_cap.allow_all()) {
    return true;
  }

  for (auto& grant : osd_cap.grants) {
    if (grant.profile.is_valid()) {
      // profile grants expand to a list of concrete grants; any writable
      // one suffices
      for (auto& profile_grant : grant.profile_grants) {
        if (is_osd_writable(profile_grant, pool_name)) {
          return true;
        }
      }
    } else if (is_osd_writable(grant, pool_name)) {
      return true;
    }
  }

  return false;
}
284
c07f9fc5
FG
285} // anonymous namespace
286
7c673cae
FG
// Record that pg 'ps' of this pool reported 'last_epoch_clean', and keep
// two pool-wide summaries consistent: 'floor' (the minimum lec over all
// reporting pgs) and 'next_missing' (the first pg that has not reported
// yet; epoch 0 in epoch_by_pg marks "no report").
void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean)
{
  if (epoch_by_pg.size() <= ps) {
    epoch_by_pg.resize(ps + 1, 0);
  }
  const auto old_lec = epoch_by_pg[ps];
  if (old_lec >= last_epoch_clean) {
    // stale lec
    return;
  }
  epoch_by_pg[ps] = last_epoch_clean;
  if (last_epoch_clean < floor) {
    floor = last_epoch_clean;
  } else if (last_epoch_clean > floor) {
    if (old_lec == floor) {
      // probably should increase floor?
      // the pg that defined the floor moved forward; rescan for the new
      // minimum
      auto new_floor = std::min_element(std::begin(epoch_by_pg),
					std::end(epoch_by_pg));
      floor = *new_floor;
    }
  }
  if (ps != next_missing) {
    return;
  }
  // this was the first unreported pg; advance past every pg that has now
  // reported
  for (; next_missing < epoch_by_pg.size(); next_missing++) {
    if (epoch_by_pg[next_missing] == 0) {
      break;
    }
  }
}
317
318void LastEpochClean::remove_pool(uint64_t pool)
319{
320 report_by_pool.erase(pool);
321}
322
323void LastEpochClean::report(const pg_t& pg, epoch_t last_epoch_clean)
324{
325 auto& lec = report_by_pool[pg.pool()];
326 return lec.report(pg.ps(), last_epoch_clean);
327}
328
329epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
330{
331 auto floor = latest.get_epoch();
332 for (auto& pool : latest.get_pools()) {
333 auto reported = report_by_pool.find(pool.first);
334 if (reported == report_by_pool.end()) {
335 return 0;
336 }
337 if (reported->second.next_missing < pool.second.get_pg_num()) {
338 return 0;
339 }
340 if (reported->second.floor < floor) {
341 floor = reported->second.floor;
342 }
343 }
344 return floor;
345}
346
347
11fdf7f2
TL
// Completion for an async osdmap mapping job: on success, refresh the
// creating-pgs bookkeeping and wake subscribers waiting on pg creations.
class C_UpdateCreatingPGs : public Context {
public:
  OSDMonitor *osdmon;
  utime_t start;   // when the mapping job was started, for timing the log line
  epoch_t epoch;   // osdmap epoch the mapping was computed against
  C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
    osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
  void finish(int r) override {
    if (r >= 0) {
      utime_t end = ceph_clock_now();
      dout(10) << "osdmap epoch " << epoch << " mapping took "
	       << (end - start) << " seconds" << dendl;
      osdmon->update_creating_pgs();
      osdmon->check_pg_creates_subs();
    }
  }
};
365
#undef dout_prefix
#define dout_prefix _prefix(_dout, mon, osdmap)
// Log-line prefix for this service: mon name/rank/state plus osdmap epoch.
static ostream& _prefix(std::ostream *_dout, Monitor *mon, const OSDMap& osdmap) {
  return *_dout << "mon." << mon->name << "@" << mon->rank
		<< "(" << mon->get_state_name()
		<< ").osd e" << osdmap.get_epoch() << " ";
}
373
// Construct the OSD paxos service: size the inc/full LRU map caches from
// mon_osd_cache_size, wire them into the priority-cache adapters, and
// register as a config observer so cache settings can be retuned live.
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor *mn,
  Paxos *p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf()->mon_osd_cache_size),
   full_osd_cache(g_conf()->mon_osd_cache_size),
   has_osdmap_manifest(false),
   mapper(mn->cct, &mn->cpu_tp)
{
  inc_cache = std::make_shared<IncCache>(this);
  full_cache = std::make_shared<FullCache>(this);
  cct->_conf.add_observer(this);
  // _set_cache_sizes() validates mon_memory_target/mon_osd_cache_size_min;
  // on failure we fall back to fixed-size caches (no pcm management).
  int r = _set_cache_sizes();
  if (r < 0) {
    derr << __func__ << " using default osd cache size - mon_osd_cache_size ("
         << g_conf()->mon_osd_cache_size
         << ") without priority cache management"
         << dendl;
  }
}
397
398const char **OSDMonitor::get_tracked_conf_keys() const
399{
400 static const char* KEYS[] = {
401 "mon_memory_target",
402 "mon_memory_autotune",
403 "rocksdb_cache_size",
404 NULL
405 };
406 return KEYS;
407}
408
409void OSDMonitor::handle_conf_change(const ConfigProxy& conf,
410 const std::set<std::string> &changed)
411{
412 dout(10) << __func__ << " " << changed << dendl;
413
414 if (changed.count("mon_memory_autotune")) {
415 _set_cache_autotuning();
416 }
417 if (changed.count("mon_memory_target") ||
418 changed.count("rocksdb_cache_size")) {
419 int r = _update_mon_cache_settings();
420 if (r < 0) {
421 derr << __func__ << " mon_memory_target:"
422 << g_conf()->mon_memory_target
423 << " rocksdb_cache_size:"
424 << g_conf()->rocksdb_cache_size
425 << ". Invalid size provided."
426 << dendl;
427 }
428 }
429}
430
// React to a change of mon_memory_autotune: drop the priority cache
// manager when autotuning is turned off, or (re)register our caches with
// a new manager when it is turned on.
void OSDMonitor::_set_cache_autotuning()
{
  if (!g_conf()->mon_memory_autotune && pcm != nullptr) {
    // Disable cache autotuning
    std::lock_guard l(balancer_lock);
    pcm = nullptr;
  }

  if (g_conf()->mon_memory_autotune && pcm == nullptr) {
    int r = register_cache_with_pcm();
    if (r < 0) {
      dout(10) << __func__
               << " Error while registering osdmon caches with pcm."
               << " Cache auto tuning not enabled."
               << dendl;
      mon_memory_autotune = false;
    } else {
      mon_memory_autotune = true;
    }
  }
}
452
// Apply new mon_memory_target / rocksdb_cache_size values: recompute the
// kv/inc/full cache ratios and, if autotuning is active, push the new
// min/max/target sizes into the priority cache manager.  Returns -EINVAL
// (restoring the previous values) when the new sizes are invalid.
int OSDMonitor::_update_mon_cache_settings()
{
  if (g_conf()->mon_memory_target <= 0 ||
      g_conf()->mon_memory_target < mon_memory_min ||
      g_conf()->rocksdb_cache_size <= 0) {
    return -EINVAL;
  }

  // remember old values so we can roll back if the ratios don't work out
  uint64_t old_mon_memory_target = mon_memory_target;
  uint64_t old_rocksdb_cache_size = rocksdb_cache_size;

  // Set the new pcm memory cache sizes
  mon_memory_target = g_conf()->mon_memory_target;
  rocksdb_cache_size = g_conf()->rocksdb_cache_size;

  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // same sizing logic as register_cache_with_pcm(): discount expected
  // fragmentation and base overhead from the target to get the max
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    mon_memory_target = old_mon_memory_target;
    rocksdb_cache_size = old_rocksdb_cache_size;
    return -EINVAL;
  }

  if (mon_memory_autotune && pcm != nullptr) {
    std::lock_guard l(balancer_lock);
    // set pcm cache levels
    pcm->set_target_memory(target);
    pcm->set_min_memory(min);
    pcm->set_max_memory(max);
    // tune memory based on new values
    pcm->tune_memory();
    pcm->balance();
    _set_new_cache_sizes();
    dout(10) << __func__ << " Updated mon cache setting."
             << " target: " << target
             << " min: " << min
             << " max: " << max
             << dendl;
  }
  return 0;
}
507
// Initialize cache sizing state from config.  When autotuning is enabled,
// validate the target/min sizes, seed the inc/full LRU caches with the
// minimum byte budget, and record that autotuning is on.  Returns -EINVAL
// on invalid sizes (caller falls back to fixed-size caches).
int OSDMonitor::_set_cache_sizes()
{
  if (g_conf()->mon_memory_autotune) {
    // set the new osdmon cache targets to be managed by pcm
    mon_osd_cache_size = g_conf()->mon_osd_cache_size;
    rocksdb_cache_size = g_conf()->rocksdb_cache_size;
    // reuse the OSD's base/fragmentation estimates for sizing
    mon_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
    mon_memory_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
    mon_memory_target = g_conf()->mon_memory_target;
    mon_memory_min = g_conf()->mon_osd_cache_size_min;
    if (mon_memory_target <= 0 || mon_memory_min <= 0) {
      derr << __func__ << " mon_memory_target:" << mon_memory_target
           << " mon_memory_min:" << mon_memory_min
           << ". Invalid size option(s) provided."
           << dendl;
      return -EINVAL;
    }
    // Set the initial inc and full LRU cache sizes
    inc_osd_cache.set_bytes(mon_memory_min);
    full_osd_cache.set_bytes(mon_memory_min);
    mon_memory_autotune = g_conf()->mon_memory_autotune;
  }
  return 0;
}
7c673cae
FG
532
533bool OSDMonitor::_have_pending_crush()
534{
535 return pending_inc.crush.length() > 0;
536}
537
538CrushWrapper &OSDMonitor::_get_stable_crush()
539{
540 return *osdmap.crush;
541}
542
543void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush)
544{
545 bufferlist bl;
546 if (pending_inc.crush.length())
547 bl = pending_inc.crush;
548 else
549 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
550
11fdf7f2 551 auto p = bl.cbegin();
7c673cae
FG
552 newcrush.decode(p);
553}
554
// Build the epoch-1 osdmap for a brand new cluster (from the mkfs-seeded
// map if one was provided, else a simple default map), set initial flags
// and full-ratio thresholds, and stage the encoded full map in the
// pending incremental.
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon->monmap->fsid << dendl;

  OSDMap newmap;

  bufferlist bl;
  mon->store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    // a seed osdmap was stashed at mkfs time; adopt it with our fsid
    newmap.decode(bl);
    newmap.set_fsid(mon->monmap->fsid);
  } else {
    newmap.build_simple(cct, 0, mon->monmap->fsid, 0);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  newmap.flags |=
    CEPH_OSDMAP_RECOVERY_DELETES |
    CEPH_OSDMAP_PURGED_SNAPDIRS |
    CEPH_OSDMAP_PGLOG_HARDLIMIT;
  // ratios may be configured as percentages (>1.0); normalize to fractions
  newmap.full_ratio = g_conf()->mon_osd_full_ratio;
  if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
  newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
  if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
  newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
  if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;

  // new cluster should require latest by default
  if (g_conf().get_val<bool>("mon_debug_no_require_nautilus")) {
    if (g_conf()->mon_debug_no_require_mimic) {
      derr << __func__ << " mon_debug_no_require_mimic=true and nautilus=true" << dendl;
      newmap.require_osd_release = CEPH_RELEASE_LUMINOUS;
    } else {
      derr << __func__ << " mon_debug_no_require_nautilus=true" << dendl;
      newmap.require_osd_release = CEPH_RELEASE_MIMIC;
    }
  } else {
    newmap.require_osd_release = CEPH_RELEASE_NAUTILUS;
    int r = ceph_release_from_name(
      g_conf()->mon_osd_initial_require_min_compat_client.c_str());
    if (r <= 0) {
      ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
    }
    newmap.require_min_compat_client = r;
  }

  // encode into pending incremental
  uint64_t features = newmap.get_encoding_features();
  newmap.encode(pending_inc.fullmap,
                features | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
613
11fdf7f2 614void OSDMonitor::get_store_prefixes(std::set<string>& s) const
7c673cae
FG
615{
616 s.insert(service_name);
617 s.insert(OSD_PG_CREATING_PREFIX);
3efd9988 618 s.insert(OSD_METADATA_PREFIX);
11fdf7f2 619 s.insert(OSD_SNAP_PREFIX);
7c673cae
FG
620}
621
// Bring the in-memory osdmap up to the last paxos-committed version:
// reload the manifest and health state, find/repair the latest stashed
// full map, then apply each committed incremental in order, writing out
// full maps and maintaining derived state (creating pgs, down->out
// timers, failure reports, msgr feature bits, subscriptions).
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  // we really don't care if the version has been updated, because we may
  // have trimmed without having increased the last committed; yet, we may
  // need to update the in-memory manifest.
  load_osdmap_manifest();

  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  ceph_assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
	   << ", my e " << osdmap.epoch << dendl;

  // the map is changing; any in-flight pg mapping job is now stale
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
	      << mapping_job.get() << " did not complete, "
	      << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  load_health();

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'.  This is only done when we are building the
   * full maps from the incremental versions.  But don't panic!  We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
	     << " [" << fc << ", " << lc << "]" << dendl;

    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon->store->exists(get_service_name(), full_key)) {
	dout(10) << __func__ << " found latest full map v " << v << dendl;
	latest_full = v;
	break;
      }
    }

    ceph_assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon->store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
             << latest_full << dendl;
  }

  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    // jump straight to the newest stashed full map before replaying
    // incrementals on top of it
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    ceph_assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap = OSDMap();
    osdmap.decode(latest_bl);
  }

  bufferlist bl;
  if (!mon->store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
    auto p = bl.cbegin();
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    creating_pgs.decode(p);
    dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
	    << creating_pgs.last_scan_epoch
	    << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
  } else {
    dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
	    << dendl;
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    ceph_assert(err == 0);
    ceph_assert(inc_bl.length());
    // set priority cache manager levels if the osdmap is
    // being populated for the first time.
    if (mon_memory_autotune && pcm == nullptr) {
      int r = register_cache_with_pcm();
      if (r < 0) {
	dout(10) << __func__
		 << " Error while registering osdmon caches with pcm."
		 << " Proceeding without cache auto tuning."
		 << dendl;
      }
    }

    dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
	    << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    ceph_assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs.  Encode the full
    // map with the same features as the incremental.  If we don't
    // know, use the quorum features.  If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon->get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      ceph_assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
	// This will happen if the mons were running mixed versions in
	// the past or some other circumstance made the full encoded
	// maps divergent.  Reloading here will bring us back into
	// sync with the primary for this and all future maps.  OSDs
	// will also be brought back into sync when they discover the
	// crc mismatch and request a full map from a mon.
	derr << __func__ << " full map CRC mismatch, resetting to canonical"
	     << dendl;

	// NOTE: the dout macro opens a scope that the matching dendl
	// closes, so each JSONFormatter below lives in its own scope.
	dout(20) << __func__ << " my (bad) full osdmap:\n";
	JSONFormatter jf(true);
	jf.dump_object("osdmap", osdmap);
	jf.flush(*_dout);
	*_dout << "\nhexdump:\n";
	full_bl.hexdump(*_dout);
	*_dout << dendl;

	osdmap = OSDMap();
	osdmap.decode(orig_full_bl);

	dout(20) << __func__ << " canonical full osdmap:\n";
	JSONFormatter jf(true);
	jf.dump_object("osdmap", osdmap);
	jf.flush(*_dout);
	*_dout << "\nhexdump:\n";
	orig_full_bl.hexdump(*_dout);
	*_dout << dendl;
      }
    } else {
      ceph_assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    if (osdmap.epoch == 1) {
      t->erase("mkfs", "osdmap");
    }

    // flush the transaction periodically so it doesn't grow unbounded
    if (tx_size > g_conf()->mon_sync_max_payload_size*2) {
      mon->store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    for (const auto &osd_state : inc.new_state) {
      if (osd_state.second & CEPH_OSD_UP) {
	// could be marked up *or* down, but we're too lazy to check which
	last_osd_report.erase(osd_state.first);
      }
      if (osd_state.second & CEPH_OSD_EXISTS) {
	// could be created *or* destroyed, but we can safely drop it
	osd_epochs.erase(osd_state.first);
      }
    }
  }

  if (t) {
    mon->store->apply_transaction(t);
  }

  // rebuild the down -> pending-out tracking from the new map
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
	dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
	down_pending_out[o] = ceph_clock_now();
      }
    } else {
      if (found != down_pending_out.end()) {
	dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
	down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();
  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon->is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
}
859
eafe8130
TL
// Create the PriorityCache manager and register the rocksdb kv cache plus
// our inc/full osdmap caches with it.  Returns -EINVAL when sizes are
// invalid, the backing store is not rocksdb, or ratios cannot be set.
int OSDMonitor::register_cache_with_pcm()
{
  if (mon_memory_target <= 0 || mon_memory_min <= 0) {
    derr << __func__ << " Invalid memory size specified for mon caches."
         << " Caches will not be auto-tuned."
         << dendl;
    return -EINVAL;
  }
  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  // For calculating total target memory, consider rocksdb cache size.
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // Apply the same logic as in bluestore to set the max amount
  // of memory to use for cache. Assume base memory for OSDMaps
  // and then add in some overhead for fragmentation.
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  rocksdb_binned_kv_cache = mon->store->get_priority_cache();
  if (!rocksdb_binned_kv_cache) {
    derr << __func__ << " not using rocksdb" << dendl;
    return -EINVAL;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    return -EINVAL;
  }

  pcm = std::make_shared<PriorityCache::Manager>(
      cct, min, max, target, true);
  pcm->insert("kv", rocksdb_binned_kv_cache, true);
  pcm->insert("inc", inc_cache, true);
  pcm->insert("full", full_cache, true);
  dout(10) << __func__ << " pcm target: " << target
           << " pcm max: " << max
           << " pcm min: " << min
           << " inc_osd_cache size: " << inc_osd_cache.get_size()
           << dendl;
  return 0;
}
909
910int OSDMonitor::_set_cache_ratios()
911{
912 double old_cache_kv_ratio = cache_kv_ratio;
913
914 // Set the cache ratios for kv(rocksdb), inc and full caches
915 cache_kv_ratio = (double)rocksdb_cache_size / (double)mon_memory_target;
916 if (cache_kv_ratio >= 1.0) {
917 derr << __func__ << " Cache kv ratio (" << cache_kv_ratio
918 << ") must be in range [0,<1.0]."
919 << dendl;
920 cache_kv_ratio = old_cache_kv_ratio;
921 return -EINVAL;
922 }
923 rocksdb_binned_kv_cache->set_cache_ratio(cache_kv_ratio);
924 cache_inc_ratio = cache_full_ratio = (1.0 - cache_kv_ratio) / 2;
925 inc_cache->set_cache_ratio(cache_inc_ratio);
926 full_cache->set_cache_ratio(cache_full_ratio);
927
928 dout(10) << __func__ << " kv ratio " << cache_kv_ratio
929 << " inc ratio " << cache_inc_ratio
930 << " full ratio " << cache_full_ratio
931 << dendl;
932 return 0;
933}
934
7c673cae
FG
// Start (or restart) the background job that computes pg->osd mappings
// for the current osdmap on the cpu threadpool; C_UpdateCreatingPGs runs
// when it completes.  No job is started when there are no pools.
void OSDMonitor::start_mapping()
{
  // initiate mapping job
  if (mapping_job) {
    dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
	     << dendl;
    mapping_job->abort();
  }
  if (!osdmap.get_pools().empty()) {
    auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
    mapping_job = mapping.start_update(osdmap, mapper,
				       g_conf()->mon_osd_mapping_pgs_per_chunk);
    dout(10) << __func__ << " started mapping job " << mapping_job.get()
	     << " at " << fin->start << dendl;
    mapping_job->set_finish_event(fin);
  } else {
    dout(10) << __func__ << " no pools, no mapping job" << dendl;
    mapping_job = nullptr;
  }
}
955
956void OSDMonitor::update_msgr_features()
957{
958 set<int> types;
959 types.insert((int)entity_name_t::TYPE_OSD);
960 types.insert((int)entity_name_t::TYPE_CLIENT);
961 types.insert((int)entity_name_t::TYPE_MDS);
962 types.insert((int)entity_name_t::TYPE_MON);
963 for (set<int>::iterator q = types.begin(); q != types.end(); ++q) {
964 uint64_t mask;
965 uint64_t features = osdmap.get_features(*q, &mask);
966 if ((mon->messenger->get_policy(*q).features_required & mask) != features) {
967 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
11fdf7f2 968 ceph::net::Policy p = mon->messenger->get_policy(*q);
7c673cae
FG
969 p.features_required = (p.features_required & ~mask) | features;
970 mon->messenger->set_policy(*q, p);
971 }
972 }
973}
974
// Called when the paxos service becomes active.  The leader logs the map
// and performs the one-time pool priority conversion; peons replay any
// failure reports that queued up while inactive.  Both restart mapping.
void OSDMonitor::on_active()
{
  update_logger();

  if (mon->is_leader()) {
    mon->clog->debug() << "osdmap " << osdmap;
    if (!priority_convert) {
      // Only do this once at start-up
      convert_pool_priorities();
      priority_convert = true;
    }
  } else {
    // re-dispatch failure reports accumulated while we weren't active
    list<MonOpRequestRef> ls;
    take_all_failures(ls);
    while (!ls.empty()) {
      MonOpRequestRef op = ls.front();
      op->mark_osdmon_event(__func__);
      dispatch(op);
      ls.pop_front();
    }
  }
  start_mapping();
}
998
999void OSDMonitor::on_restart()
1000{
1001 last_osd_report.clear();
1002}
1003
1004void OSDMonitor::on_shutdown()
1005{
1006 dout(10) << __func__ << dendl;
1007 if (mapping_job) {
1008 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
1009 << dendl;
1010 mapping_job->abort();
1011 }
1012
1013 // discard failure info, waiters
1014 list<MonOpRequestRef> ls;
1015 take_all_failures(ls);
1016 ls.clear();
1017}
1018
1019void OSDMonitor::update_logger()
1020{
1021 dout(10) << "update_logger" << dendl;
1022
1023 mon->cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
1024 mon->cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
1025 mon->cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
1026 mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
1027}
1028
7c673cae
FG
1029void OSDMonitor::create_pending()
1030{
1031 pending_inc = OSDMap::Incremental(osdmap.epoch+1);
1032 pending_inc.fsid = mon->monmap->fsid;
11fdf7f2
TL
1033 pending_metadata.clear();
1034 pending_metadata_rm.clear();
7c673cae
FG
1035
1036 dout(10) << "create_pending e " << pending_inc.epoch << dendl;
1037
11fdf7f2
TL
1038 // safety checks (this shouldn't really happen)
1039 {
1040 if (osdmap.backfillfull_ratio <= 0) {
1041 pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
1042 if (pending_inc.new_backfillfull_ratio > 1.0)
1043 pending_inc.new_backfillfull_ratio /= 100;
1044 dout(1) << __func__ << " setting backfillfull_ratio = "
1045 << pending_inc.new_backfillfull_ratio << dendl;
7c673cae 1046 }
7c673cae 1047 if (osdmap.full_ratio <= 0) {
11fdf7f2 1048 pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio;
7c673cae
FG
1049 if (pending_inc.new_full_ratio > 1.0)
1050 pending_inc.new_full_ratio /= 100;
1051 dout(1) << __func__ << " setting full_ratio = "
1052 << pending_inc.new_full_ratio << dendl;
1053 }
1054 if (osdmap.nearfull_ratio <= 0) {
11fdf7f2 1055 pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
7c673cae
FG
1056 if (pending_inc.new_nearfull_ratio > 1.0)
1057 pending_inc.new_nearfull_ratio /= 100;
1058 dout(1) << __func__ << " setting nearfull_ratio = "
1059 << pending_inc.new_nearfull_ratio << dendl;
1060 }
1061 }
3efd9988
FG
1062
1063 // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
1064 // structure.
1065 if (osdmap.crush->has_legacy_rule_ids()) {
1066 CrushWrapper newcrush;
1067 _get_pending_crush(newcrush);
1068
1069 // First, for all pools, work out which rule they really used
1070 // by resolving ruleset to rule.
1071 for (const auto &i : osdmap.get_pools()) {
1072 const auto pool_id = i.first;
1073 const auto &pool = i.second;
1074 int new_rule_id = newcrush.find_rule(pool.crush_rule,
1075 pool.type, pool.size);
1076
1077 dout(1) << __func__ << " rewriting pool "
1078 << osdmap.get_pool_name(pool_id) << " crush ruleset "
1079 << pool.crush_rule << " -> rule id " << new_rule_id << dendl;
1080 if (pending_inc.new_pools.count(pool_id) == 0) {
1081 pending_inc.new_pools[pool_id] = pool;
1082 }
1083 pending_inc.new_pools[pool_id].crush_rule = new_rule_id;
1084 }
1085
1086 // Now, go ahead and renumber all the rules so that their
1087 // rule_id field corresponds to their position in the array
1088 auto old_to_new = newcrush.renumber_rules();
1089 dout(1) << __func__ << " Rewrote " << old_to_new << " crush IDs:" << dendl;
1090 for (const auto &i : old_to_new) {
1091 dout(1) << __func__ << " " << i.first << " -> " << i.second << dendl;
1092 }
1093 pending_inc.crush.clear();
1094 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
1095 }
7c673cae
FG
1096}
1097
1098creating_pgs_t
94b18763
FG
1099OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
1100 const OSDMap& nextmap)
7c673cae 1101{
31f18b77 1102 dout(10) << __func__ << dendl;
7c673cae
FG
1103 creating_pgs_t pending_creatings;
1104 {
1105 std::lock_guard<std::mutex> l(creating_pgs_lock);
1106 pending_creatings = creating_pgs;
1107 }
31f18b77
FG
1108 // check for new or old pools
1109 if (pending_creatings.last_scan_epoch < inc.epoch) {
31f18b77
FG
1110 unsigned queued = 0;
1111 queued += scan_for_creating_pgs(osdmap.get_pools(),
1112 inc.old_pools,
1113 inc.modified,
1114 &pending_creatings);
1115 queued += scan_for_creating_pgs(inc.new_pools,
1116 inc.old_pools,
1117 inc.modified,
1118 &pending_creatings);
1119 dout(10) << __func__ << " " << queued << " pools queued" << dendl;
1120 for (auto deleted_pool : inc.old_pools) {
1121 auto removed = pending_creatings.remove_pool(deleted_pool);
1122 dout(10) << __func__ << " " << removed
1123 << " pg removed because containing pool deleted: "
1124 << deleted_pool << dendl;
1125 last_epoch_clean.remove_pool(deleted_pool);
1126 }
1127 // pgmon updates its creating_pgs in check_osd_map() which is called by
1128 // on_active() and check_osd_map() could be delayed if lease expires, so its
1129 // creating_pgs could be stale in comparison with the one of osdmon. let's
1130 // trim them here. otherwise, they will be added back after being erased.
1131 unsigned removed = 0;
1132 for (auto& pg : pending_created_pgs) {
1133 dout(20) << __func__ << " noting created pg " << pg << dendl;
1134 pending_creatings.created_pools.insert(pg.pool());
1135 removed += pending_creatings.pgs.erase(pg);
1136 }
1137 pending_created_pgs.clear();
1138 dout(10) << __func__ << " " << removed
1139 << " pgs removed because they're created" << dendl;
1140 pending_creatings.last_scan_epoch = osdmap.get_epoch();
1141 }
1142
94b18763
FG
1143 // filter out any pgs that shouldn't exist.
1144 {
1145 auto i = pending_creatings.pgs.begin();
1146 while (i != pending_creatings.pgs.end()) {
1147 if (!nextmap.pg_exists(i->first)) {
1148 dout(10) << __func__ << " removing pg " << i->first
1149 << " which should not exist" << dendl;
1150 i = pending_creatings.pgs.erase(i);
1151 } else {
1152 ++i;
1153 }
1154 }
1155 }
1156
31f18b77 1157 // process queue
11fdf7f2 1158 unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
31f18b77
FG
1159 const auto total = pending_creatings.pgs.size();
1160 while (pending_creatings.pgs.size() < max &&
1161 !pending_creatings.queue.empty()) {
1162 auto p = pending_creatings.queue.begin();
1163 int64_t poolid = p->first;
1164 dout(10) << __func__ << " pool " << poolid
1165 << " created " << p->second.created
1166 << " modified " << p->second.modified
1167 << " [" << p->second.start << "-" << p->second.end << ")"
1168 << dendl;
11fdf7f2
TL
1169 int64_t n = std::min<int64_t>(max - pending_creatings.pgs.size(),
1170 p->second.end - p->second.start);
31f18b77
FG
1171 ps_t first = p->second.start;
1172 ps_t end = first + n;
1173 for (ps_t ps = first; ps < end; ++ps) {
1174 const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
1175 // NOTE: use the *current* epoch as the PG creation epoch so that the
1176 // OSD does not have to generate a long set of PastIntervals.
1177 pending_creatings.pgs.emplace(pgid, make_pair(inc.epoch,
1178 p->second.modified));
1179 dout(10) << __func__ << " adding " << pgid << dendl;
1180 }
1181 p->second.start = end;
1182 if (p->second.done()) {
1183 dout(10) << __func__ << " done with queue for " << poolid << dendl;
1184 pending_creatings.queue.erase(p);
1185 } else {
1186 dout(10) << __func__ << " pool " << poolid
1187 << " now [" << p->second.start << "-" << p->second.end << ")"
1188 << dendl;
1189 }
1190 }
1191 dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
1192 << " pools" << dendl;
c07f9fc5
FG
1193 dout(10) << __func__
1194 << " " << (pending_creatings.pgs.size() - total)
1195 << "/" << pending_creatings.pgs.size()
31f18b77 1196 << " pgs added from queued pools" << dendl;
7c673cae
FG
1197 return pending_creatings;
1198}
1199
1200void OSDMonitor::maybe_prime_pg_temp()
1201{
1202 bool all = false;
1203 if (pending_inc.crush.length()) {
1204 dout(10) << __func__ << " new crush map, all" << dendl;
1205 all = true;
1206 }
1207
1208 if (!pending_inc.new_up_client.empty()) {
1209 dout(10) << __func__ << " new up osds, all" << dendl;
1210 all = true;
1211 }
1212
1213 // check for interesting OSDs
1214 set<int> osds;
31f18b77 1215 for (auto p = pending_inc.new_state.begin();
7c673cae
FG
1216 !all && p != pending_inc.new_state.end();
1217 ++p) {
1218 if ((p->second & CEPH_OSD_UP) &&
1219 osdmap.is_up(p->first)) {
1220 osds.insert(p->first);
1221 }
1222 }
1223 for (map<int32_t,uint32_t>::iterator p = pending_inc.new_weight.begin();
1224 !all && p != pending_inc.new_weight.end();
1225 ++p) {
1226 if (p->second < osdmap.get_weight(p->first)) {
1227 // weight reduction
1228 osds.insert(p->first);
1229 } else {
1230 dout(10) << __func__ << " osd." << p->first << " weight increase, all"
1231 << dendl;
1232 all = true;
1233 }
1234 }
1235
1236 if (!all && osds.empty())
1237 return;
1238
1239 if (!all) {
1240 unsigned estimate =
1241 mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
1242 if (estimate > mapping.get_num_pgs() *
11fdf7f2 1243 g_conf()->mon_osd_prime_pg_temp_max_estimate) {
7c673cae
FG
1244 dout(10) << __func__ << " estimate " << estimate << " pgs on "
1245 << osds.size() << " osds >= "
11fdf7f2 1246 << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total "
7c673cae
FG
1247 << mapping.get_num_pgs() << " pgs, all"
1248 << dendl;
1249 all = true;
1250 } else {
1251 dout(10) << __func__ << " estimate " << estimate << " pgs on "
1252 << osds.size() << " osds" << dendl;
1253 }
1254 }
1255
1256 OSDMap next;
1257 next.deepish_copy_from(osdmap);
1258 next.apply_incremental(pending_inc);
1259
224ce89b
WB
1260 if (next.get_pools().empty()) {
1261 dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
1262 } else if (all) {
7c673cae 1263 PrimeTempJob job(next, this);
494da23a 1264 mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk, {});
11fdf7f2 1265 if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) {
7c673cae
FG
1266 dout(10) << __func__ << " done in " << job.get_duration() << dendl;
1267 } else {
1268 dout(10) << __func__ << " did not finish in "
11fdf7f2 1269 << g_conf()->mon_osd_prime_pg_temp_max_time
7c673cae
FG
1270 << ", stopping" << dendl;
1271 job.abort();
1272 }
1273 } else {
1274 dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
1275 utime_t stop = ceph_clock_now();
11fdf7f2 1276 stop += g_conf()->mon_osd_prime_pg_temp_max_time;
7c673cae
FG
1277 const int chunk = 1000;
1278 int n = chunk;
1279 std::unordered_set<pg_t> did_pgs;
1280 for (auto osd : osds) {
1281 auto& pgs = mapping.get_osd_acting_pgs(osd);
1282 dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
1283 for (auto pgid : pgs) {
1284 if (!did_pgs.insert(pgid).second) {
1285 continue;
1286 }
1287 prime_pg_temp(next, pgid);
1288 if (--n <= 0) {
1289 n = chunk;
1290 if (ceph_clock_now() > stop) {
1291 dout(10) << __func__ << " consumed more than "
11fdf7f2 1292 << g_conf()->mon_osd_prime_pg_temp_max_time
7c673cae
FG
1293 << " seconds, stopping"
1294 << dendl;
1295 return;
1296 }
1297 }
1298 }
1299 }
1300 }
1301}
1302
1303void OSDMonitor::prime_pg_temp(
1304 const OSDMap& next,
1305 pg_t pgid)
1306{
11fdf7f2
TL
1307 // TODO: remove this creating_pgs direct access?
1308 if (creating_pgs.pgs.count(pgid)) {
1309 return;
7c673cae
FG
1310 }
1311 if (!osdmap.pg_exists(pgid)) {
1312 return;
1313 }
1314
1315 vector<int> up, acting;
1316 mapping.get(pgid, &up, nullptr, &acting, nullptr);
1317
1318 vector<int> next_up, next_acting;
1319 int next_up_primary, next_acting_primary;
1320 next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
1321 &next_acting, &next_acting_primary);
f64942e4
AA
1322 if (acting == next_acting &&
1323 !(up != acting && next_up == next_acting))
7c673cae
FG
1324 return; // no change since last epoch
1325
1326 if (acting.empty())
1327 return; // if previously empty now we can be no worse off
1328 const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
1329 if (pool && acting.size() < pool->min_size)
1330 return; // can be no worse off than before
1331
c07f9fc5
FG
1332 if (next_up == next_acting) {
1333 acting.clear();
11fdf7f2
TL
1334 dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
1335 << dendl;
c07f9fc5
FG
1336 }
1337
7c673cae
FG
1338 dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
1339 << " -> " << next_up << "/" << next_acting
1340 << ", priming " << acting
1341 << dendl;
1342 {
11fdf7f2 1343 std::lock_guard l(prime_pg_temp_lock);
7c673cae
FG
1344 // do not touch a mapping if a change is pending
1345 pending_inc.new_pg_temp.emplace(
1346 pgid,
1347 mempool::osdmap::vector<int>(acting.begin(), acting.end()));
1348 }
1349}
1350
1351/**
1352 * @note receiving a transaction in this function gives a fair amount of
1353 * freedom to the service implementation if it does need it. It shouldn't.
1354 */
1355void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
1356{
1357 dout(10) << "encode_pending e " << pending_inc.epoch
1358 << dendl;
1359
11fdf7f2
TL
1360 if (do_prune(t)) {
1361 dout(1) << __func__ << " osdmap full prune encoded e"
1362 << pending_inc.epoch << dendl;
1363 }
1364
7c673cae
FG
1365 // finalize up pending_inc
1366 pending_inc.modified = ceph_clock_now();
1367
11fdf7f2
TL
1368 int r = pending_inc.propagate_snaps_to_tiers(cct, osdmap);
1369 ceph_assert(r == 0);
7c673cae
FG
1370
1371 if (mapping_job) {
1372 if (!mapping_job->is_done()) {
1373 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1374 << mapping_job.get() << " did not complete, "
1375 << mapping_job->shards << " left" << dendl;
1376 mapping_job->abort();
1377 } else if (mapping.get_epoch() < osdmap.get_epoch()) {
1378 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1379 << mapping_job.get() << " is prior epoch "
1380 << mapping.get_epoch() << dendl;
1381 } else {
11fdf7f2 1382 if (g_conf()->mon_osd_prime_pg_temp) {
7c673cae
FG
1383 maybe_prime_pg_temp();
1384 }
1385 }
11fdf7f2 1386 } else if (g_conf()->mon_osd_prime_pg_temp) {
7c673cae
FG
1387 dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
1388 << dendl;
1389 }
1390 mapping_job.reset();
1391
c07f9fc5
FG
1392 // ensure we don't have blank new_state updates. these are interrpeted as
1393 // CEPH_OSD_UP (and almost certainly not what we want!).
1394 auto p = pending_inc.new_state.begin();
1395 while (p != pending_inc.new_state.end()) {
1396 if (p->second == 0) {
1397 dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
1398 p = pending_inc.new_state.erase(p);
1399 } else {
11fdf7f2
TL
1400 if (p->second & CEPH_OSD_UP) {
1401 pending_inc.new_last_up_change = pending_inc.modified;
1402 }
c07f9fc5
FG
1403 ++p;
1404 }
1405 }
11fdf7f2
TL
1406 if (!pending_inc.new_up_client.empty()) {
1407 pending_inc.new_last_up_change = pending_inc.modified;
1408 }
1409 for (auto& i : pending_inc.new_weight) {
1410 if (i.first > osdmap.max_osd) {
1411 if (i.second) {
1412 // new osd is already marked in
1413 pending_inc.new_last_in_change = pending_inc.modified;
1414 }
1415 } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
1416 // existing osd marked in or out
1417 pending_inc.new_last_in_change = pending_inc.modified;
1418 }
1419 }
7c673cae
FG
1420
1421 {
1422 OSDMap tmp;
1423 tmp.deepish_copy_from(osdmap);
1424 tmp.apply_incremental(pending_inc);
1425
11fdf7f2
TL
1426 // clean pg_temp mappings
1427 OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);
1428
1429 // clean inappropriate pg_upmap/pg_upmap_items (if any)
494da23a
TL
1430 {
1431 // check every upmapped pg for now
1432 // until we could reliably identify certain cases to ignore,
1433 // which is obviously the hard part TBD..
1434 vector<pg_t> pgs_to_check;
1435 tmp.get_upmap_pgs(&pgs_to_check);
1436 if (pgs_to_check.size() < g_conf()->mon_clean_pg_upmaps_per_chunk * 2) {
1437 // not enough pgs, do it inline
1438 tmp.clean_pg_upmaps(cct, &pending_inc);
1439 } else {
1440 CleanUpmapJob job(cct, tmp, pending_inc);
1441 mapper.queue(&job, g_conf()->mon_clean_pg_upmaps_per_chunk, pgs_to_check);
1442 job.wait();
1443 }
1444 }
11fdf7f2
TL
1445
1446 // update creating pgs first so that we can remove the created pgid and
1447 // process the pool flag removal below in the same osdmap epoch.
1448 auto pending_creatings = update_pending_pgs(pending_inc, tmp);
1449 bufferlist creatings_bl;
1450 encode(pending_creatings, creatings_bl);
1451 t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
1452
1453 // remove any old (or incompat) POOL_CREATING flags
1454 for (auto& i : tmp.get_pools()) {
1455 if (tmp.require_osd_release < CEPH_RELEASE_NAUTILUS) {
1456 // pre-nautilus OSDMaps shouldn't get this flag.
1457 if (pending_inc.new_pools.count(i.first)) {
1458 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1459 }
1460 }
1461 if (i.second.has_flag(pg_pool_t::FLAG_CREATING) &&
1462 !pending_creatings.still_creating_pool(i.first)) {
1463 dout(10) << __func__ << " done creating pool " << i.first
1464 << ", clearing CREATING flag" << dendl;
1465 if (pending_inc.new_pools.count(i.first) == 0) {
1466 pending_inc.new_pools[i.first] = i.second;
1467 }
1468 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
3efd9988 1469 }
11fdf7f2
TL
1470 }
1471
1472 // remove any legacy osdmap nearfull/full flags
1473 {
1474 if (tmp.test_flag(CEPH_OSDMAP_FULL | CEPH_OSDMAP_NEARFULL)) {
1475 dout(10) << __func__ << " clearing legacy osdmap nearfull/full flag"
1476 << dendl;
1477 remove_flag(CEPH_OSDMAP_NEARFULL);
1478 remove_flag(CEPH_OSDMAP_FULL);
1479 }
1480 }
1481 // collect which pools are currently affected by
1482 // the near/backfill/full osd(s),
1483 // and set per-pool near/backfill/full flag instead
1484 set<int64_t> full_pool_ids;
1485 set<int64_t> backfillfull_pool_ids;
1486 set<int64_t> nearfull_pool_ids;
1487 tmp.get_full_pools(cct,
1488 &full_pool_ids,
1489 &backfillfull_pool_ids,
3efd9988 1490 &nearfull_pool_ids);
11fdf7f2
TL
1491 if (full_pool_ids.empty() ||
1492 backfillfull_pool_ids.empty() ||
1493 nearfull_pool_ids.empty()) {
1494 // normal case - no nearfull, backfillfull or full osds
3efd9988
FG
1495 // try cancel any improper nearfull/backfillfull/full pool
1496 // flags first
11fdf7f2
TL
1497 for (auto &pool: tmp.get_pools()) {
1498 auto p = pool.first;
1499 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
1500 nearfull_pool_ids.empty()) {
1501 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1502 << "'s nearfull flag" << dendl;
1503 if (pending_inc.new_pools.count(p) == 0) {
1504 // load original pool info first!
1505 pending_inc.new_pools[p] = pool.second;
1506 }
1507 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1508 }
1509 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
1510 backfillfull_pool_ids.empty()) {
1511 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1512 << "'s backfillfull flag" << dendl;
1513 if (pending_inc.new_pools.count(p) == 0) {
1514 pending_inc.new_pools[p] = pool.second;
1515 }
1516 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1517 }
1518 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
1519 full_pool_ids.empty()) {
1520 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1521 // set by EQUOTA, skipping
1522 continue;
1523 }
1524 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1525 << "'s full flag" << dendl;
1526 if (pending_inc.new_pools.count(p) == 0) {
1527 pending_inc.new_pools[p] = pool.second;
1528 }
1529 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1530 }
3efd9988 1531 }
11fdf7f2
TL
1532 }
1533 if (!full_pool_ids.empty()) {
1534 dout(10) << __func__ << " marking pool(s) " << full_pool_ids
1535 << " as full" << dendl;
1536 for (auto &p: full_pool_ids) {
1537 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
1538 continue;
1539 }
1540 if (pending_inc.new_pools.count(p) == 0) {
1541 pending_inc.new_pools[p] = tmp.pools[p];
1542 }
1543 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
1544 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1545 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1546 }
1547 // cancel FLAG_FULL for pools which are no longer full too
1548 for (auto &pool: tmp.get_pools()) {
1549 auto p = pool.first;
1550 if (full_pool_ids.count(p)) {
1551 // skip pools we have just marked as full above
1552 continue;
1553 }
1554 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
1555 tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1556 // don't touch if currently is not full
1557 // or is running out of quota (and hence considered as full)
1558 continue;
1559 }
1560 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1561 << "'s full flag" << dendl;
1562 if (pending_inc.new_pools.count(p) == 0) {
1563 pending_inc.new_pools[p] = pool.second;
1564 }
1565 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
3efd9988 1566 }
11fdf7f2
TL
1567 }
1568 if (!backfillfull_pool_ids.empty()) {
1569 for (auto &p: backfillfull_pool_ids) {
1570 if (full_pool_ids.count(p)) {
1571 // skip pools we have already considered as full above
1572 continue;
1573 }
1574 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1575 // make sure FLAG_FULL is truly set, so we are safe not
1576 // to set a extra (redundant) FLAG_BACKFILLFULL flag
1577 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1578 continue;
1579 }
1580 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1581 // don't bother if pool is already marked as backfillfull
1582 continue;
1583 }
1584 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1585 << "'s as backfillfull" << dendl;
1586 if (pending_inc.new_pools.count(p) == 0) {
1587 pending_inc.new_pools[p] = tmp.pools[p];
1588 }
1589 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
1590 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1591 }
1592 // cancel FLAG_BACKFILLFULL for pools
1593 // which are no longer backfillfull too
1594 for (auto &pool: tmp.get_pools()) {
1595 auto p = pool.first;
1596 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1597 // skip pools we have just marked as backfillfull/full above
1598 continue;
1599 }
1600 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1601 // and don't touch if currently is not backfillfull
1602 continue;
1603 }
1604 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1605 << "'s backfillfull flag" << dendl;
1606 if (pending_inc.new_pools.count(p) == 0) {
1607 pending_inc.new_pools[p] = pool.second;
1608 }
1609 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
3efd9988 1610 }
11fdf7f2
TL
1611 }
1612 if (!nearfull_pool_ids.empty()) {
1613 for (auto &p: nearfull_pool_ids) {
1614 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1615 continue;
1616 }
1617 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1618 // make sure FLAG_FULL is truly set, so we are safe not
1619 // to set a extra (redundant) FLAG_NEARFULL flag
1620 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1621 continue;
1622 }
1623 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1624 // don't bother if pool is already marked as nearfull
1625 continue;
1626 }
1627 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1628 << "'s as nearfull" << dendl;
1629 if (pending_inc.new_pools.count(p) == 0) {
1630 pending_inc.new_pools[p] = tmp.pools[p];
1631 }
1632 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
1633 }
1634 // cancel FLAG_NEARFULL for pools
1635 // which are no longer nearfull too
1636 for (auto &pool: tmp.get_pools()) {
1637 auto p = pool.first;
1638 if (full_pool_ids.count(p) ||
1639 backfillfull_pool_ids.count(p) ||
1640 nearfull_pool_ids.count(p)) {
1641 // skip pools we have just marked as
1642 // nearfull/backfillfull/full above
1643 continue;
1644 }
1645 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1646 // and don't touch if currently is not nearfull
1647 continue;
1648 }
1649 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1650 << "'s nearfull flag" << dendl;
1651 if (pending_inc.new_pools.count(p) == 0) {
1652 pending_inc.new_pools[p] = pool.second;
1653 }
1654 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
7c673cae 1655 }
11fdf7f2 1656 }
7c673cae 1657
11fdf7f2
TL
1658 // min_compat_client?
1659 if (tmp.require_min_compat_client == 0) {
1660 auto mv = tmp.get_min_compat_client();
1661 dout(1) << __func__ << " setting require_min_compat_client to currently "
1662 << "required " << ceph_release_name(mv) << dendl;
1663 mon->clog->info() << "setting require_min_compat_client to currently "
1664 << "required " << ceph_release_name(mv);
1665 pending_inc.new_require_min_compat_client = mv;
1666 }
1667
1668 // upgrade to mimic?
1669 if (osdmap.require_osd_release < CEPH_RELEASE_MIMIC &&
1670 tmp.require_osd_release >= CEPH_RELEASE_MIMIC) {
1671 dout(10) << __func__ << " first mimic+ epoch" << dendl;
1672 // record this epoch as the deletion for all legacy removed_snaps
1673 for (auto& p : tmp.get_pools()) {
1674 // update every pool
1675 if (pending_inc.new_pools.count(p.first) == 0) {
1676 pending_inc.new_pools[p.first] = p.second;
1677 }
1678 auto& pi = pending_inc.new_pools[p.first];
1679 if (pi.snap_seq == 0) {
1680 // no snaps on this pool
1681 continue;
1682 }
1683 if ((pi.flags & (pg_pool_t::FLAG_SELFMANAGED_SNAPS |
1684 pg_pool_t::FLAG_POOL_SNAPS)) == 0) {
1685 if (!pi.removed_snaps.empty()) {
1686 pi.flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
1687 } else {
1688 pi.flags |= pg_pool_t::FLAG_POOL_SNAPS;
1689 }
1690 }
1691
1692 // Make all previously removed snaps appear to be removed in this
1693 // epoch. this populates removed_snaps_queue. The OSD will subtract
1694 // off its purged_snaps, as before, and this set will shrink over the
1695 // following epochs as the purged snaps are reported back through the
1696 // mgr.
1697 OSDMap::snap_interval_set_t removed;
1698 if (!p.second.removed_snaps.empty()) {
1699 // different flavor of interval_set :(
1700 for (auto q = p.second.removed_snaps.begin();
1701 q != p.second.removed_snaps.end();
1702 ++q) {
1703 removed.insert(q.get_start(), q.get_len());
1704 }
1705 } else {
1706 for (snapid_t s = 1; s <= pi.get_snap_seq(); s = s + 1) {
1707 if (pi.snaps.count(s) == 0) {
1708 removed.insert(s);
224ce89b
WB
1709 }
1710 }
11fdf7f2
TL
1711 }
1712 pending_inc.new_removed_snaps[p.first].union_of(removed);
1713
1714 dout(10) << __func__ << " converting pool " << p.first
1715 << " with " << p.second.removed_snaps.size()
1716 << " legacy removed_snaps" << dendl;
1717 string k = make_snap_epoch_key(p.first, pending_inc.epoch);
1718 bufferlist v;
1719 encode(p.second.removed_snaps, v);
1720 t->put(OSD_SNAP_PREFIX, k, v);
1721 for (auto q = p.second.removed_snaps.begin();
1722 q != p.second.removed_snaps.end();
1723 ++q) {
1724 bufferlist v;
1725 string k = make_snap_key_value(p.first, q.get_start(),
1726 q.get_len(), pending_inc.epoch, &v);
1727 t->put(OSD_SNAP_PREFIX, k, v);
1728 }
1729 }
1730 }
1731 if (osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS &&
1732 tmp.require_osd_release >= CEPH_RELEASE_NAUTILUS) {
1733 dout(10) << __func__ << " first nautilus+ epoch" << dendl;
1734 // add creating flags?
1735 for (auto& i : tmp.get_pools()) {
1736 if (pending_creatings.still_creating_pool(i.first)) {
1737 dout(10) << __func__ << " adding CREATING flag to pool " << i.first
1738 << dendl;
1739 if (pending_inc.new_pools.count(i.first) == 0) {
1740 pending_inc.new_pools[i.first] = i.second;
224ce89b 1741 }
11fdf7f2 1742 pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING;
224ce89b 1743 }
11fdf7f2
TL
1744 }
1745 // adjust blacklist items to all be TYPE_ANY
1746 for (auto& i : tmp.blacklist) {
1747 auto a = i.first;
1748 a.set_type(entity_addr_t::TYPE_ANY);
1749 pending_inc.new_blacklist[a] = i.second;
1750 pending_inc.old_blacklist.push_back(i.first);
224ce89b 1751 }
7c673cae
FG
1752 }
1753 }
1754
1755 // tell me about it
31f18b77 1756 for (auto i = pending_inc.new_state.begin();
7c673cae
FG
1757 i != pending_inc.new_state.end();
1758 ++i) {
1759 int s = i->second ? i->second : CEPH_OSD_UP;
1760 if (s & CEPH_OSD_UP)
1761 dout(2) << " osd." << i->first << " DOWN" << dendl;
1762 if (s & CEPH_OSD_EXISTS)
1763 dout(2) << " osd." << i->first << " DNE" << dendl;
1764 }
11fdf7f2 1765 for (auto i = pending_inc.new_up_client.begin();
7c673cae
FG
1766 i != pending_inc.new_up_client.end();
1767 ++i) {
1768 //FIXME: insert cluster addresses too
1769 dout(2) << " osd." << i->first << " UP " << i->second << dendl;
1770 }
1771 for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
1772 i != pending_inc.new_weight.end();
1773 ++i) {
1774 if (i->second == CEPH_OSD_OUT) {
1775 dout(2) << " osd." << i->first << " OUT" << dendl;
1776 } else if (i->second == CEPH_OSD_IN) {
1777 dout(2) << " osd." << i->first << " IN" << dendl;
1778 } else {
1779 dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
1780 }
1781 }
1782
1783 // features for osdmap and its incremental
28e407b8 1784 uint64_t features;
7c673cae
FG
1785
1786 // encode full map and determine its crc
1787 OSDMap tmp;
1788 {
1789 tmp.deepish_copy_from(osdmap);
1790 tmp.apply_incremental(pending_inc);
1791
1792 // determine appropriate features
28e407b8
AA
1793 features = tmp.get_encoding_features();
1794 dout(10) << __func__ << " encoding full map with "
1795 << ceph_release_name(tmp.require_osd_release)
1796 << " features " << features << dendl;
1797
1798 // the features should be a subset of the mon quorum's features!
11fdf7f2 1799 ceph_assert((features & ~mon->get_quorum_con_features()) == 0);
7c673cae
FG
1800
1801 bufferlist fullbl;
11fdf7f2 1802 encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
7c673cae
FG
1803 pending_inc.full_crc = tmp.get_crc();
1804
1805 // include full map in the txn. note that old monitors will
1806 // overwrite this. new ones will now skip the local full map
1807 // encode and reload from this.
1808 put_version_full(t, pending_inc.epoch, fullbl);
1809 }
1810
1811 // encode
11fdf7f2
TL
1812 ceph_assert(get_last_committed() + 1 == pending_inc.epoch);
1813 bufferlist bl;
1814 encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);
7c673cae
FG
1815
1816 dout(20) << " full_crc " << tmp.get_crc()
1817 << " inc_crc " << pending_inc.inc_crc << dendl;
1818
1819 /* put everything in the transaction */
1820 put_version(t, pending_inc.epoch, bl);
1821 put_last_committed(t, pending_inc.epoch);
1822
1823 // metadata, too!
1824 for (map<int,bufferlist>::iterator p = pending_metadata.begin();
1825 p != pending_metadata.end();
1826 ++p)
1827 t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
1828 for (set<int>::iterator p = pending_metadata_rm.begin();
1829 p != pending_metadata_rm.end();
1830 ++p)
1831 t->erase(OSD_METADATA_PREFIX, stringify(*p));
1832 pending_metadata.clear();
1833 pending_metadata_rm.clear();
1834
11fdf7f2
TL
1835 // removed_snaps
1836 if (tmp.require_osd_release >= CEPH_RELEASE_MIMIC) {
1837 for (auto& i : pending_inc.new_removed_snaps) {
1838 {
1839 // all snaps removed this epoch
1840 string k = make_snap_epoch_key(i.first, pending_inc.epoch);
1841 bufferlist v;
1842 encode(i.second, v);
1843 t->put(OSD_SNAP_PREFIX, k, v);
1844 }
1845 for (auto q = i.second.begin();
1846 q != i.second.end();
1847 ++q) {
1848 bufferlist v;
1849 string k = make_snap_key_value(i.first, q.get_start(),
1850 q.get_len(), pending_inc.epoch, &v);
1851 t->put(OSD_SNAP_PREFIX, k, v);
1852 }
1853 }
1854 for (auto& i : pending_inc.new_purged_snaps) {
1855 for (auto q = i.second.begin();
1856 q != i.second.end();
1857 ++q) {
1858 bufferlist v;
1859 string k = make_snap_purged_key_value(i.first, q.get_start(),
1860 q.get_len(), pending_inc.epoch,
1861 &v);
1862 t->put(OSD_SNAP_PREFIX, k, v);
1863 }
7c673cae 1864 }
7c673cae 1865 }
224ce89b
WB
1866
1867 // health
1868 health_check_map_t next;
1869 tmp.check_health(&next);
1870 encode_health(next, t);
7c673cae
FG
1871}
1872
7c673cae
FG
1873int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
1874{
1875 bufferlist bl;
1876 int r = mon->store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
1877 if (r < 0)
1878 return r;
1879 try {
11fdf7f2
TL
1880 auto p = bl.cbegin();
1881 decode(m, p);
7c673cae
FG
1882 }
1883 catch (buffer::error& e) {
1884 if (err)
1885 *err << "osd." << osd << " metadata is corrupt";
1886 return -EIO;
1887 }
1888 return 0;
1889}
1890
c07f9fc5 1891void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
31f18b77 1892{
31f18b77
FG
1893 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
1894 if (osdmap.is_up(osd)) {
1895 map<string,string> meta;
1896 load_metadata(osd, meta, nullptr);
1897 auto p = meta.find(field);
1898 if (p == meta.end()) {
c07f9fc5 1899 (*out)["unknown"]++;
31f18b77 1900 } else {
c07f9fc5 1901 (*out)[p->second]++;
31f18b77
FG
1902 }
1903 }
1904 }
c07f9fc5
FG
1905}
1906
1907void OSDMonitor::count_metadata(const string& field, Formatter *f)
1908{
1909 map<string,int> by_val;
1910 count_metadata(field, &by_val);
31f18b77
FG
1911 f->open_object_section(field.c_str());
1912 for (auto& p : by_val) {
1913 f->dump_int(p.first.c_str(), p.second);
1914 }
1915 f->close_section();
1916}
1917
7c673cae
FG
1918int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
1919{
1920 map<string, string> metadata;
1921 int r = load_metadata(osd, metadata, nullptr);
1922 if (r < 0)
1923 return r;
1924
1925 auto it = metadata.find("osd_objectstore");
1926 if (it == metadata.end())
1927 return -ENOENT;
1928 *type = it->second;
1929 return 0;
1930}
1931
1932bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
1933 const pg_pool_t &pool,
1934 ostream *err)
1935{
1936 // just check a few pgs for efficiency - this can't give a guarantee anyway,
1937 // since filestore osds could always join the pool later
1938 set<int> checked_osds;
11fdf7f2 1939 for (unsigned ps = 0; ps < std::min(8u, pool.get_pg_num()); ++ps) {
7c673cae 1940 vector<int> up, acting;
11fdf7f2 1941 pg_t pgid(ps, pool_id);
7c673cae
FG
1942 osdmap.pg_to_up_acting_osds(pgid, up, acting);
1943 for (int osd : up) {
1944 if (checked_osds.find(osd) != checked_osds.end())
1945 continue;
1946 string objectstore_type;
1947 int r = get_osd_objectstore_type(osd, &objectstore_type);
1948 // allow with missing metadata, e.g. due to an osd never booting yet
1949 if (r < 0 || objectstore_type == "bluestore") {
1950 checked_osds.insert(osd);
1951 continue;
1952 }
1953 *err << "osd." << osd << " uses " << objectstore_type;
1954 return false;
1955 }
1956 }
1957 return true;
1958}
1959
1960int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
1961{
1962 map<string,string> m;
1963 if (int r = load_metadata(osd, m, err))
1964 return r;
1965 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
1966 f->dump_string(p->first.c_str(), p->second);
1967 return 0;
1968}
1969
1970void OSDMonitor::print_nodes(Formatter *f)
1971{
1972 // group OSDs by their hosts
1973 map<string, list<int> > osds; // hostname => osd
1974 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
1975 map<string, string> m;
1976 if (load_metadata(osd, m, NULL)) {
1977 continue;
1978 }
1979 map<string, string>::iterator hostname = m.find("hostname");
1980 if (hostname == m.end()) {
1981 // not likely though
1982 continue;
1983 }
1984 osds[hostname->second].push_back(osd);
1985 }
1986
1987 dump_services(f, osds, "osd");
1988}
1989
// After a commit, proactively push the newest incremental map to one
// randomly chosen up osd (it will spread the map to its peers).
// No-op when there are no up osds or no osd session is registered.
void OSDMonitor::share_map_with_random_osd()
{
  if (osdmap.get_num_up_osds() == 0) {
    dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
    return;
  }

  MonSession *s = mon->session_map.get_random_osd_session(&osdmap);
  if (!s) {
    dout(10) << __func__ << " no up osd on our session map" << dendl;
    return;
  }

  dout(10) << "committed, telling random " << s->name
	   << " all about it" << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = s->con_features ? s->con_features :
    mon->get_quorum_con_features();
  // whatev, they'll request more if they need it
  MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
  s->con->send_message(m);
  // NOTE: do *not* record osd has up to this epoch (as we do
  // elsewhere) as they may still need to request older values.
}
2016
// Return the highest osdmap version the paxos service may trim up to,
// or 0 if trimming is not currently allowed (no quorum, pending pg
// creations, or the debug trim-block option is set).
version_t OSDMonitor::get_trim_to() const
{
  if (mon->get_quorum().empty()) {
    dout(10) << __func__ << ": quorum not formed" << dendl;
    return 0;
  }

  {
    // pending pg creations may still reference older maps; do not trim
    // anything while any are outstanding
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    if (!creating_pgs.pgs.empty()) {
      return 0;
    }
  }

  if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) {
    dout(0) << __func__
            << " blocking osdmap trim"
               " ('mon_debug_block_osdmap_trim' set to 'true')"
            << dendl;
    return 0;
  }

  {
    // never trim past the oldest epoch any osd still reports clean
    epoch_t floor = get_min_last_epoch_clean();
    dout(10) << " min_last_epoch_clean " << floor << dendl;
    if (g_conf()->mon_osd_force_trim_to > 0 &&
        g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) {
      // explicit debug/repair override of the trim floor
      floor = g_conf()->mon_osd_force_trim_to;
      dout(10) << " explicit mon_osd_force_trim_to = " << floor << dendl;
    }
    // always keep at least mon_min_osdmap_epochs maps in the store
    unsigned min = g_conf()->mon_min_osdmap_epochs;
    if (floor + min > get_last_committed()) {
      if (min < get_last_committed())
        floor = get_last_committed() - min;
      else
        floor = 0;
    }
    // only report a trim target if it actually advances past what is
    // already the first committed version
    if (floor > get_first_committed())
      return floor;
  }
  return 0;
}
2059
2060epoch_t OSDMonitor::get_min_last_epoch_clean() const
2061{
2062 auto floor = last_epoch_clean.get_lower_bound(osdmap);
2063 // also scan osd epochs
2064 // don't trim past the oldest reported osd epoch
2065 for (auto& osd_epoch : osd_epochs) {
2066 if (osd_epoch.second < floor) {
2067 floor = osd_epoch.second;
2068 }
2069 }
2070 return floor;
2071}
2072
2073void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
2074 version_t first)
2075{
2076 dout(10) << __func__ << " including full map for e " << first << dendl;
2077 bufferlist bl;
2078 get_version_full(first, bl);
2079 put_version_full(tx, first, bl);
11fdf7f2
TL
2080
2081 if (has_osdmap_manifest &&
2082 first > osdmap_manifest.get_first_pinned()) {
2083 _prune_update_trimmed(tx, first);
2084 }
7c673cae
FG
2085}
2086
11fdf7f2
TL
2087
2088/* full osdmap prune
2089 *
2090 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2091 */
2092
// Re-sync the in-memory osdmap manifest with the store: if the store
// has no manifest, drop any stale cached copy; otherwise decode the
// stored manifest and cache it.  Aborts if the manifest key exists but
// cannot be read, since prune state would then be unrecoverable.
void OSDMonitor::load_osdmap_manifest()
{
  bool store_has_manifest =
    mon->store->exists(get_service_name(), "osdmap_manifest");

  if (!store_has_manifest) {
    if (!has_osdmap_manifest) {
      return;
    }

    dout(20) << __func__
	     << " dropping osdmap manifest from memory." << dendl;
    osdmap_manifest = osdmap_manifest_t();
    has_osdmap_manifest = false;
    return;
  }

  dout(20) << __func__
	   << " osdmap manifest detected in store; reload." << dendl;

  bufferlist manifest_bl;
  int r = get_value("osdmap_manifest", manifest_bl);
  if (r < 0) {
    // the key exists (checked above) yet the read failed: the store is
    // inconsistent and continuing could corrupt prune bookkeeping.
    derr << __func__ << " unable to read osdmap version manifest" << dendl;
    ceph_abort_msg("error reading manifest");
  }
  osdmap_manifest.decode(manifest_bl);
  has_osdmap_manifest = true;

  dout(10) << __func__ << " store osdmap manifest pinned ("
	   << osdmap_manifest.get_first_pinned()
	   << " .. "
	   << osdmap_manifest.get_last_pinned()
	   << ")"
	   << dendl;
}
2129
// Decide whether a full-osdmap prune pass should run now, based on how
// many epochs are committed, the configured prune minimum/interval,
// and how far the manifest has already pinned.
bool OSDMonitor::should_prune() const
{
  version_t first = get_first_committed();
  version_t last = get_last_committed();
  version_t min_osdmap_epochs =
    g_conf().get_val<int64_t>("mon_min_osdmap_epochs");
  version_t prune_min =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
  version_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  version_t last_pinned = osdmap_manifest.get_last_pinned();
  version_t last_to_pin = last - min_osdmap_epochs;

  // Make it or break it constraints.
  //
  // If any of these conditions fails, we will not prune, regardless of
  // whether we have an on-disk manifest with an on-going pruning state.
  //
  if ((last - first) <= min_osdmap_epochs) {
    // between the first and last committed epochs, we don't have
    // enough epochs to trim, much less to prune.
    dout(10) << __func__
	     << " currently holding only " << (last - first)
	     << " epochs (min osdmap epochs: " << min_osdmap_epochs
	     << "); do not prune."
	     << dendl;
    return false;

  } else if ((last_to_pin - first) < prune_min) {
    // between the first committed epoch and the last epoch we would prune,
    // we simply don't have enough versions over the minimum to prune maps.
    dout(10) << __func__
	     << " could only prune " << (last_to_pin - first)
	     << " epochs (" << first << ".." << last_to_pin << "), which"
	        " is less than the required minimum (" << prune_min << ")"
	     << dendl;
    return false;

  } else if (has_osdmap_manifest && last_pinned >= last_to_pin) {
    // an earlier prune already pinned everything eligible
    dout(10) << __func__
	     << " we have pruned as far as we can; do not prune."
	     << dendl;
    return false;

  } else if (last_pinned + prune_interval > last_to_pin) {
    // not even one full prune interval fits before last_to_pin
    dout(10) << __func__
	     << " not enough epochs to form an interval (last pinned: "
	     << last_pinned << ", last to pin: "
	     << last_to_pin << ", interval: " << prune_interval << ")"
	     << dendl;
    return false;
  }

  dout(15) << __func__
	   << " should prune (" << last_pinned << ".." << last_to_pin << ")"
	   << " lc (" << first << ".." << last << ")"
	   << dendl;
  return true;
}
2189
2190void OSDMonitor::_prune_update_trimmed(
2191 MonitorDBStore::TransactionRef tx,
2192 version_t first)
2193{
2194 dout(10) << __func__
2195 << " first " << first
2196 << " last_pinned " << osdmap_manifest.get_last_pinned()
2197 << " last_pinned " << osdmap_manifest.get_last_pinned()
2198 << dendl;
2199
2200 osdmap_manifest_t manifest = osdmap_manifest;
2201
2202 if (!manifest.is_pinned(first)) {
2203 manifest.pin(first);
2204 }
2205
2206 set<version_t>::iterator p_end = manifest.pinned.find(first);
2207 set<version_t>::iterator p = manifest.pinned.begin();
2208 manifest.pinned.erase(p, p_end);
2209 ceph_assert(manifest.get_first_pinned() == first);
2210
2211 if (manifest.get_last_pinned() == first+1 ||
2212 manifest.pinned.size() == 1) {
2213 // we reached the end of the line, as pinned maps go; clean up our
2214 // manifest, and let `should_prune()` decide whether we should prune
2215 // again.
2216 tx->erase(get_service_name(), "osdmap_manifest");
2217 return;
2218 }
2219
2220 bufferlist bl;
2221 manifest.encode(bl);
2222 tx->put(get_service_name(), "osdmap_manifest", bl);
2223}
2224
// Seed @p manifest with the first version to pin for this prune pass:
// the first committed version when no prune state exists, or the last
// pinned version of the on-going prune otherwise.  Asserts that the
// in-memory state, the store, and the committed range are consistent.
void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
{
  dout(1) << __func__ << dendl;

  version_t pin_first;

  // verify constrainsts on stable in-memory state
  if (!has_osdmap_manifest) {
    // we must have never pruned, OR if we pruned the state must no longer
    // be relevant (i.e., the state must have been removed alongside with
    // the trim that *must* have removed past the last pinned map in a
    // previous prune).
    ceph_assert(osdmap_manifest.pinned.empty());
    ceph_assert(!mon->store->exists(get_service_name(), "osdmap_manifest"));
    pin_first = get_first_committed();

  } else {
    // we must have pruned in the past AND its state is still relevant
    // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
    // and thus we still hold a manifest in the store).
    ceph_assert(!osdmap_manifest.pinned.empty());
    ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
    ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());

    dout(10) << __func__
	     << " first_pinned " << osdmap_manifest.get_first_pinned()
	     << " last_pinned " << osdmap_manifest.get_last_pinned()
	     << dendl;

    pin_first = osdmap_manifest.get_last_pinned();
  }

  manifest.pin(pin_first);
}
2259
2260bool OSDMonitor::_prune_sanitize_options() const
2261{
2262 uint64_t prune_interval =
2263 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2264 uint64_t prune_min =
2265 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
2266 uint64_t txsize =
2267 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2268
2269 bool r = true;
2270
2271 if (prune_interval == 0) {
2272 derr << __func__
2273 << " prune is enabled BUT prune interval is zero; abort."
2274 << dendl;
2275 r = false;
2276 } else if (prune_interval == 1) {
2277 derr << __func__
2278 << " prune interval is equal to one, which essentially means"
2279 " no pruning; abort."
2280 << dendl;
2281 r = false;
2282 }
2283 if (prune_min == 0) {
2284 derr << __func__
2285 << " prune is enabled BUT prune min is zero; abort."
2286 << dendl;
2287 r = false;
2288 }
2289 if (prune_interval > prune_min) {
2290 derr << __func__
2291 << " impossible to ascertain proper prune interval because"
2292 << " it is greater than the minimum prune epochs"
2293 << " (min: " << prune_min << ", interval: " << prune_interval << ")"
2294 << dendl;
2295 r = false;
2296 }
2297
2298 if (txsize < prune_interval - 1) {
2299 derr << __func__
2300 << "'mon_osdmap_full_prune_txsize' (" << txsize
2301 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
2302 << "); abort." << dendl;
2303 r = false;
2304 }
2305 return r;
2306}
2307
// True when full-osdmap pruning is enabled via configuration.
bool OSDMonitor::is_prune_enabled() const {
  return g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
}
2311
// True when the required mon feature set includes OSDMAP_PRUNE, i.e.
// all monitors understand a full-map history with pruned holes.
bool OSDMonitor::is_prune_supported() const {
  return mon->get_required_mon_features().contains_any(
      ceph::features::mon::FEATURE_OSDMAP_PRUNE);
}
2316
2317/** do_prune
2318 *
2319 * @returns true if has side-effects; false otherwise.
2320 */
2321bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
2322{
2323 bool enabled = is_prune_enabled();
2324
2325 dout(1) << __func__ << " osdmap full prune "
2326 << ( enabled ? "enabled" : "disabled")
2327 << dendl;
2328
2329 if (!enabled || !_prune_sanitize_options() || !should_prune()) {
2330 return false;
2331 }
2332
2333 // we are beyond the minimum prune versions, we need to remove maps because
2334 // otherwise the store will grow unbounded and we may end up having issues
2335 // with available disk space or store hangs.
2336
2337 // we will not pin all versions. We will leave a buffer number of versions.
2338 // this allows us the monitor to trim maps without caring too much about
2339 // pinned maps, and then allow us to use another ceph-mon without these
2340 // capabilities, without having to repair the store.
2341
2342 osdmap_manifest_t manifest = osdmap_manifest;
2343
2344 version_t first = get_first_committed();
2345 version_t last = get_last_committed();
2346
2347 version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
2348 version_t last_pinned = manifest.get_last_pinned();
2349 uint64_t prune_interval =
2350 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2351 uint64_t txsize =
2352 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2353
2354 prune_init(manifest);
2355
2356 // we need to get rid of some osdmaps
2357
2358 dout(5) << __func__
2359 << " lc (" << first << " .. " << last << ")"
2360 << " last_pinned " << last_pinned
2361 << " interval " << prune_interval
2362 << " last_to_pin " << last_to_pin
2363 << dendl;
2364
2365 // We will be erasing maps as we go.
2366 //
2367 // We will erase all maps between `last_pinned` and the `next_to_pin`.
2368 //
2369 // If `next_to_pin` happens to be greater than `last_to_pin`, then
2370 // we stop pruning. We could prune the maps between `next_to_pin` and
2371 // `last_to_pin`, but by not doing it we end up with neater pruned
2372 // intervals, aligned with `prune_interval`. Besides, this should not be a
2373 // problem as long as `prune_interval` is set to a sane value, instead of
2374 // hundreds or thousands of maps.
2375
2376 auto map_exists = [this](version_t v) {
2377 string k = mon->store->combine_strings("full", v);
2378 return mon->store->exists(get_service_name(), k);
2379 };
2380
2381 // 'interval' represents the number of maps from the last pinned
2382 // i.e., if we pinned version 1 and have an interval of 10, we're pinning
2383 // version 11 next; all intermediate versions will be removed.
2384 //
2385 // 'txsize' represents the maximum number of versions we'll be removing in
2386 // this iteration. If 'txsize' is large enough to perform multiple passes
2387 // pinning and removing maps, we will do so; if not, we'll do at least one
2388 // pass. We are quite relaxed about honouring 'txsize', but we'll always
2389 // ensure that we never go *over* the maximum.
2390
2391 // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
2392 uint64_t removal_interval = prune_interval - 1;
2393
2394 if (txsize < removal_interval) {
2395 dout(5) << __func__
2396 << " setting txsize to removal interval size ("
2397 << removal_interval << " versions"
2398 << dendl;
2399 txsize = removal_interval;
2400 }
2401 ceph_assert(removal_interval > 0);
2402
2403 uint64_t num_pruned = 0;
2404 while (num_pruned + removal_interval <= txsize) {
2405 last_pinned = manifest.get_last_pinned();
2406
2407 if (last_pinned + prune_interval > last_to_pin) {
2408 break;
2409 }
2410 ceph_assert(last_pinned < last_to_pin);
2411
2412 version_t next_pinned = last_pinned + prune_interval;
2413 ceph_assert(next_pinned <= last_to_pin);
2414 manifest.pin(next_pinned);
2415
2416 dout(20) << __func__
2417 << " last_pinned " << last_pinned
2418 << " next_pinned " << next_pinned
2419 << " num_pruned " << num_pruned
2420 << " removal interval (" << (last_pinned+1)
2421 << ".." << (next_pinned-1) << ")"
2422 << " txsize " << txsize << dendl;
2423
2424 ceph_assert(map_exists(last_pinned));
2425 ceph_assert(map_exists(next_pinned));
2426
2427 for (version_t v = last_pinned+1; v < next_pinned; ++v) {
2428 ceph_assert(!manifest.is_pinned(v));
2429
2430 dout(20) << __func__ << " pruning full osdmap e" << v << dendl;
2431 string full_key = mon->store->combine_strings("full", v);
2432 tx->erase(get_service_name(), full_key);
2433 ++num_pruned;
2434 }
2435 }
2436
2437 ceph_assert(num_pruned > 0);
2438
2439 bufferlist bl;
2440 manifest.encode(bl);
2441 tx->put(get_service_name(), "osdmap_manifest", bl);
2442
2443 return true;
2444}
2445
2446
7c673cae
FG
2447// -------------
2448
// Dispatch an incoming message for read-only handling.  Returns true
// when the request was fully handled here; false when it needs a map
// update and must go through prepare_update().
bool OSDMonitor::preprocess_query(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // READs
  case MSG_MON_COMMAND:
    try {
      return preprocess_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: answer EINVAL instead of crashing
      bufferlist bl;
      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }
  case CEPH_MSG_MON_GET_OSDMAP:
    return preprocess_get_osdmap(op);

    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return preprocess_mark_me_down(op);
  case MSG_OSD_FULL:
    return preprocess_full(op);
  case MSG_OSD_FAILURE:
    return preprocess_failure(op);
  case MSG_OSD_BOOT:
    return preprocess_boot(op);
  case MSG_OSD_ALIVE:
    return preprocess_alive(op);
  case MSG_OSD_PG_CREATED:
    return preprocess_pg_created(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return preprocess_pg_ready_to_merge(op);
  case MSG_OSD_PGTEMP:
    return preprocess_pgtemp(op);
  case MSG_OSD_BEACON:
    return preprocess_beacon(op);

  case CEPH_MSG_POOLOP:
    return preprocess_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return preprocess_remove_snaps(op);

  default:
    // every type routed to this service must be handled above
    ceph_abort();
    return true;
  }
}
2499
// Dispatch a message that needs to modify the pending osdmap.  Returns
// true when a proposal should follow; the preprocess_* stage has
// already filtered requests that could be answered read-only.
bool OSDMonitor::prepare_update(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return prepare_mark_me_down(op);
  case MSG_OSD_FULL:
    return prepare_full(op);
  case MSG_OSD_FAILURE:
    return prepare_failure(op);
  case MSG_OSD_BOOT:
    return prepare_boot(op);
  case MSG_OSD_ALIVE:
    return prepare_alive(op);
  case MSG_OSD_PG_CREATED:
    return prepare_pg_created(op);
  case MSG_OSD_PGTEMP:
    return prepare_pgtemp(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return prepare_pg_ready_to_merge(op);
  case MSG_OSD_BEACON:
    return prepare_beacon(op);

  case MSG_MON_COMMAND:
    try {
      return prepare_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: answer EINVAL instead of crashing
      bufferlist bl;
      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }

  case CEPH_MSG_POOLOP:
    return prepare_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return prepare_remove_snaps(op);


  default:
    // every type routed to this service must be handled above
    ceph_abort();
  }

  return false;
}
2549
// Decide whether the pending map changes should be proposed now.
// May adjust @p delay (set to 0 when queued weight changes are folded
// into the pending incremental).
bool OSDMonitor::should_propose(double& delay)
{
  dout(10) << "should_propose" << dendl;

  // if full map, propose immediately! any subsequent changes will be clobbered.
  if (pending_inc.fullmap.length())
    return true;

  // adjust osd weights?
  if (!osd_weight.empty() &&
      osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
    dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
    osdmap.adjust_osd_weights(osd_weight, pending_inc);
    delay = 0.0;
    osd_weight.clear();
    return true;
  }

  // otherwise defer to the generic paxos-service pacing logic
  return PaxosService::should_propose(delay);
}
2570
2571
2572
2573// ---------------------------
2574// READs
2575
// Answer a MMonGetOSDMap request: reply with the requested ranges of
// full and incremental maps, capped both by message count
// (osd_map_message_max) and by total encoded size
// (osd_map_message_max_bytes).  Maps are encoded with the requester's
// connection features when known.
bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MMonGetOSDMap *m = static_cast<MMonGetOSDMap*>(op->get_req());

  // encode for the peer's features; fall back to the quorum features
  // for anonymous connections
  uint64_t features = mon->get_quorum_con_features();
  if (op->get_session() && op->get_session()->con_features)
    features = op->get_session()->con_features;

  dout(10) << __func__ << " " << *m << dendl;
  MOSDMap *reply = new MOSDMap(mon->monmap->fsid, features);
  epoch_t first = get_first_committed();
  epoch_t last = osdmap.get_epoch();
  int max = g_conf()->osd_map_message_max;
  ssize_t max_bytes = g_conf()->osd_map_message_max_bytes;
  // full maps: clamp the requested range to what we actually hold
  for (epoch_t e = std::max(first, m->get_full_first());
       e <= std::min(last, m->get_full_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->maps[e];
    int r = get_version_full(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  // incrementals: same clamping, sharing the remaining budget
  for (epoch_t e = std::max(first, m->get_inc_first());
       e <= std::min(last, m->get_inc_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->incremental_maps[e];
    int r = get_version(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  reply->oldest_map = first;
  reply->newest_map = last;
  mon->send_reply(op, reply);
  return true;
}
2612
2613
2614// ---------------------------
2615// UPDATEs
2616
2617// failure --
2618
11fdf7f2 2619bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
7c673cae 2620 // check permissions
11fdf7f2 2621 MonSession *session = op->get_session();
7c673cae
FG
2622 if (!session)
2623 return true;
2624 if (!session->is_capable("osd", MON_CAP_X)) {
2625 dout(0) << "got MOSDFailure from entity with insufficient caps "
2626 << session->caps << dendl;
2627 return true;
2628 }
2629 if (fsid != mon->monmap->fsid) {
2630 dout(0) << "check_source: on fsid " << fsid
2631 << " != " << mon->monmap->fsid << dendl;
2632 return true;
2633 }
2634 return false;
2635}
2636
2637
// Filter an incoming MOSDFailure report.  Returns true when the report
// can be discarded here (bad source, stale, duplicate, or the target
// cannot be marked down); false when it is new and must be processed
// by prepare_failure().  Stale reporters are refreshed with an
// incremental map where appropriate.
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
  // who is target_osd
  int badboy = m->get_target_osd();

  // check permissions
  if (check_source(op, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
	!osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
	(osdmap.is_down(from) && m->if_osd_failed())) {
      // reporter is unknown, readdressed, or itself down: ignore, but
      // send it newer maps so it can catch up
      dout(5) << "preprocess_failure from dead osd." << from
	      << ", ignoring" << dendl;
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?
  if (osdmap.is_down(badboy)) {
    dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  // report names a different instance than the map knows for this id
  if (osdmap.get_addrs(badboy) != m->get_target_addrs()) {
    dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << " != map's " << osdmap.get_addrs(badboy)
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // nodown flag or up-ratio floor forbids marking this osd down
  if (!can_mark_down(badboy)) {
    dout(5) << "preprocess_failure ignoring report of osd."
	    << m->get_target_osd() << " " << m->get_target_addrs()
	    << " from " << m->get_orig_source() << dendl;
    goto didit;
  }

  dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
	   << " " << m->get_target_addrs()
	   << ", from " << m->get_orig_source() << dendl;
  return false;

 didit:
  mon->no_reply(op);
  return true;
}
2709
// Completion used to acknowledge a MOSDMarkMeDown request once the
// corresponding proposal finishes (or immediately, on the preprocess
// reject path).
class C_AckMarkedDown : public C_MonOp {
  OSDMonitor *osdmon;
public:
  C_AckMarkedDown(
    OSDMonitor *osdmon,
    MonOpRequestRef op)
    : C_MonOp(op), osdmon(osdmon) {}

  void _finish(int r) override {
    if (r == 0) {
      // success: echo a MOSDMarkMeDown back to the requester as the ack
      MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
      osdmon->mon->send_reply(
	op,
	new MOSDMarkMeDown(
	  m->fsid,
	  m->target_osd,
	  m->target_addrs,
	  m->get_epoch(),
	  false)); // ACK itself does not request an ack
    } else if (r == -EAGAIN) {
      // proposal was preempted; re-dispatch the original op
      osdmon->dispatch(op);
    } else {
      ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r);
    }
  }
  ~C_AckMarkedDown() override {
  }
};
2738
// Filter a MOSDMarkMeDown request.  Returns true when the request can
// be discarded here (bad source, stale instance, or 'nodown' in
// effect) — in which case a requested ack is still sent immediately —
// and false when it must be applied by prepare_mark_me_down().
bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid))
    goto reply;

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd())
    goto reply;

  // the request must come from the live instance the map knows about
  if (!osdmap.exists(from) ||
      osdmap.is_down(from) ||
      osdmap.get_addrs(from) != m->target_addrs) {
    dout(5) << "preprocess_mark_me_down from dead osd."
	    << from << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    goto reply;
  }

  // no down might be set
  if (!can_mark_down(from))
    goto reply;

  dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
	   << " " << m->target_addrs << dendl;
  return false;

 reply:
  // request was rejected; ack right away if the sender asked for one
  if (m->request_ack) {
    Context *c(new C_AckMarkedDown(this, op));
    c->complete(0);
  }
  return true;
}
2777
// Apply a validated MOSDMarkMeDown: queue the state change in the
// pending incremental and, if requested, ack once the proposal is
// committed.  Always returns true (a proposal is needed).
bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
  int target_osd = m->target_osd;

  // preprocess_mark_me_down() already verified both of these
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);

  mon->clog->info() << "osd." << target_osd << " marked itself down";
  // new_state bits are XORed into the osd state, so setting CEPH_OSD_UP
  // here clears the up bit, i.e. marks the osd down
  pending_inc.new_state[target_osd] = CEPH_OSD_UP;
  if (m->request_ack)
    wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
  return true;
}
2793
2794bool OSDMonitor::can_mark_down(int i)
2795{
31f18b77
FG
2796 if (osdmap.is_nodown(i)) {
2797 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
2798 << "will not mark it down" << dendl;
7c673cae
FG
2799 return false;
2800 }
31f18b77 2801
7c673cae
FG
2802 int num_osds = osdmap.get_num_osds();
2803 if (num_osds == 0) {
31f18b77 2804 dout(5) << __func__ << " no osds" << dendl;
7c673cae
FG
2805 return false;
2806 }
2807 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
2808 float up_ratio = (float)up / (float)num_osds;
11fdf7f2 2809 if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
31f18b77 2810 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
11fdf7f2 2811 << g_conf()->mon_osd_min_up_ratio
7c673cae
FG
2812 << ", will not mark osd." << i << " down" << dendl;
2813 return false;
2814 }
2815 return true;
2816}
2817
// Whether osd @p i may be marked up; false only when its 'noup' flag
// is set.
bool OSDMonitor::can_mark_up(int i)
{
  if (osdmap.is_noup(i)) {
    dout(5) << __func__ << " osd." << i << " is marked as noup, "
	    << "will not mark it up" << dendl;
    return false;
  }

  return true;
}
2828
/**
 * Whether an osd may be marked out: false when the osd's 'noout' flag
 * is set, when there are no osds, or when doing so would push the
 * cluster's in ratio below mon_osd_min_in_ratio.
 *
 * @note the parameter @p i apparently only exists here so we can output the
 * osd's id on messages.  Pass a negative value when the check is not
 * about one specific osd.
 */
bool OSDMonitor::can_mark_out(int i)
{
  if (osdmap.is_noout(i)) {
    dout(5) << __func__ << " osd." << i << " is marked as noout, "
	    << "will not mark it out" << dendl;
    return false;
  }

  int num_osds = osdmap.get_num_osds();
  if (num_osds == 0) {
    dout(5) << __func__ << " no osds" << dendl;
    return false;
  }
  // account for pending out-marks so a burst of changes cannot race
  // past the ratio floor within a single epoch
  int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
  float in_ratio = (float)in / (float)num_osds;
  if (in_ratio < g_conf()->mon_osd_min_in_ratio) {
    if (i >= 0)
      dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
	      << g_conf()->mon_osd_min_in_ratio
	      << ", will not mark osd." << i << " out" << dendl;
    else
      dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
	      << g_conf()->mon_osd_min_in_ratio
	      << ", will not mark osds out" << dendl;
    return false;
  }

  return true;
}
2862
2863bool OSDMonitor::can_mark_in(int i)
2864{
31f18b77
FG
2865 if (osdmap.is_noin(i)) {
2866 dout(5) << __func__ << " osd." << i << " is marked as noin, "
2867 << "will not mark it in" << dendl;
7c673cae
FG
2868 return false;
2869 }
31f18b77 2870
7c673cae
FG
2871 return true;
2872}
2873
2874bool OSDMonitor::check_failures(utime_t now)
2875{
2876 bool found_failure = false;
2877 for (map<int,failure_info_t>::iterator p = failure_info.begin();
2878 p != failure_info.end();
2879 ++p) {
2880 if (can_mark_down(p->first)) {
2881 found_failure |= check_failure(now, p->first, p->second);
2882 }
2883 }
2884 return found_failure;
2885}
2886
2887bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
2888{
2889 // already pending failure?
2890 if (pending_inc.new_state.count(target_osd) &&
2891 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
2892 dout(10) << " already pending failure" << dendl;
2893 return true;
2894 }
2895
2896 set<string> reporters_by_subtree;
11fdf7f2
TL
2897 auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
2898 utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
7c673cae
FG
2899 utime_t max_failed_since = fi.get_failed_since();
2900 utime_t failed_for = now - max_failed_since;
2901
2902 utime_t grace = orig_grace;
2903 double my_grace = 0, peer_grace = 0;
2904 double decay_k = 0;
11fdf7f2
TL
2905 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
2906 double halflife = (double)g_conf()->mon_osd_laggy_halflife;
7c673cae
FG
2907 decay_k = ::log(.5) / halflife;
2908
2909 // scale grace period based on historical probability of 'lagginess'
2910 // (false positive failures due to slowness).
2911 const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
2912 double decay = exp((double)failed_for * decay_k);
2913 dout(20) << " halflife " << halflife << " decay_k " << decay_k
2914 << " failed_for " << failed_for << " decay " << decay << dendl;
2915 my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
2916 grace += my_grace;
2917 }
2918
2919 // consider the peers reporting a failure a proxy for a potential
2920 // 'subcluster' over the overall cluster that is similarly
2921 // laggy. this is clearly not true in all cases, but will sometimes
2922 // help us localize the grace correction to a subset of the system
2923 // (say, a rack with a bad switch) that is unhappy.
11fdf7f2 2924 ceph_assert(fi.reporters.size());
eafe8130 2925 for (auto p = fi.reporters.begin(); p != fi.reporters.end();) {
7c673cae
FG
2926 // get the parent bucket whose type matches with "reporter_subtree_level".
2927 // fall back to OSD if the level doesn't exist.
eafe8130
TL
2928 if (osdmap.exists(p->first)) {
2929 auto reporter_loc = osdmap.crush->get_full_location(p->first);
2930 if (auto iter = reporter_loc.find(reporter_subtree_level);
2931 iter == reporter_loc.end()) {
2932 reporters_by_subtree.insert("osd." + to_string(p->first));
2933 } else {
2934 reporters_by_subtree.insert(iter->second);
2935 }
2936 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
2937 const osd_xinfo_t& xi = osdmap.get_xinfo(p->first);
2938 utime_t elapsed = now - xi.down_stamp;
2939 double decay = exp((double)elapsed * decay_k);
2940 peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
2941 }
2942 ++p;
7c673cae 2943 } else {
eafe8130
TL
2944 fi.cancel_report(p->first);;
2945 p = fi.reporters.erase(p);
7c673cae
FG
2946 }
2947 }
2948
11fdf7f2 2949 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
7c673cae
FG
2950 peer_grace /= (double)fi.reporters.size();
2951 grace += peer_grace;
2952 }
2953
2954 dout(10) << " osd." << target_osd << " has "
2955 << fi.reporters.size() << " reporters, "
2956 << grace << " grace (" << orig_grace << " + " << my_grace
2957 << " + " << peer_grace << "), max_failed_since " << max_failed_since
2958 << dendl;
2959
2960 if (failed_for >= grace &&
11fdf7f2 2961 reporters_by_subtree.size() >= g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
7c673cae
FG
2962 dout(1) << " we have enough reporters to mark osd." << target_osd
2963 << " down" << dendl;
2964 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
2965
31f18b77
FG
2966 mon->clog->info() << "osd." << target_osd << " failed ("
2967 << osdmap.crush->get_full_location_ordered_string(
2968 target_osd)
2969 << ") ("
2970 << (int)reporters_by_subtree.size()
2971 << " reporters from different "
7c673cae
FG
2972 << reporter_subtree_level << " after "
2973 << failed_for << " >= grace " << grace << ")";
2974 return true;
2975 }
2976 return false;
2977}
2978
224ce89b 2979void OSDMonitor::force_failure(int target_osd, int by)
7c673cae
FG
2980{
2981 // already pending failure?
2982 if (pending_inc.new_state.count(target_osd) &&
2983 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
2984 dout(10) << " already pending failure" << dendl;
2985 return;
2986 }
2987
2988 dout(1) << " we're forcing failure of osd." << target_osd << dendl;
2989 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
2990
31f18b77
FG
2991 mon->clog->info() << "osd." << target_osd << " failed ("
2992 << osdmap.crush->get_full_location_ordered_string(target_osd)
2993 << ") (connection refused reported by osd." << by << ")";
7c673cae
FG
2994 return;
2995}
2996
/**
 * Handle an MOSDFailure in the write path: either record/act on a new
 * failure report against the target osd, or cancel a previous report.
 *
 * @param op  wraps the MOSDFailure message
 * @return true if a map update was queued (caller should propose),
 *         false otherwise
 */
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
  dout(1) << "prepare_failure osd." << m->get_target_osd()
	  << " " << m->get_target_addrs()
	  << " from " << m->get_orig_source()
	  << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target_osd();
  int reporter = m->get_orig_source().num();
  // preprocess_failure is expected to have filtered out reports about
  // osds that are not up or whose addrs don't match; assert that here.
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());

  // no direct ack for failure reports; the osd learns the outcome from
  // subsequent map updates
  mon->no_reply(op);

  if (m->if_osd_failed()) {
    // calculate failure time
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      // e.g. connection refused: skip the grace logic entirely
      mon->clog->debug() << "osd." << m->get_target_osd()
			 << " reported immediately failed by "
			 << m->get_orig_source();
      force_failure(target_osd, reporter);
      return true;
    }
    mon->clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
		       << m->get_orig_source();

    failure_info_t& fi = failure_info[target_osd];
    // a second report from the same reporter supersedes the first; the
    // superseded op must still be released (no_reply) so it isn't leaked
    MonOpRequestRef old_op = fi.add_report(reporter, failed_since, op);
    if (old_op) {
      mon->no_reply(old_op);
    }

    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon->clog->debug() << "osd." << m->get_target_osd()
		       << " failure report canceled by "
		       << m->get_orig_source();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      MonOpRequestRef report_op = fi.cancel_report(reporter);
      if (report_op) {
	mon->no_reply(report_op);
      }
      // drop the whole entry once the last reporter cancels
      if (fi.reporters.empty()) {
	dout(10) << " removing last failure_info for osd." << target_osd
		 << dendl;
	failure_info.erase(target_osd);
      } else {
	dout(10) << " failure_info for osd." << target_osd << " now "
		 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
  }

  return false;
}
3063
3064void OSDMonitor::process_failures()
3065{
3066 map<int,failure_info_t>::iterator p = failure_info.begin();
3067 while (p != failure_info.end()) {
3068 if (osdmap.is_up(p->first)) {
3069 ++p;
3070 } else {
3071 dout(10) << "process_failures osd." << p->first << dendl;
3072 list<MonOpRequestRef> ls;
3073 p->second.take_report_messages(ls);
3074 failure_info.erase(p++);
3075
3076 while (!ls.empty()) {
3077 MonOpRequestRef o = ls.front();
3078 if (o) {
3079 o->mark_event(__func__);
3080 MOSDFailure *m = o->get_req<MOSDFailure>();
3081 send_latest(o, m->get_epoch());
28e407b8 3082 mon->no_reply(o);
7c673cae
FG
3083 }
3084 ls.pop_front();
3085 }
3086 }
3087 }
3088}
3089
3090void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
3091{
3092 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
3093
3094 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3095 p != failure_info.end();
3096 ++p) {
3097 p->second.take_report_messages(ls);
3098 }
3099 failure_info.clear();
3100}
3101
3102
3103// boot --
3104
/**
 * Read-path validation of an MOSDBoot message.
 *
 * Filters out boots that must not reach prepare_boot(): insufficient
 * caps, wrong cluster fsid, blank address, missing required feature
 * bits, release-compatibility violations, duplicate boots, fsid
 * clashes, stale boots, and osds blocked by "noup".
 *
 * @return true if the message was fully handled (or dropped) here;
 *         false to fall through to prepare_boot()
 */
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
	    << session->caps << dendl;
    goto ignore;
  }

  // osd must belong to this cluster
  if (m->sb.cluster_fsid != mon->monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
	    << " != " << mon->monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  ceph_assert(m->get_orig_source_inst().name.is_osd());

  // force all osds to have gone through luminous prior to upgrade to nautilus
  {
    vector<string> missing;
    if (!HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) {
      missing.push_back("CEPH_FEATURE_SERVER_LUMINOUS");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_JEWEL)) {
      missing.push_back("CEPH_FEATURE_SERVER_JEWEL");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) {
      missing.push_back("CEPH_FEATURE_SERVER_KRAKEN");
    }
    if (!HAVE_FEATURE(m->osd_features, OSD_RECOVERY_DELETES)) {
      missing.push_back("CEPH_FEATURE_OSD_RECOVERY_DELETES");
    }

    if (!missing.empty()) {
      using std::experimental::make_ostream_joiner;

      // join the missing feature names with ';' for the cluster log
      stringstream ss;
      copy(begin(missing), end(missing), make_ostream_joiner(ss, ";"));

      mon->clog->info() << "disallowing boot of OSD "
			<< m->get_orig_source_inst()
			<< " because the osd lacks " << ss.str();
      goto ignore;
    }
  }

  // make sure upgrades stop at nautilus
  if (HAVE_FEATURE(m->osd_features, SERVER_O) &&
      osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS) {
    mon->clog->info() << "disallowing boot of post-nautilus OSD "
		      << m->get_orig_source_inst()
		      << " because require_osd_release < nautilus";
    goto ignore;
  }

  // The release check here is required because for OSD_PGLOG_HARDLIMIT,
  // we are reusing a jewel feature bit that was retired in luminous.
  if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
      osdmap.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT) &&
      !(m->osd_features & CEPH_FEATURE_OSD_PGLOG_HARDLIMIT)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
    goto ignore;
  }

  // already booted?
  if (osdmap.is_up(from) &&
      osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
      osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
    // yup.  duplicate boot; just ack it without logging a "boot" event
    dout(7) << "preprocess_boot dup from " << m->get_orig_source()
	    << " " << m->get_orig_source_addrs()
	    << " =~ " << osdmap.get_addrs(from) << dendl;
    _booted(op, false);
    return true;
  }

  // a different osd_fsid under the same id means a different disk/daemon
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
            << " clashes with existing osd: different fsid"
            << " (ours: " << osdmap.get_uuid(from)
            << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // stale boot message (sent before the osd's last recorded up_from)?
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_addrs(from).legacy_equals(
	m->get_orig_source_addrs())) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3227
/**
 * Write-path handling of a (validated) MOSDBoot: queue all pending-map
 * changes needed to bring the osd up — addresses, uuid, metadata,
 * clean-interval, laggy statistics, features, and optional auto-mark-in
 * — then wait for the proposal to commit before acking.
 *
 * @return true if the op was consumed (waiting on a proposal),
 *         false if the osd id is out of range
 */
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
  dout(7) << __func__ << " from " << m->get_source()
	  << " sb " << m->sb
	  << " client_addrs" << m->get_connection()->get_peer_addrs()
	  << " cluster_addrs " << m->cluster_addrs
	  << " hb_back_addrs " << m->hb_back_addrs
	  << " hb_front_addrs " << m->hb_front_addrs
	  << dendl;

  ceph_assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
	    << osdmap.get_max_osd() << dendl;
    return false;
  }

  // current state with any pending (uncommitted) state bits applied
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up? mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down osd." << from << " "
	    << osdmap.get_addrs(from) << dendl;
    // preprocess should have caught these; if not, assert.
    ceph_assert(!osdmap.get_addrs(from).legacy_equals(
		  m->get_orig_source_addrs()) ||
		!osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
    ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
	(pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    // retry this boot once the down-marking has committed
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
	    << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addrs();
    pending_inc.new_up_cluster[from] = m->cluster_addrs;
    pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
    pending_inc.new_hb_front_up[from] = m->hb_front_addrs;

    down_pending_out.erase(from);  // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
	     << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this; if not, assert.
      ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd?
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
	dout(10) << " fresh osd; marking lost_at too" << dendl;
	pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata: store the osd's reported metadata blob with the map
    bufferlist osd_metadata;
    encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
	(m->sb.mounted == info.last_clean_begin &&
	 m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
	       << "[" << info.last_clean_begin << "," << info.last_clean_end
	       << ") -> [" << begin << "-" << end << ")"
	       << dendl;
      pending_inc.new_last_clean_interval[from] =
	pair<epoch_t,epoch_t>(begin, end);
    }

    // update the exponentially-weighted laggy statistics used by
    // check_failure() to stretch the heartbeat grace
    osd_xinfo_t xi = osdmap.get_xinfo(from);
    if (m->boot_epoch == 0) {
      // first boot of this daemon instance: decay the laggy estimates
      xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      // rebooting after having been marked down: fold the down interval
      // (capped at mon_osd_laggy_max_interval) into the estimates
      if (xi.down_stamp.sec()) {
        int interval = ceph_clock_now().sec() -
	  xi.down_stamp.sec();
        if (g_conf()->mon_osd_laggy_max_interval &&
	    (interval > g_conf()->mon_osd_laggy_max_interval)) {
          interval =  g_conf()->mon_osd_laggy_max_interval;
        }
        xi.laggy_interval =
	  interval * g_conf()->mon_osd_laggy_weight +
	  xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
	g_conf()->mon_osd_laggy_weight +
	xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?
    if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
	 (oldstate & CEPH_OSD_AUTOOUT)) ||
	(g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
	(g_conf()->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
	// restore the weight the osd had before being auto-marked out
	if (osdmap.osd_xinfo[from].old_weight > 0) {
	  pending_inc.new_weight[from] = osdmap.osd_xinfo[from].old_weight;
	  xi.old_weight = 0;
	} else {
	  pending_inc.new_weight[from] = CEPH_OSD_IN;
	}
      } else {
	dout(7) << __func__ << " NOIN set, will not mark in "
		<< m->get_orig_source_addr() << dendl;
      }
    }

    pending_inc.new_xinfo[from] = xi;

    // wait: ack (via _booted) once the proposal commits
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
3382
3383void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
3384{
3385 op->mark_osdmon_event(__func__);
3386 MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
3387 dout(7) << "_booted " << m->get_orig_source_inst()
3388 << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
3389
3390 if (logit) {
11fdf7f2
TL
3391 mon->clog->info() << m->get_source() << " " << m->get_orig_source_addrs()
3392 << " boot";
7c673cae
FG
3393 }
3394
3395 send_latest(op, m->sb.current_epoch+1);
3396}
3397
3398
3399// -------------
3400// full
3401
/**
 * Read-path validation of an MOSDFull (osd-reported full/backfillfull/
 * nearfull state).  Drops messages from unauthorized, nonexistent or
 * stale osd instances, and short-circuits when the requested state
 * already matches the map.
 *
 * @return true if handled/dropped here; false to fall through to
 *         prepare_full()
 */
bool OSDMonitor::preprocess_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
  int from = m->get_orig_source().num();
  set<string> state;
  // only these three bits may be changed via MOSDFull
  unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;

  // check permissions, ignore if failed
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "MOSDFull from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  // ignore a full message from the osd instance that already went down
  if (!osdmap.exists(from)) {
    dout(7) << __func__ << " ignoring full message from nonexistent "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }
  // the sender must be the instance the map knows about: either the
  // currently-up instance, or not a stale echo of a previous one
  if ((!osdmap.is_up(from) &&
       osdmap.get_most_recent_addrs(from).legacy_equals(
	 m->get_orig_source_addrs())) ||
      (osdmap.is_up(from) &&
       !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
    dout(7) << __func__ << " ignoring full message from down "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  OSDMap::calc_state_set(osdmap.get_state(from), state);

  // requested state already in effect? just ack with the map
  if ((osdmap.get_state(from) & mask) == m->state) {
    dout(7) << __func__ << " state already " << state << " for osd." << from
	    << " " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    goto ignore;
  }

  dout(10) << __func__ << " want state " << state << " for osd." << from
	   << " " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3452
/**
 * Write-path handling of MOSDFull: queue the state-bit changes needed
 * so the osd's nearfull/backfillfull/full bits match what it reported.
 * new_state entries are XOR deltas against the committed map, so the
 * pending delta is recomputed from scratch here.
 *
 * @return always true; the ack is sent after the proposal commits
 */
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
  const int from = m->get_orig_source().num();

  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask;  // safety first

  // effective current state = committed state XOR pending delta
  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    // clear any pending full-state bits, then set the delta that turns
    // the committed state into the wanted state
    if (p != pending_inc.new_state.end()) {
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " -> " << want_state_set << dendl;
  } else {
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
3490
3491// -------------
3492// alive
3493
3494bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
3495{
3496 op->mark_osdmon_event(__func__);
3497 MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
3498 int from = m->get_orig_source().num();
3499
3500 // check permissions, ignore if failed
11fdf7f2 3501 MonSession *session = op->get_session();
7c673cae
FG
3502 if (!session)
3503 goto ignore;
3504 if (!session->is_capable("osd", MON_CAP_X)) {
3505 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
3506 << session->caps << dendl;
3507 goto ignore;
3508 }
3509
3510 if (!osdmap.is_up(from) ||
11fdf7f2
TL
3511 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
3512 dout(7) << "preprocess_alive ignoring alive message from down "
3513 << m->get_orig_source() << " " << m->get_orig_source_addrs()
3514 << dendl;
7c673cae
FG
3515 goto ignore;
3516 }
3517
3518 if (osdmap.get_up_thru(from) >= m->want) {
3519 // yup.
3520 dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
3521 _reply_map(op, m->version);
3522 return true;
3523 }
3524
3525 dout(10) << "preprocess_alive want up_thru " << m->want
3526 << " from " << m->get_orig_source_inst() << dendl;
3527 return false;
3528
3529 ignore:
3530 return true;
3531}
3532
3533bool OSDMonitor::prepare_alive(MonOpRequestRef op)
3534{
3535 op->mark_osdmon_event(__func__);
3536 MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
3537 int from = m->get_orig_source().num();
3538
3539 if (0) { // we probably don't care much about these
3540 mon->clog->debug() << m->get_orig_source_inst() << " alive";
3541 }
3542
3543 dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
3544 << " from " << m->get_orig_source_inst() << dendl;
3545
3546 update_up_thru(from, m->version); // set to the latest map the OSD has
3547 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
3548 return true;
3549}
3550
3551void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
3552{
3553 op->mark_osdmon_event(__func__);
3554 dout(7) << "_reply_map " << e
3555 << " from " << op->get_req()->get_orig_source_inst()
3556 << dendl;
3557 send_latest(op, e);
3558}
3559
3560// pg_created
3561bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
3562{
3563 op->mark_osdmon_event(__func__);
3564 auto m = static_cast<MOSDPGCreated*>(op->get_req());
3565 dout(10) << __func__ << " " << *m << dendl;
11fdf7f2 3566 auto session = op->get_session();
94b18763 3567 mon->no_reply(op);
7c673cae
FG
3568 if (!session) {
3569 dout(10) << __func__ << ": no monitor session!" << dendl;
3570 return true;
3571 }
3572 if (!session->is_capable("osd", MON_CAP_X)) {
3573 derr << __func__ << " received from entity "
3574 << "with insufficient privileges " << session->caps << dendl;
3575 return true;
3576 }
3577 // always forward the "created!" to the leader
3578 return false;
3579}
3580
3581bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
3582{
3583 op->mark_osdmon_event(__func__);
3584 auto m = static_cast<MOSDPGCreated*>(op->get_req());
3585 dout(10) << __func__ << " " << *m << dendl;
3586 auto src = m->get_orig_source();
3587 auto from = src.num();
3588 if (!src.is_osd() ||
3589 !mon->osdmon()->osdmap.is_up(from) ||
11fdf7f2
TL
3590 !mon->osdmon()->osdmap.get_addrs(from).legacy_equals(
3591 m->get_orig_source_addrs())) {
7c673cae
FG
3592 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
3593 return false;
3594 }
3595 pending_created_pgs.push_back(m->pgid);
3596 return true;
3597}
3598
11fdf7f2
TL
3599bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op)
3600{
3601 op->mark_osdmon_event(__func__);
3602 auto m = static_cast<MOSDPGReadyToMerge*>(op->get_req());
3603 dout(10) << __func__ << " " << *m << dendl;
3604 const pg_pool_t *pi;
3605 auto session = op->get_session();
3606 if (!session) {
3607 dout(10) << __func__ << ": no monitor session!" << dendl;
3608 goto ignore;
3609 }
3610 if (!session->is_capable("osd", MON_CAP_X)) {
3611 derr << __func__ << " received from entity "
3612 << "with insufficient privileges " << session->caps << dendl;
3613 goto ignore;
3614 }
3615 pi = osdmap.get_pg_pool(m->pgid.pool());
3616 if (!pi) {
3617 derr << __func__ << " pool for " << m->pgid << " dne" << dendl;
3618 goto ignore;
3619 }
3620 if (pi->get_pg_num() <= m->pgid.ps()) {
3621 dout(20) << " pg_num " << pi->get_pg_num() << " already < " << m->pgid << dendl;
3622 goto ignore;
3623 }
3624 if (pi->get_pg_num() != m->pgid.ps() + 1) {
3625 derr << " OSD trying to merge wrong pgid " << m->pgid << dendl;
3626 goto ignore;
3627 }
3628 if (pi->get_pg_num_pending() > m->pgid.ps()) {
3629 dout(20) << " pg_num_pending " << pi->get_pg_num_pending() << " > " << m->pgid << dendl;
3630 goto ignore;
3631 }
3632 return false;
3633
3634 ignore:
3635 mon->no_reply(op);
3636 return true;
3637}
3638
/**
 * Write-path handling of MOSDPGReadyToMerge: either commit the pg_num
 * decrement (the merge) or, if the osd reported not-ready, cancel the
 * pending merge by resetting pg_num_pending.  Re-validates against the
 * pending pool state to catch races with concurrent pg_num changes.
 *
 * @return always true; the reply is sent after the proposal commits
 */
bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = static_cast<MOSDPGReadyToMerge*>(op->get_req());
  dout(10) << __func__ << " " << *m << dendl;
  // work on the pending copy of the pool if one exists
  pg_pool_t p;
  if (pending_inc.new_pools.count(m->pgid.pool()))
    p = pending_inc.new_pools[m->pgid.pool()];
  else
    p = *osdmap.get_pg_pool(m->pgid.pool());
  if (p.get_pg_num() != m->pgid.ps() + 1 ||
      p.get_pg_num_pending() > m->pgid.ps()) {
    dout(10) << __func__
	     << " race with concurrent pg_num[_pending] update, will retry"
	     << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
    return true;
  }

  if (m->ready) {
    // commit the merge: shrink pg_num by one
    p.dec_pg_num(m->pgid,
		 pending_inc.epoch,
		 m->source_version,
		 m->target_version,
		 m->last_epoch_started,
		 m->last_epoch_clean);
    p.last_change = pending_inc.epoch;
  } else {
    // back off the merge attempt!
    p.set_pg_num_pending(p.get_pg_num());
  }

  // force pre-nautilus clients to resend their ops, since they
  // don't understand pg_num_pending changes form a new interval
  p.last_force_op_resend_prenautilus = pending_inc.epoch;

  pending_inc.new_pools[m->pgid.pool()] = p;

  // test hook: randomly bounce pg_num back up to exercise merge
  // cancellation paths (mon_inject_pg_merge_bounce_probability)
  auto prob = g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability");
  if (m->ready &&
      prob > 0 &&
      prob > (double)(rand() % 1000)/1000.0) {
    derr << __func__ << " injecting pg merge pg_num bounce" << dendl;
    auto n = new MMonCommand(mon->monmap->get_fsid());
    n->set_connection(m->get_connection());
    n->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
	       osdmap.get_pool_name(m->pgid.pool()) +
	       "\", \"var\": \"pg_num_actual\", \"val\": \"" +
	       stringify(m->pgid.ps() + 1) + "\"}" };
    MonOpRequestRef nop = mon->op_tracker.create_request<MonOpRequest>(n);
    nop->set_type_service();
    wait_for_finished_proposal(op, new C_RetryMessage(this, nop));
  } else {
    wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  }
  return true;
}
3696
3697
7c673cae
FG
3698// -------------
3699// pg_temp changes
3700
/**
 * Read-path validation of MOSDPGTemp.  Drops messages from
 * unauthorized or down senders, and filters out per-pg mappings that
 * are stale (pool removed, primary changed) or that would not change
 * anything.  Returns false only if at least one mapping would actually
 * change the map (so prepare path is needed).
 */
bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
{
  MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
  dout(10) << "preprocess_pgtemp " << *m << dendl;
  mempool::osdmap::vector<int> empty;
  int from = m->get_orig_source().num();
  // number of per-pg entries judged stale/no-op below
  size_t ignore_cnt = 0;

  // check caps
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  // sender must be the up instance the map knows about
  if (!osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
    dout(7) << "ignoring pgtemp message from down "
	    << m->get_orig_source() << " " << m->get_orig_source_addrs()
	    << dendl;
    goto ignore;
  }

  // forced pg_temp updates skip all filtering below
  if (m->forced) {
    return false;
  }

  for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    dout(20) << " " << p->first
	     << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
	     << " -> " << p->second << dendl;

    // does the pool exist?
    if (!osdmap.have_pg_pool(p->first.pool())) {
      /*
       * 1. If the osdmap does not have the pool, it means the pool has been
       *    removed in-between the osd sending this message and us handling it.
       * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
       *    not exist in the pending either, as the osds would not send a
       *    message about a pool they know nothing about (yet).
       * 3. However, if the pool does exist in the pending, then it must be a
       *    new pool, and not relevant to this message (see 1).
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool has been removed" << dendl;
      ignore_cnt++;
      continue;
    }

    int acting_primary = -1;
    osdmap.pg_to_up_acting_osds(
      p->first, nullptr, nullptr, nullptr, &acting_primary);
    if (acting_primary != from) {
      /* If the source isn't the primary based on the current osdmap, we know
       * that the interval changed and that we can discard this message.
       * Indeed, we must do so to avoid 16127 since we can't otherwise determine
       * which of two pg temp mappings on the same pg is more recent.
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": primary has changed" << dendl;
      ignore_cnt++;
      continue;
    }

    // removal?
    if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
			      osdmap.primary_temp->count(p->first)))
      return false;
    // change?
    // NOTE: we assume that this will clear pg_primary, so consider
    //       an existing pg_primary field to imply a change
    if (p->second.size() &&
	(osdmap.pg_temp->count(p->first) == 0 ||
	 osdmap.pg_temp->get(p->first) != p->second ||
	 osdmap.primary_temp->count(p->first)))
      return false;
  }

  // should we ignore all the pgs?
  if (ignore_cnt == m->pg_temp.size())
    goto ignore;

  // every entry is already reflected in the map: ack without preparing
  dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
  _reply_map(op, m->map_epoch);
  return true;

 ignore:
  return true;
}
3793
3794void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
3795{
3796 epoch_t old_up_thru = osdmap.get_up_thru(from);
3797 auto ut = pending_inc.new_up_thru.find(from);
3798 if (ut != pending_inc.new_up_thru.end()) {
3799 old_up_thru = ut->second;
3800 }
3801 if (up_thru > old_up_thru) {
3802 // set up_thru too, so the osd doesn't have to ask again
3803 pending_inc.new_up_thru[from] = up_thru;
3804 }
3805}
3806
// Apply a validated MOSDPGTemp to the pending incremental: record the new
// pg_temp mappings, clear any primary_temp for those pgs, and advance the
// sender's up_thru.  The sender gets a map reply once the proposal commits.
bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
  int from = m->get_orig_source().num();
  dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
  for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    uint64_t pool = p->first.pool();
    // skip pools queued for deletion in this same proposal
    if (pending_inc.old_pools.count(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool pending removal" << dendl;
      continue;
    }
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool has been removed" << dendl;
      continue;
    }
    pending_inc.new_pg_temp[p->first] =
      mempool::osdmap::vector<int>(p->second.begin(), p->second.end());

    // unconditionally clear pg_primary (until this message can encode
    // a change for that, too.. at which point we need to also fix
    // preprocess_pg_temp)
    if (osdmap.primary_temp->count(p->first) ||
	pending_inc.new_primary_temp.count(p->first))
      pending_inc.new_primary_temp[p->first] = -1;
  }

  // set up_thru too, so the osd doesn't have to ask again
  update_up_thru(from, m->map_epoch);

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
  return true;
}
3842
3843
3844// ---
3845
// Fast path for MRemoveSnaps: return true (no proposal needed) when every
// requested snap is already recorded as removed; return false to fall
// through to prepare_remove_snaps().  This message is never replied to.
bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
  dout(7) << "preprocess_remove_snaps " << *m << dendl;

  // check privilege, ignore if failed
  MonSession *session = op->get_session();
  mon->no_reply(op);
  if (!session)
    goto ignore;
  if (!session->caps.is_capable(
	cct,
	CEPH_ENTITY_TYPE_MON,
	session->entity_name,
        "osd", "osd pool rmsnap", {}, true, true, false,
	session->get_peer_socket_addr())) {
    dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
       q != m->snaps.end();
       ++q) {
    if (!osdmap.have_pg_pool(q->first)) {
      dout(10) << " ignoring removed_snaps " << q->second << " on non-existent pool " << q->first << dendl;
      continue;
    }
    const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
    for (vector<snapid_t>::iterator p = q->second.begin();
	 p != q->second.end();
	 ++p) {
      // a snap past snap_seq, or not yet in removed_snaps, needs an update
      if (*p > pi->get_snap_seq() ||
	  !pi->removed_snaps.contains(*p))
	return false;
    }
  }

 ignore:
  return true;
}
3888
// Stage removal of the requested snaps into the pending incremental: add
// each snap to the pool's removed_snaps interval set, mark the pool as
// using self-managed snaps, and bump snap_seq/snap_epoch as needed.
bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
  dout(7) << "prepare_remove_snaps " << *m << dendl;

  for (map<int, vector<snapid_t> >::iterator p = m->snaps.begin();
       p != m->snaps.end();
       ++p) {

    if (!osdmap.have_pg_pool(p->first)) {
      dout(10) << " ignoring removed_snaps " << p->second << " on non-existent pool " << p->first << dendl;
      continue;
    }

    pg_pool_t& pi = osdmap.pools[p->first];
    for (vector<snapid_t>::iterator q = p->second.begin();
	 q != p->second.end();
	 ++q) {
      // only act if the snap is not removed in either the committed pool
      // or an already-pending copy of it
      if (!pi.removed_snaps.contains(*q) &&
	  (!pending_inc.new_pools.count(p->first) ||
	   !pending_inc.new_pools[p->first].removed_snaps.contains(*q))) {
	pg_pool_t *newpi = pending_inc.get_new_pool(p->first, &pi);
	newpi->removed_snaps.insert(*q);
	newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
	dout(10) << " pool " << p->first << " removed_snaps added " << *q
		 << " (now " << newpi->removed_snaps << ")" << dendl;
	// snap_seq must cover every removed snap id
	if (*q > newpi->get_snap_seq()) {
	  dout(10) << " pool " << p->first << " snap_seq "
		   << newpi->get_snap_seq() << " -> " << *q << dendl;
	  newpi->set_snap_seq(*q);
	}
	newpi->set_snap_epoch(pending_inc.epoch);
	pending_inc.new_removed_snaps[p->first].insert(*q);
      }
    }
  }
  return true;
}
3928
3929// osd beacon
3930bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
3931{
3932 op->mark_osdmon_event(__func__);
7c673cae 3933 // check caps
11fdf7f2 3934 auto session = op->get_session();
94b18763 3935 mon->no_reply(op);
7c673cae
FG
3936 if (!session) {
3937 dout(10) << __func__ << " no monitor session!" << dendl;
3938 return true;
3939 }
3940 if (!session->is_capable("osd", MON_CAP_X)) {
3941 derr << __func__ << " received from entity "
3942 << "with insufficient privileges " << session->caps << dendl;
3943 return true;
3944 }
3945 // Always forward the beacon to the leader, even if they are the same as
3946 // the old one. The leader will mark as down osds that haven't sent
3947 // beacon for a few minutes.
3948 return false;
3949}
3950
// Handle an OSD beacon on the leader: refresh the sender's liveness
// timestamp, note its map epoch, and fold its per-pg last_epoch_clean
// reports in.  Always returns false (never triggers a proposal itself).
bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const auto beacon = static_cast<MOSDBeacon*>(op->get_req());
  const auto src = beacon->get_orig_source();
  dout(10) << __func__ << " " << *beacon
	   << " from " << src << dendl;
  int from = src.num();

  // only accept beacons from osds that are up at the same address
  if (!src.is_osd() ||
      !osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs())) {
    if (src.is_osd() && !osdmap.is_up(from)) {
      // share some new maps with this guy in case it may not be
      // aware of its own deadness...
      send_latest(op, beacon->version+1);
    }
    dout(1) << " ignoring beacon from non-active osd." << from << dendl;
    return false;
  }

  last_osd_report[from] = ceph_clock_now();
  osd_epochs[from] = beacon->version;

  for (const auto& pg : beacon->pgs) {
    last_epoch_clean.report(pg, beacon->min_last_epoch_clean);
  }
  return false;
}
3980
3981// ---------------
3982// map helpers
3983
3984void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
3985{
3986 op->mark_osdmon_event(__func__);
3987 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
3988 << " start " << start << dendl;
3989 if (start == 0)
3990 send_full(op);
3991 else
3992 send_incremental(op, start);
3993}
3994
3995
28e407b8 3996MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
7c673cae 3997{
28e407b8
AA
3998 MOSDMap *r = new MOSDMap(mon->monmap->fsid, features);
3999 get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
7c673cae
FG
4000 r->oldest_map = get_first_committed();
4001 r->newest_map = osdmap.get_epoch();
4002 return r;
4003}
4004
// Build a MOSDMap message covering epochs [from..to].  Prefers incremental
// encodings; where an incremental has been trimmed, falls back to a full
// map for that epoch.  Aborts if neither is available.
MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
{
  dout(10) << "build_incremental [" << from << ".." << to << "] with features "
	   << std::hex << features << std::dec << dendl;
  MOSDMap *m = new MOSDMap(mon->monmap->fsid, features);
  m->oldest_map = get_first_committed();
  m->newest_map = osdmap.get_epoch();

  // walk newest to oldest
  for (epoch_t e = to; e >= from && e > 0; e--) {
    bufferlist bl;
    int err = get_version(e, features, bl);
    if (err == 0) {
      ceph_assert(bl.length());
      // if (get_version(e, bl) > 0) {
      dout(20) << "build_incremental inc " << e << " "
	       << bl.length() << " bytes" << dendl;
      m->incremental_maps[e] = bl;
    } else {
      ceph_assert(err == -ENOENT);
      ceph_assert(!bl.length());
      // incremental is gone; try the full map for this epoch instead
      get_version_full(e, features, bl);
      if (bl.length() > 0) {
	//else if (get_version("full", e, bl) > 0) {
	dout(20) << "build_incremental full " << e << " "
		 << bl.length() << " bytes" << dendl;
	m->maps[e] = bl;
      } else {
	ceph_abort();  // we should have all maps.
      }
    }
  }
  return m;
}
4038
4039void OSDMonitor::send_full(MonOpRequestRef op)
4040{
4041 op->mark_osdmon_event(__func__);
4042 dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
28e407b8 4043 mon->send_reply(op, build_latest_full(op->get_session()->con_features));
7c673cae
FG
4044}
4045
// Send incremental maps starting at |first| in reply to |op|.  If the op
// arrived via another monitor (proxy connection), route the request back so
// the proxying mon performs the send instead.
void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
{
  op->mark_osdmon_event(__func__);

  MonSession *s = op->get_session();
  ceph_assert(s);

  if (s->proxy_con) {
    // oh, we can tell the other mon to do it
    dout(10) << __func__ << " asking proxying mon to send_incremental from "
	     << first << dendl;
    MRoute *r = new MRoute(s->proxy_tid, NULL);
    r->send_osdmap_first = first;
    s->proxy_con->send_message(r);
    op->mark_event("reply: send routed send_osdmap_first reply");
  } else {
    // do it ourselves
    send_incremental(first, s, false, op);
  }
}
4066
// Send the session incremental maps covering [first..current].  If |first|
// predates our first committed epoch, start with a full base map (plus the
// removed-snaps accumulated over the trimmed gap).  With |req| set, only a
// single reply message is sent; otherwise batches continue on the session's
// own connection unless |onetime| stops after the first batch.  Updates
// session->osd_epoch as maps are handed out.
void OSDMonitor::send_incremental(epoch_t first,
				  MonSession *session,
				  bool onetime,
				  MonOpRequestRef req)
{
  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
	  << " to " << session->name << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = session->con_features ? session->con_features :
    mon->get_quorum_con_features();

  // don't resend epochs the session is known to have
  if (first <= session->osd_epoch) {
    dout(10) << __func__ << " " << session->name << " should already have epoch "
	     << session->osd_epoch << dendl;
    first = session->osd_epoch + 1;
  }

  if (first < get_first_committed()) {
    // requested range has been trimmed; start from a full base map
    MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
    m->oldest_map = get_first_committed();
    m->newest_map = osdmap.get_epoch();

    // share removed snaps during the gap
    get_removed_snaps_range(first, m->oldest_map, &m->gap_removed_snaps);

    first = get_first_committed();
    bufferlist bl;
    int err = get_version_full(first, features, bl);
    ceph_assert(err == 0);
    ceph_assert(bl.length());
    dout(20) << "send_incremental starting with base full "
	     << first << " " << bl.length() << " bytes" << dendl;
    m->maps[first] = bl;

    if (req) {
      // a routed op gets exactly one reply
      mon->send_reply(req, m);
      session->osd_epoch = first;
      return;
    } else {
      session->con->send_message(m);
      session->osd_epoch = first;
    }
    first++;
  }

  while (first <= osdmap.get_epoch()) {
    // cap each message at osd_map_message_max epochs
    epoch_t last = std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1,
				     osdmap.get_epoch());
    MOSDMap *m = build_incremental(first, last, features);

    if (req) {
      // send some maps. it may not be all of them, but it will get them
      // started.
      mon->send_reply(req, m);
    } else {
      session->con->send_message(m);
      first = last + 1;
    }
    session->osd_epoch = last;
    if (onetime || req)
      break;
  }
}
4132
11fdf7f2
TL
4133void OSDMonitor::get_removed_snaps_range(
4134 epoch_t start, epoch_t end,
4135 mempool::osdmap::map<int64_t,OSDMap::snap_interval_set_t> *gap_removed_snaps)
4136{
4137 // we only care about pools that exist now.
4138 for (auto& p : osdmap.get_pools()) {
4139 auto& t = (*gap_removed_snaps)[p.first];
4140 for (epoch_t epoch = start; epoch < end; ++epoch) {
4141 string k = make_snap_epoch_key(p.first, epoch);
4142 bufferlist v;
4143 mon->store->get(OSD_SNAP_PREFIX, k, v);
4144 if (v.length()) {
4145 auto q = v.cbegin();
4146 OSDMap::snap_interval_set_t snaps;
4147 decode(snaps, q);
4148 t.union_of(snaps);
4149 }
4150 }
4151 dout(10) << __func__ << " " << p.first << " " << t << dendl;
4152 }
4153}
4154
7c673cae
FG
// Convenience wrapper: fetch incremental |ver| encoded with the current
// quorum's connection features.
int OSDMonitor::get_version(version_t ver, bufferlist& bl)
{
  return get_version(ver, mon->get_quorum_con_features(), bl);
}
4159
// Re-encode an incremental map blob in place for |features|, including any
// embedded full map or crush map it carries.  The effective feature set is
// intersected with the incremental's own canonical encode_features.
void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
{
  OSDMap::Incremental inc;
  auto q = bl.cbegin();
  inc.decode(q);
  // always encode with subset of osdmap's canonical features
  uint64_t f = features & inc.encode_features;
  dout(20) << __func__ << " " << inc.epoch << " with features " << f
	   << dendl;
  bl.clear();
  if (inc.fullmap.length()) {
    // embedded full map?
    OSDMap m;
    m.decode(inc.fullmap);
    inc.fullmap.clear();
    m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
  }
  if (inc.crush.length()) {
    // embedded crush map
    CrushWrapper c;
    auto p = inc.crush.cbegin();
    c.decode(p);
    inc.crush.clear();
    c.encode(inc.crush, f);
  }
  inc.encode(bl, f | CEPH_FEATURE_RESERVED);
}
4187
4188void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
4189{
4190 OSDMap m;
11fdf7f2 4191 auto q = bl.cbegin();
28e407b8
AA
4192 m.decode(q);
4193 // always encode with subset of osdmap's canonical features
4194 uint64_t f = features & m.get_encoding_features();
4195 dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
4196 << dendl;
4197 bl.clear();
4198 m.encode(bl, f | CEPH_FEATURE_RESERVED);
4199}
4200
// Fetch the incremental map for |ver|, re-encoding for |features| when
// they differ significantly from the quorum's, with a per-feature-mask
// cache in front of the paxos store.
int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
{
  uint64_t significant_features = OSDMap::get_significant_features(features);
  if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
    return 0;
  }
  int ret = PaxosService::get_version(ver, bl);
  if (ret < 0) {
    return ret;
  }
  // NOTE: this check is imprecise; the OSDMap encoding features may
  // be a subset of the latest mon quorum features, but worst case we
  // reencode once and then cache the (identical) result under both
  // feature masks.
  if (significant_features !=
      OSDMap::get_significant_features(mon->get_quorum_con_features())) {
    reencode_incremental_map(bl, features);
  }
  inc_osd_cache.add_bytes({ver, significant_features}, bl);
  return 0;
}
4222
11fdf7f2
TL
4223int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc)
4224{
4225 bufferlist inc_bl;
4226 int err = get_version(ver, inc_bl);
4227 ceph_assert(err == 0);
4228 ceph_assert(inc_bl.length());
4229
4230 auto p = inc_bl.cbegin();
4231 inc.decode(p);
4232 dout(10) << __func__ << " "
4233 << " epoch " << inc.epoch
4234 << " inc_crc " << inc.inc_crc
4235 << " full_crc " << inc.full_crc
4236 << " encode_features " << inc.encode_features << dendl;
4237 return 0;
4238}
4239
// Reconstruct the full osdmap for |ver| after its full encoding has been
// trimmed: start from the closest pinned full map at or below |ver| (or a
// newer cached full map in between), then replay incrementals up to |ver|.
// Returns -ENOENT when no pinned map is available; asserts on corruption.
int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl)
{
  dout(10) << __func__ << " ver " << ver << dendl;

  version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver);
  if (closest_pinned == 0) {
    return -ENOENT;
  }
  if (closest_pinned > ver) {
    dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl;
  }
  ceph_assert(closest_pinned <= ver);

  dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl;

  // get osdmap incremental maps and apply on top of this one.
  bufferlist osdm_bl;
  bool has_cached_osdmap = false;
  // prefer a cached full map newer than the pin: fewer incrementals to replay
  for (version_t v = ver-1; v >= closest_pinned; --v) {
    if (full_osd_cache.lookup({v, mon->get_quorum_con_features()},
			      &osdm_bl)) {
      dout(10) << __func__ << " found map in cache ver " << v << dendl;
      closest_pinned = v;
      has_cached_osdmap = true;
      break;
    }
  }

  if (!has_cached_osdmap) {
    int err = PaxosService::get_version_full(closest_pinned, osdm_bl);
    if (err != 0) {
      derr << __func__ << " closest pinned map ver " << closest_pinned
	   << " not available! error: " << cpp_strerror(err) << dendl;
    }
    ceph_assert(err == 0);
  }

  ceph_assert(osdm_bl.length());

  OSDMap osdm;
  osdm.decode(osdm_bl);

  dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned
	   << " e" << osdm.epoch
	   << " crc " << osdm.get_crc()
	   << " -- applying incremental maps." << dendl;

  uint64_t encode_features = 0;
  for (version_t v = closest_pinned + 1; v <= ver; ++v) {
    dout(20) << __func__ << " applying inc epoch " << v << dendl;

    OSDMap::Incremental inc;
    int err = get_inc(v, inc);
    ceph_assert(err == 0);

    encode_features = inc.encode_features;

    err = osdm.apply_incremental(inc);
    ceph_assert(err == 0);

    // this block performs paranoid checks on map retrieval
    if (g_conf().get_val<bool>("mon_debug_extra_checks") &&
	inc.full_crc != 0) {

      uint64_t f = encode_features;
      if (!f) {
	f = (mon->quorum_con_features ? mon->quorum_con_features : -1);
      }

      // encode osdmap to force calculating crcs
      bufferlist tbl;
      osdm.encode(tbl, f | CEPH_FEATURE_RESERVED);
      // decode osdmap to compare crcs with what's expected by incremental
      OSDMap tosdm;
      tosdm.decode(tbl);

      if (tosdm.get_crc() != inc.full_crc) {
	derr << __func__
	     << " osdmap crc mismatch! (osdmap crc " << tosdm.get_crc()
	     << ", expected " << inc.full_crc << ")" << dendl;
	ceph_abort_msg("osdmap crc mismatch");
      }
    }

    // note: we cannot add the recently computed map to the cache, as is,
    // because we have not encoded the map into a bl.
  }

  if (!encode_features) {
    dout(10) << __func__
	     << " last incremental map didn't have features;"
	     << " defaulting to quorum's or all" << dendl;
    encode_features =
      (mon->quorum_con_features ? mon->quorum_con_features : -1);
  }
  osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED);

  return 0;
}
4339
7c673cae
FG
// Convenience wrapper: fetch the full map for |ver| encoded with the
// current quorum's connection features.
int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
{
  return get_version_full(ver, mon->get_quorum_con_features(), bl);
}
4344
// Fetch the full map for |ver|, rebuilding it from a pinned map plus
// incrementals if its encoding was trimmed, and re-encoding for |features|
// when they differ significantly from the quorum's.  Cached per feature
// mask.
int OSDMonitor::get_version_full(version_t ver, uint64_t features,
				 bufferlist& bl)
{
  uint64_t significant_features = OSDMap::get_significant_features(features);
  if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
    return 0;
  }
  int ret = PaxosService::get_version_full(ver, bl);
  if (ret == -ENOENT) {
    // build map?
    ret = get_full_from_pinned_map(ver, bl);
  }
  if (ret < 0) {
    return ret;
  }
  // NOTE: this check is imprecise; the OSDMap encoding features may
  // be a subset of the latest mon quorum features, but worst case we
  // reencode once and then cache the (identical) result under both
  // feature masks.
  if (significant_features !=
      OSDMap::get_significant_features(mon->get_quorum_con_features())) {
    reencode_full_map(bl, features);
  }
  full_osd_cache.add_bytes({ver, significant_features}, bl);
  return 0;
}
4371
11fdf7f2
TL
4372epoch_t OSDMonitor::blacklist(const entity_addrvec_t& av, utime_t until)
4373{
4374 dout(10) << "blacklist " << av << " until " << until << dendl;
4375 for (auto a : av.v) {
4376 if (osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS) {
4377 a.set_type(entity_addr_t::TYPE_ANY);
4378 } else {
4379 a.set_type(entity_addr_t::TYPE_LEGACY);
4380 }
4381 pending_inc.new_blacklist[a] = until;
4382 }
4383 return pending_inc.epoch;
4384}
4385
4386epoch_t OSDMonitor::blacklist(entity_addr_t a, utime_t until)
7c673cae 4387{
11fdf7f2
TL
4388 if (osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS) {
4389 a.set_type(entity_addr_t::TYPE_ANY);
4390 } else {
4391 a.set_type(entity_addr_t::TYPE_LEGACY);
4392 }
7c673cae
FG
4393 dout(10) << "blacklist " << a << " until " << until << dendl;
4394 pending_inc.new_blacklist[a] = until;
4395 return pending_inc.epoch;
4396}
4397
4398
4399void OSDMonitor::check_osdmap_subs()
4400{
4401 dout(10) << __func__ << dendl;
4402 if (!osdmap.get_epoch()) {
4403 return;
4404 }
4405 auto osdmap_subs = mon->session_map.subs.find("osdmap");
4406 if (osdmap_subs == mon->session_map.subs.end()) {
4407 return;
4408 }
4409 auto p = osdmap_subs->second->begin();
4410 while (!p.end()) {
4411 auto sub = *p;
4412 ++p;
4413 check_osdmap_sub(sub);
4414 }
4415}
4416
// Service one "osdmap" subscription: send incrementals from sub->next (or
// the latest full map when next == 0), then either drop a onetime sub or
// advance it past the current epoch.
void OSDMonitor::check_osdmap_sub(Subscription *sub)
{
  dout(10) << __func__ << " " << sub << " next " << sub->next
	   << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
  if (sub->next <= osdmap.get_epoch()) {
    if (sub->next >= 1)
      send_incremental(sub->next, sub->session, sub->incremental_onetime);
    else
      sub->session->con->send_message(build_latest_full(sub->session->con_features));
    if (sub->onetime)
      mon->session_map.remove_sub(sub);
    else
      sub->next = osdmap.get_epoch() + 1;
  }
}
4432
4433void OSDMonitor::check_pg_creates_subs()
4434{
7c673cae
FG
4435 if (!osdmap.get_num_up_osds()) {
4436 return;
4437 }
11fdf7f2 4438 ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
7c673cae
FG
4439 mon->with_session_map([this](const MonSessionMap& session_map) {
4440 auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
4441 if (pg_creates_subs == session_map.subs.end()) {
4442 return;
4443 }
4444 for (auto sub : *pg_creates_subs->second) {
4445 check_pg_creates_sub(sub);
4446 }
4447 });
4448}
4449
// Service one "osd_pg_creates" subscription by sending the osd any pending
// pg-create messages.  Down osds are skipped; they are re-checked when
// they come up.
void OSDMonitor::check_pg_creates_sub(Subscription *sub)
{
  dout(20) << __func__ << " .. " << sub->session->name << dendl;
  ceph_assert(sub->type == "osd_pg_creates");
  // only send these if the OSD is up. we will check_subs() when they do
  // come up so they will get the creates then.
  if (sub->session->name.is_osd() &&
      mon->osdmon()->osdmap.is_up(sub->session->name.num())) {
    sub->next = send_pg_creates(sub->session->name.num(),
				sub->session->con.get(),
				sub->next);
  }
}
4463
// Stage enabling application |app_name| on pool |pool_id| in the pending
// incremental, optionally attaching one app-metadata key/value pair.
// Requires a plugged paxos, a writeable service, and a luminous+ cluster.
void OSDMonitor::do_application_enable(int64_t pool_id,
				       const std::string &app_name,
				       const std::string &app_key,
				       const std::string &app_value)
{
  ceph_assert(paxos->is_plugged() && is_writeable());

  dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
	   << dendl;

  ceph_assert(osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS);

  auto pp = osdmap.get_pg_pool(pool_id);
  ceph_assert(pp != nullptr);

  // start from an already-pending copy of the pool, if any, so we don't
  // clobber other staged changes
  pg_pool_t p = *pp;
  if (pending_inc.new_pools.count(pool_id)) {
    p = pending_inc.new_pools[pool_id];
  }

  if (app_key.empty()) {
    p.application_metadata.insert({app_name, {}});
  } else {
    p.application_metadata.insert({app_name, {{app_key, app_value}}});
  }
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool_id] = p;
}
4492
494da23a
TL
4493void OSDMonitor::do_set_pool_opt(int64_t pool_id,
4494 pool_opts_t::key_t opt,
4495 pool_opts_t::value_t val)
4496{
4497 auto p = pending_inc.new_pools.try_emplace(
4498 pool_id, *osdmap.get_pg_pool(pool_id));
4499 p.first->second.opts.set(opt, val);
4500}
4501
// Scan |pools| for pools whose pgs still need to be created and queue them
// on |creating_pgs|.  Skips pools already created, pools whose crush rule
// cannot be resolved, pools unchanged since the last scan, and pools being
// removed.  Returns the number of pools queued.
unsigned OSDMonitor::scan_for_creating_pgs(
  const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
  const mempool::osdmap::set<int64_t>& removed_pools,
  utime_t modified,
  creating_pgs_t* creating_pgs) const
{
  unsigned queued = 0;
  for (auto& p : pools) {
    int64_t poolid = p.first;
    if (creating_pgs->created_pools.count(poolid)) {
      dout(10) << __func__ << " already created " << poolid << dendl;
      continue;
    }
    const pg_pool_t& pool = p.second;
    int ruleno = osdmap.crush->find_rule(pool.get_crush_rule(),
					 pool.get_type(), pool.get_size());
    // a pool whose crush rule cannot map is not creatable yet
    if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
      continue;

    const auto last_scan_epoch = creating_pgs->last_scan_epoch;
    const auto created = pool.get_last_change();
    if (last_scan_epoch && created <= last_scan_epoch) {
      dout(10) << __func__ << " no change in pool " << poolid
	       << " " << pool << dendl;
      continue;
    }
    if (removed_pools.count(poolid)) {
      dout(10) << __func__ << " pool is being removed: " << poolid
	       << " " << pool << dendl;
      continue;
    }
    dout(10) << __func__ << " queueing pool create for " << poolid
	     << " " << pool << dendl;
    creating_pgs->create_pool(poolid, pool.get_pg_num(),
			      created, modified);
    queued++;
  }
  return queued;
}
4541
// Rebuild creating_pgs_by_osd_epoch from the current creating_pgs set: for
// each still-existing pg, determine its acting primary and the epoch to tag
// the create message with (kept stable while the primary is unchanged, else
// bumped to the current mapping epoch).
void OSDMonitor::update_creating_pgs()
{
  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
	   << creating_pgs.queue.size() << " pools in queue" << dendl;
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (const auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    if (!osdmap.pg_exists(pgid)) {
      dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
	       << dendl;
      continue;
    }
    auto mapped = pg.second.first;
    dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
    spg_t spgid(pgid);
    mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
	if (pgs.second.count(spgid)) {
	  if (last_acting_primary == acting_primary) {
	    // same target: keep the original epoch so the osd can dedup
	    mapped = pgs.first;
	  } else {
	    dout(20) << __func__ << " " << pgid << " "
		     << " acting_primary:" << last_acting_primary
		     << " -> " << acting_primary << dendl;
	    // note epoch if the target of the create message changed.
	    mapped = mapping.get_epoch();
	  }
	  break;
	} else {
	  // newly creating
	  mapped = mapping.get_epoch();
	}
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
	     << " to create " << pgid << "@" << mapped << dendl;
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid);
  }
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
4589
// Send osd.|osd| its pending pg-create messages for epochs >= |next| over
// |con|, using the legacy MOSDPGCreate for pre-nautilus clusters and
// MOSDPGCreate2 otherwise.  Returns the epoch the subscription is now
// current through (last sent + 1), or |next| unchanged when nothing was
// sent.
epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
{
  dout(30) << __func__ << " osd." << osd << " next=" << next
	   << " " << creating_pgs_by_osd_epoch << dendl;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
    dout(20) << __func__
	     << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
    // the subscribers will be updated when the mapping is completed anyway
    return next;
  }
  auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
  if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
    return next;
  ceph_assert(!creating_pgs_by_epoch->second.empty());

  MOSDPGCreate *oldm = nullptr; // for pre-mimic OSD compat
  MOSDPGCreate2 *m = nullptr;

  bool old = osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS;

  epoch_t last = 0;
  for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
       epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
    auto epoch = epoch_pgs->first;
    auto& pgs = epoch_pgs->second;
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
    last = epoch;
    for (auto& pg : pgs) {
      // Need the create time from the monitor using its clock to set
      // last_scrub_stamp upon pg creation.
      auto create = creating_pgs.pgs.find(pg.pgid);
      ceph_assert(create != creating_pgs.pgs.end());
      if (old) {
	// legacy path: one message carrying pg_create_t entries
	if (!oldm) {
	  oldm = new MOSDPGCreate(creating_pgs_epoch);
	}
	oldm->mkpg.emplace(pg.pgid,
			   pg_create_t{create->second.first, pg.pgid, 0});
	oldm->ctimes.emplace(pg.pgid, create->second.second);
      } else {
	if (!m) {
	  m = new MOSDPGCreate2(creating_pgs_epoch);
	}
	m->pgs.emplace(pg, create->second);
      }
      dout(20) << __func__ << " will create " << pg
	       << " at " << create->second.first << dendl;
    }
  }
  if (m) {
    con->send_message(m);
  } else if (oldm) {
    con->send_message(oldm);
  } else {
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " has nothing to send" << dendl;
    return next;
  }

  // sub is current through last + 1
  return last + 1;
}
4654
4655// TICK
4656
4657
/**
 * Periodic leader-side housekeeping for the OSD map.
 *
 * Work performed each tick:
 *  - refresh the osdmap manifest (done regardless of leadership)
 *  - mark down osds whose beacons timed out (handle_osd_timeouts)
 *  - process accumulated failure reports (check_failures)
 *  - trigger a proposal when osdmap pruning is due (pruning itself
 *    happens in encode_pending())
 *  - auto-mark long-down osds "out", with a laggy-based grace extension
 *    and a whole-subtree-down exemption
 *  - expire stale blacklist entries and prune purged snaps
 *  - propose the accumulated pending changes, if any
 *  - when memory autotuning is on, rebalance the inc/full osdmap caches
 */
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // always update osdmap manifest, regardless of being the leader.
  load_osdmap_manifest();

  // everything below mutates pending state; only the leader may do that.
  if (!mon->is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  // mark osds down if their beacons have timed out
  if (handle_osd_timeouts(now, last_osd_report)) {
    do_propose = true;
  }

  // mark osds down?
  if (check_failures(now)) {
    do_propose = true;
  }

  // Force a proposal if we need to prune; pruning is performed on
  // ``encode_pending()``, hence why we need to regularly trigger a proposal
  // even if there's nothing going on.
  if (is_prune_enabled() && should_prune()) {
    do_propose = true;
  }

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    string down_out_subtree_limit = g_conf().get_val<string>(
      "mon_osd_down_out_subtree_limit");
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      utime_t down = now;
      down -= i->second;  // how long osd o has been pending "out"
      ++i;                // advance first: we may erase 'o' from the map below

      if (osdmap.is_down(o) &&
          osdmap.is_in(o) &&
          can_mark_out(o)) {
        utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
        utime_t grace = orig_grace;
        double my_grace = 0.0;

        if (g_conf()->mon_osd_adjust_down_out_interval) {
          // scale grace period the same way we do the heartbeat grace.
          const osd_xinfo_t& xi = osdmap.get_xinfo(o);
          double halflife = (double)g_conf()->mon_osd_laggy_halflife;
          double decay_k = ::log(.5) / halflife;
          double decay = exp((double)down * decay_k);
          dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
                   << " down for " << down << " decay " << decay << dendl;
          my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
          grace += my_grace;
        }

        // is this an entire large subtree down?
        if (down_out_subtree_limit.length()) {
          int type = osdmap.crush->get_type_id(down_out_subtree_limit);
          if (type > 0) {
            if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
              dout(10) << "tick entire containing " << down_out_subtree_limit
                       << " subtree for osd." << o
                       << " is down; resetting timer" << dendl;
              // reset timer, too.
              down_pending_out[o] = now;
              continue;
            }
          }
        }

        // normal osds go out after the (possibly extended) grace period;
        // destroyed osds use their own, simpler interval.
        bool down_out = !osdmap.is_destroyed(o) &&
          g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace;
        bool destroyed_out = osdmap.is_destroyed(o) &&
          g_conf()->mon_osd_destroyed_out_interval > 0 &&
          // this is not precise enough as we did not make a note when this osd
          // was marked as destroyed, but let's not bother with that
          // complexity for now.
          down.sec() >= g_conf()->mon_osd_destroyed_out_interval;
        if (down_out || destroyed_out) {
          dout(10) << "tick marking osd." << o << " OUT after " << down
                   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
          pending_inc.new_weight[o] = CEPH_OSD_OUT;

          // set the AUTOOUT bit.
          if (pending_inc.new_state.count(o) == 0)
            pending_inc.new_state[o] = 0;
          pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

          // remember previous weight
          if (pending_inc.new_xinfo.count(o) == 0)
            pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
          pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

          do_propose = true;

          mon->clog->info() << "Marking osd." << o << " out (has been down for "
                            << int(down.sec()) << " seconds)";
        } else
          continue;  // still within grace; keep the pending-out timer running
      }

      // osd was marked out (or is no longer down+in); stop tracking it.
      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blacklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
       p != osdmap.blacklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blacklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blacklist.push_back(p->first);
      do_propose = true;
    }
  }

  if (try_prune_purged_snaps()) {
    do_propose = true;
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty())  // also propose if we adjusted pg_temp
    propose_pending();

  {
    // rebalance osdmap cache memory under the balancer lock
    std::lock_guard l(balancer_lock);
    if (ceph_using_tcmalloc() && mon_memory_autotune && pcm != nullptr) {
      pcm->tune_memory();
      pcm->balance();
      _set_new_cache_sizes();
      dout(10) << "tick balancer "
               << " inc cache_bytes: " << inc_cache->get_cache_bytes()
               << " inc comtd_bytes: " << inc_cache->get_committed_size()
               << " inc used_bytes: " << inc_cache->_get_used_bytes()
               << " inc num_osdmaps: " << inc_cache->_get_num_osdmaps()
               << dendl;
      dout(10) << "tick balancer "
               << " full cache_bytes: " << full_cache->get_cache_bytes()
               << " full comtd_bytes: " << full_cache->get_committed_size()
               << " full used_bytes: " << full_cache->_get_used_bytes()
               << " full num_osdmaps: " << full_cache->_get_num_osdmaps()
               << dendl;
    }
  }
}
4821
4822void OSDMonitor::_set_new_cache_sizes()
4823{
4824 uint64_t cache_size = 0;
4825 int64_t inc_alloc = 0;
4826 int64_t full_alloc = 0;
4827 int64_t kv_alloc = 0;
4828
4829 if (pcm != nullptr && rocksdb_binned_kv_cache != nullptr) {
4830 cache_size = pcm->get_tuned_mem();
4831 inc_alloc = inc_cache->get_committed_size();
4832 full_alloc = full_cache->get_committed_size();
4833 kv_alloc = rocksdb_binned_kv_cache->get_committed_size();
4834 }
4835
4836 inc_osd_cache.set_bytes(inc_alloc);
4837 full_osd_cache.set_bytes(full_alloc);
4838
4839 dout(10) << __func__ << " cache_size:" << cache_size
4840 << " inc_alloc: " << inc_alloc
4841 << " full_alloc: " << full_alloc
4842 << " kv_alloc: " << kv_alloc
4843 << dendl;
7c673cae
FG
4844}
4845
/**
 * Mark down any up osd whose most recent report is older than
 * mon_osd_report_timeout seconds.
 *
 * @param now current monitor time
 * @param last_osd_report per-osd timestamp of the last report; entries
 *        are created here when an up osd is first seen and erased for
 *        osds that no longer exist
 * @return true if at least one osd was newly marked down (so the caller
 *         should propose the pending map change)
 */
bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
				     std::map<int,utime_t> &last_osd_report)
{
  utime_t timeo(g_conf()->mon_osd_report_timeout, 0);
  if (now - mon->get_leader_since() < timeo) {
    // We haven't been the leader for long enough to consider OSD timeouts
    return false;
  }

  int max_osd = osdmap.get_max_osd();
  bool new_down = false;

  for (int i=0; i < max_osd; ++i) {
    dout(30) << __func__ << ": checking up on osd " << i << dendl;
    if (!osdmap.exists(i)) {
      last_osd_report.erase(i); // if any
      continue;
    }
    // only up osds are expected to report
    if (!osdmap.is_up(i))
      continue;
    const std::map<int,utime_t>::const_iterator t = last_osd_report.find(i);
    if (t == last_osd_report.end()) {
      // it wasn't in the map; start the timer.
      last_osd_report[i] = now;
    } else if (can_mark_down(i)) {
      utime_t diff = now - t->second;
      if (diff > timeo) {
	mon->clog->info() << "osd." << i << " marked down after no beacon for "
			  << diff << " seconds";
	derr << "no beacon from osd." << i << " since " << t->second
	     << ", " << diff << " seconds ago. marking down" << dendl;
	// NOTE(review): new_state appears to be applied as an XOR of the
	// osd's state bits, so setting CEPH_OSD_UP here flips the up bit
	// and marks the osd down (consistent with the log messages above)
	// -- confirm against OSDMap::Incremental semantics.
	pending_inc.new_state[i] = CEPH_OSD_UP;
	new_down = true;
      }
    }
  }
  return new_down;
}
4884
11fdf7f2
TL
4885static void dump_cpu_list(Formatter *f, const char *name,
4886 const string& strlist)
7c673cae 4887{
11fdf7f2
TL
4888 cpu_set_t cpu_set;
4889 size_t cpu_set_size;
4890 if (parse_cpu_set_list(strlist.c_str(), &cpu_set_size, &cpu_set) < 0) {
4891 return;
4892 }
4893 set<int> cpus = cpu_set_to_set(cpu_set_size, &cpu_set);
4894 f->open_array_section(name);
4895 for (auto cpu : cpus) {
4896 f->dump_int("cpu", cpu);
7c673cae 4897 }
11fdf7f2 4898 f->close_section();
7c673cae
FG
4899}
4900
4901void OSDMonitor::dump_info(Formatter *f)
4902{
4903 f->open_object_section("osdmap");
4904 osdmap.dump(f);
4905 f->close_section();
4906
4907 f->open_array_section("osd_metadata");
4908 for (int i=0; i<osdmap.get_max_osd(); ++i) {
4909 if (osdmap.exists(i)) {
4910 f->open_object_section("osd");
4911 f->dump_unsigned("id", i);
4912 dump_osd_metadata(i, f, NULL);
4913 f->close_section();
4914 }
4915 }
4916 f->close_section();
4917
4918 f->dump_unsigned("osdmap_first_committed", get_first_committed());
4919 f->dump_unsigned("osdmap_last_committed", get_last_committed());
4920
4921 f->open_object_section("crushmap");
4922 osdmap.crush->dump(f);
4923 f->close_section();
11fdf7f2
TL
4924
4925 if (has_osdmap_manifest) {
4926 f->open_object_section("osdmap_manifest");
4927 osdmap_manifest.dump(f);
4928 f->close_section();
4929 }
7c673cae
FG
4930}
4931
namespace {
  // Every pool property that "osd pool get" can report.  The enumerator
  // names (and their order) are relied upon elsewhere in this file, so
  // they must not change.
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
    PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
    PG_AUTOSCALE_BIAS };

  // Return the elements of 'first' that do not appear in 'second'
  // (plain set difference: first \ second).
  std::set<osd_pool_get_choices>
  subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			     const std::set<osd_pool_get_choices>& second)
  {
    std::set<osd_pool_get_choices> remaining;
    for (const auto& choice : first) {
      if (second.count(choice) == 0) {
	remaining.insert(choice);
      }
    }
    return remaining;
  }
}
4965
4966
4967bool OSDMonitor::preprocess_command(MonOpRequestRef op)
4968{
4969 op->mark_osdmon_event(__func__);
4970 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
4971 int r = 0;
4972 bufferlist rdata;
4973 stringstream ss, ds;
4974
11fdf7f2 4975 cmdmap_t cmdmap;
7c673cae
FG
4976 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
4977 string rs = ss.str();
4978 mon->reply_command(op, -EINVAL, rs, get_last_committed());
4979 return true;
4980 }
4981
11fdf7f2 4982 MonSession *session = op->get_session();
7c673cae 4983 if (!session) {
11fdf7f2 4984 derr << __func__ << " no session" << dendl;
7c673cae
FG
4985 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
4986 return true;
4987 }
4988
4989 string prefix;
11fdf7f2 4990 cmd_getval(cct, cmdmap, "prefix", prefix);
7c673cae
FG
4991
4992 string format;
11fdf7f2 4993 cmd_getval(cct, cmdmap, "format", format, string("plain"));
7c673cae
FG
4994 boost::scoped_ptr<Formatter> f(Formatter::create(format));
4995
4996 if (prefix == "osd stat") {
11fdf7f2 4997 osdmap.print_summary(f.get(), ds, "", true);
7c673cae
FG
4998 if (f)
4999 f->flush(rdata);
5000 else
5001 rdata.append(ds);
5002 }
7c673cae
FG
5003 else if (prefix == "osd dump" ||
5004 prefix == "osd tree" ||
11fdf7f2 5005 prefix == "osd tree-from" ||
7c673cae
FG
5006 prefix == "osd ls" ||
5007 prefix == "osd getmap" ||
31f18b77
FG
5008 prefix == "osd getcrushmap" ||
5009 prefix == "osd ls-tree") {
7c673cae
FG
5010 string val;
5011
5012 epoch_t epoch = 0;
5013 int64_t epochnum;
11fdf7f2 5014 cmd_getval(cct, cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
7c673cae
FG
5015 epoch = epochnum;
5016
5017 bufferlist osdmap_bl;
5018 int err = get_version_full(epoch, osdmap_bl);
5019 if (err == -ENOENT) {
5020 r = -ENOENT;
5021 ss << "there is no map for epoch " << epoch;
5022 goto reply;
5023 }
11fdf7f2
TL
5024 ceph_assert(err == 0);
5025 ceph_assert(osdmap_bl.length());
7c673cae
FG
5026
5027 OSDMap *p;
5028 if (epoch == osdmap.get_epoch()) {
5029 p = &osdmap;
5030 } else {
5031 p = new OSDMap;
5032 p->decode(osdmap_bl);
5033 }
5034
224ce89b
WB
5035 auto sg = make_scope_guard([&] {
5036 if (p != &osdmap) {
5037 delete p;
5038 }
5039 });
5040
7c673cae
FG
5041 if (prefix == "osd dump") {
5042 stringstream ds;
5043 if (f) {
5044 f->open_object_section("osdmap");
5045 p->dump(f.get());
5046 f->close_section();
5047 f->flush(ds);
5048 } else {
5049 p->print(ds);
5050 }
5051 rdata.append(ds);
5052 if (!f)
5053 ds << " ";
5054 } else if (prefix == "osd ls") {
5055 if (f) {
5056 f->open_array_section("osds");
5057 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5058 if (osdmap.exists(i)) {
5059 f->dump_int("osd", i);
5060 }
5061 }
5062 f->close_section();
5063 f->flush(ds);
5064 } else {
5065 bool first = true;
5066 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5067 if (osdmap.exists(i)) {
5068 if (!first)
5069 ds << "\n";
5070 first = false;
5071 ds << i;
5072 }
5073 }
5074 }
5075 rdata.append(ds);
11fdf7f2
TL
5076 } else if (prefix == "osd tree" || prefix == "osd tree-from") {
5077 string bucket;
5078 if (prefix == "osd tree-from") {
5079 cmd_getval(cct, cmdmap, "bucket", bucket);
5080 if (!osdmap.crush->name_exists(bucket)) {
5081 ss << "bucket '" << bucket << "' does not exist";
5082 r = -ENOENT;
5083 goto reply;
5084 }
5085 int id = osdmap.crush->get_item_id(bucket);
5086 if (id >= 0) {
5087 ss << "\"" << bucket << "\" is not a bucket";
5088 r = -EINVAL;
5089 goto reply;
5090 }
5091 }
5092
31f18b77 5093 vector<string> states;
11fdf7f2 5094 cmd_getval(cct, cmdmap, "states", states);
31f18b77
FG
5095 unsigned filter = 0;
5096 for (auto& s : states) {
5097 if (s == "up") {
5098 filter |= OSDMap::DUMP_UP;
5099 } else if (s == "down") {
5100 filter |= OSDMap::DUMP_DOWN;
5101 } else if (s == "in") {
5102 filter |= OSDMap::DUMP_IN;
5103 } else if (s == "out") {
5104 filter |= OSDMap::DUMP_OUT;
c07f9fc5
FG
5105 } else if (s == "destroyed") {
5106 filter |= OSDMap::DUMP_DESTROYED;
31f18b77
FG
5107 } else {
5108 ss << "unrecognized state '" << s << "'";
5109 r = -EINVAL;
5110 goto reply;
5111 }
5112 }
5113 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
c07f9fc5
FG
5114 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
5115 ss << "cannot specify both 'in' and 'out'";
5116 r = -EINVAL;
5117 goto reply;
5118 }
5119 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
5120 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
5121 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
5122 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
5123 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
5124 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
5125 ss << "can specify only one of 'up', 'down' and 'destroyed'";
31f18b77
FG
5126 r = -EINVAL;
5127 goto reply;
5128 }
7c673cae
FG
5129 if (f) {
5130 f->open_object_section("tree");
11fdf7f2 5131 p->print_tree(f.get(), NULL, filter, bucket);
7c673cae
FG
5132 f->close_section();
5133 f->flush(ds);
5134 } else {
11fdf7f2 5135 p->print_tree(NULL, &ds, filter, bucket);
7c673cae
FG
5136 }
5137 rdata.append(ds);
5138 } else if (prefix == "osd getmap") {
5139 rdata.append(osdmap_bl);
5140 ss << "got osdmap epoch " << p->get_epoch();
5141 } else if (prefix == "osd getcrushmap") {
5142 p->crush->encode(rdata, mon->get_quorum_con_features());
31f18b77
FG
5143 ss << p->get_crush_version();
5144 } else if (prefix == "osd ls-tree") {
5145 string bucket_name;
11fdf7f2 5146 cmd_getval(cct, cmdmap, "name", bucket_name);
31f18b77
FG
5147 set<int> osds;
5148 r = p->get_osds_by_bucket_name(bucket_name, &osds);
5149 if (r == -ENOENT) {
5150 ss << "\"" << bucket_name << "\" does not exist";
5151 goto reply;
5152 } else if (r < 0) {
5153 ss << "can not parse bucket name:\"" << bucket_name << "\"";
5154 goto reply;
5155 }
5156
5157 if (f) {
5158 f->open_array_section("osds");
5159 for (auto &i : osds) {
5160 if (osdmap.exists(i)) {
5161 f->dump_int("osd", i);
5162 }
5163 }
5164 f->close_section();
5165 f->flush(ds);
5166 } else {
5167 bool first = true;
5168 for (auto &i : osds) {
5169 if (osdmap.exists(i)) {
5170 if (!first)
5171 ds << "\n";
5172 first = false;
5173 ds << i;
5174 }
5175 }
5176 }
5177
5178 rdata.append(ds);
7c673cae 5179 }
7c673cae
FG
5180 } else if (prefix == "osd getmaxosd") {
5181 if (f) {
5182 f->open_object_section("getmaxosd");
5183 f->dump_unsigned("epoch", osdmap.get_epoch());
5184 f->dump_int("max_osd", osdmap.get_max_osd());
5185 f->close_section();
5186 f->flush(rdata);
5187 } else {
5188 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
5189 rdata.append(ds);
5190 }
5191 } else if (prefix == "osd utilization") {
5192 string out;
5193 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
5194 if (f)
5195 f->flush(rdata);
5196 else
5197 rdata.append(out);
5198 r = 0;
5199 goto reply;
5200 } else if (prefix == "osd find") {
5201 int64_t osd;
11fdf7f2 5202 if (!cmd_getval(cct, cmdmap, "id", osd)) {
7c673cae
FG
5203 ss << "unable to parse osd id value '"
5204 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5205 r = -EINVAL;
5206 goto reply;
5207 }
5208 if (!osdmap.exists(osd)) {
5209 ss << "osd." << osd << " does not exist";
5210 r = -ENOENT;
5211 goto reply;
5212 }
5213 string format;
11fdf7f2 5214 cmd_getval(cct, cmdmap, "format", format);
7c673cae
FG
5215 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5216 f->open_object_section("osd_location");
5217 f->dump_int("osd", osd);
11fdf7f2 5218 f->dump_object("addrs", osdmap.get_addrs(osd));
f64942e4 5219 f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
11fdf7f2
TL
5220
5221 // try to identify host, pod/container name, etc.
5222 map<string,string> m;
5223 load_metadata(osd, m, nullptr);
5224 if (auto p = m.find("hostname"); p != m.end()) {
5225 f->dump_string("host", p->second);
5226 }
5227 for (auto& k : {
5228 "pod_name", "pod_namespace", // set by rook
5229 "container_name" // set by ceph-ansible
5230 }) {
5231 if (auto p = m.find(k); p != m.end()) {
5232 f->dump_string(k, p->second);
5233 }
5234 }
5235
5236 // crush is helpful too
7c673cae
FG
5237 f->open_object_section("crush_location");
5238 map<string,string> loc = osdmap.crush->get_full_location(osd);
5239 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
5240 f->dump_string(p->first.c_str(), p->second);
5241 f->close_section();
5242 f->close_section();
5243 f->flush(rdata);
5244 } else if (prefix == "osd metadata") {
5245 int64_t osd = -1;
5246 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
11fdf7f2 5247 !cmd_getval(cct, cmdmap, "id", osd)) {
7c673cae
FG
5248 ss << "unable to parse osd id value '"
5249 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5250 r = -EINVAL;
5251 goto reply;
5252 }
5253 if (osd >= 0 && !osdmap.exists(osd)) {
5254 ss << "osd." << osd << " does not exist";
5255 r = -ENOENT;
5256 goto reply;
5257 }
5258 string format;
11fdf7f2 5259 cmd_getval(cct, cmdmap, "format", format);
7c673cae
FG
5260 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5261 if (osd >= 0) {
5262 f->open_object_section("osd_metadata");
5263 f->dump_unsigned("id", osd);
5264 r = dump_osd_metadata(osd, f.get(), &ss);
5265 if (r < 0)
5266 goto reply;
5267 f->close_section();
5268 } else {
5269 r = 0;
5270 f->open_array_section("osd_metadata");
5271 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5272 if (osdmap.exists(i)) {
5273 f->open_object_section("osd");
5274 f->dump_unsigned("id", i);
5275 r = dump_osd_metadata(i, f.get(), NULL);
5276 if (r == -EINVAL || r == -ENOENT) {
5277 // Drop error, continue to get other daemons' metadata
5278 dout(4) << "No metadata for osd." << i << dendl;
5279 r = 0;
5280 } else if (r < 0) {
5281 // Unexpected error
5282 goto reply;
5283 }
5284 f->close_section();
5285 }
5286 }
5287 f->close_section();
5288 }
5289 f->flush(rdata);
31f18b77
FG
5290 } else if (prefix == "osd versions") {
5291 if (!f)
5292 f.reset(Formatter::create("json-pretty"));
5293 count_metadata("ceph_version", f.get());
5294 f->flush(rdata);
5295 r = 0;
5296 } else if (prefix == "osd count-metadata") {
5297 if (!f)
5298 f.reset(Formatter::create("json-pretty"));
5299 string field;
11fdf7f2 5300 cmd_getval(cct, cmdmap, "property", field);
31f18b77
FG
5301 count_metadata(field, f.get());
5302 f->flush(rdata);
5303 r = 0;
11fdf7f2
TL
5304 } else if (prefix == "osd numa-status") {
5305 TextTable tbl;
5306 if (f) {
5307 f->open_array_section("osds");
5308 } else {
5309 tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT);
5310 tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT);
5311 tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT);
5312 tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT);
5313 tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT);
5314 tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT);
5315 }
5316 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5317 if (osdmap.exists(i)) {
5318 map<string,string> m;
5319 ostringstream err;
5320 if (load_metadata(i, m, &err) < 0) {
5321 continue;
5322 }
5323 string host;
5324 auto p = m.find("hostname");
5325 if (p != m.end()) {
5326 host = p->second;
5327 }
5328 if (f) {
5329 f->open_object_section("osd");
5330 f->dump_int("osd", i);
5331 f->dump_string("host", host);
5332 for (auto n : { "network_numa_node", "objectstore_numa_node",
5333 "numa_node" }) {
5334 p = m.find(n);
5335 if (p != m.end()) {
5336 f->dump_int(n, atoi(p->second.c_str()));
5337 }
5338 }
5339 for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) {
5340 p = m.find(n);
5341 if (p != m.end()) {
5342 list<string> ls = get_str_list(p->second, ",");
5343 f->open_array_section(n);
5344 for (auto node : ls) {
5345 f->dump_int("node", atoi(node.c_str()));
5346 }
5347 f->close_section();
5348 }
5349 }
5350 for (auto n : { "numa_node_cpus" }) {
5351 p = m.find(n);
5352 if (p != m.end()) {
5353 dump_cpu_list(f.get(), n, p->second);
5354 }
5355 }
5356 f->close_section();
5357 } else {
5358 tbl << i;
5359 tbl << host;
5360 p = m.find("network_numa_nodes");
5361 if (p != m.end()) {
5362 tbl << p->second;
5363 } else {
5364 tbl << "-";
5365 }
5366 p = m.find("objectstore_numa_nodes");
5367 if (p != m.end()) {
5368 tbl << p->second;
5369 } else {
5370 tbl << "-";
5371 }
5372 p = m.find("numa_node");
5373 auto q = m.find("numa_node_cpus");
5374 if (p != m.end() && q != m.end()) {
5375 tbl << p->second;
5376 tbl << q->second;
5377 } else {
5378 tbl << "-";
5379 tbl << "-";
5380 }
5381 tbl << TextTable::endrow;
5382 }
5383 }
5384 }
5385 if (f) {
5386 f->close_section();
5387 f->flush(rdata);
5388 } else {
5389 rdata.append(stringify(tbl));
5390 }
7c673cae
FG
5391 } else if (prefix == "osd map") {
5392 string poolstr, objstr, namespacestr;
11fdf7f2
TL
5393 cmd_getval(cct, cmdmap, "pool", poolstr);
5394 cmd_getval(cct, cmdmap, "object", objstr);
5395 cmd_getval(cct, cmdmap, "nspace", namespacestr);
7c673cae
FG
5396
5397 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5398 if (pool < 0) {
5399 ss << "pool " << poolstr << " does not exist";
5400 r = -ENOENT;
5401 goto reply;
5402 }
5403 object_locator_t oloc(pool, namespacestr);
5404 object_t oid(objstr);
5405 pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
5406 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5407 vector<int> up, acting;
5408 int up_p, acting_p;
5409 osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
5410
5411 string fullobjname;
5412 if (!namespacestr.empty())
5413 fullobjname = namespacestr + string("/") + oid.name;
5414 else
5415 fullobjname = oid.name;
5416 if (f) {
5417 f->open_object_section("osd_map");
5418 f->dump_unsigned("epoch", osdmap.get_epoch());
5419 f->dump_string("pool", poolstr);
5420 f->dump_int("pool_id", pool);
5421 f->dump_stream("objname") << fullobjname;
5422 f->dump_stream("raw_pgid") << pgid;
5423 f->dump_stream("pgid") << mpgid;
5424 f->open_array_section("up");
5425 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
5426 f->dump_int("osd", *p);
5427 f->close_section();
5428 f->dump_int("up_primary", up_p);
5429 f->open_array_section("acting");
5430 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
5431 f->dump_int("osd", *p);
5432 f->close_section();
5433 f->dump_int("acting_primary", acting_p);
5434 f->close_section(); // osd_map
5435 f->flush(rdata);
5436 } else {
5437 ds << "osdmap e" << osdmap.get_epoch()
5438 << " pool '" << poolstr << "' (" << pool << ")"
5439 << " object '" << fullobjname << "' ->"
5440 << " pg " << pgid << " (" << mpgid << ")"
5441 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
5442 << pg_vector_string(acting) << ", p" << acting_p << ")";
5443 rdata.append(ds);
5444 }
5445
5446 } else if (prefix == "pg map") {
5447 pg_t pgid;
5448 string pgidstr;
11fdf7f2 5449 cmd_getval(cct, cmdmap, "pgid", pgidstr);
7c673cae
FG
5450 if (!pgid.parse(pgidstr.c_str())) {
5451 ss << "invalid pgid '" << pgidstr << "'";
5452 r = -EINVAL;
5453 goto reply;
5454 }
5455 vector<int> up, acting;
5456 if (!osdmap.have_pg_pool(pgid.pool())) {
5457 ss << "pg '" << pgidstr << "' does not exist";
5458 r = -ENOENT;
5459 goto reply;
5460 }
5461 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5462 osdmap.pg_to_up_acting_osds(pgid, up, acting);
5463 if (f) {
5464 f->open_object_section("pg_map");
5465 f->dump_unsigned("epoch", osdmap.get_epoch());
5466 f->dump_stream("raw_pgid") << pgid;
5467 f->dump_stream("pgid") << mpgid;
5468 f->open_array_section("up");
5469 for (auto osd : up) {
5470 f->dump_int("up_osd", osd);
5471 }
5472 f->close_section();
5473 f->open_array_section("acting");
5474 for (auto osd : acting) {
5475 f->dump_int("acting_osd", osd);
5476 }
5477 f->close_section();
5478 f->close_section();
5479 f->flush(rdata);
5480 } else {
5481 ds << "osdmap e" << osdmap.get_epoch()
5482 << " pg " << pgid << " (" << mpgid << ")"
5483 << " -> up " << up << " acting " << acting;
5484 rdata.append(ds);
5485 }
5486 goto reply;
5487
7c673cae 5488 } else if (prefix == "osd lspools") {
7c673cae
FG
5489 if (f)
5490 f->open_array_section("pools");
5491 for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
5492 p != osdmap.pools.end();
5493 ++p) {
11fdf7f2
TL
5494 if (f) {
5495 f->open_object_section("pool");
5496 f->dump_int("poolnum", p->first);
5497 f->dump_string("poolname", osdmap.pool_name[p->first]);
5498 f->close_section();
5499 } else {
5500 ds << p->first << ' ' << osdmap.pool_name[p->first];
5501 if (next(p) != osdmap.pools.end()) {
5502 ds << '\n';
7c673cae
FG
5503 }
5504 }
5505 }
5506 if (f) {
5507 f->close_section();
5508 f->flush(ds);
5509 }
5510 rdata.append(ds);
5511 } else if (prefix == "osd blacklist ls") {
5512 if (f)
5513 f->open_array_section("blacklist");
5514
5515 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
5516 p != osdmap.blacklist.end();
5517 ++p) {
5518 if (f) {
5519 f->open_object_section("entry");
11fdf7f2 5520 f->dump_string("addr", p->first.get_legacy_str());
7c673cae
FG
5521 f->dump_stream("until") << p->second;
5522 f->close_section();
5523 } else {
5524 stringstream ss;
5525 string s;
5526 ss << p->first << " " << p->second;
5527 getline(ss, s);
5528 s += "\n";
5529 rdata.append(s);
5530 }
5531 }
5532 if (f) {
5533 f->close_section();
5534 f->flush(rdata);
5535 }
5536 ss << "listed " << osdmap.blacklist.size() << " entries";
5537
5538 } else if (prefix == "osd pool ls") {
5539 string detail;
11fdf7f2 5540 cmd_getval(cct, cmdmap, "detail", detail);
7c673cae
FG
5541 if (!f && detail == "detail") {
5542 ostringstream ss;
5543 osdmap.print_pools(ss);
5544 rdata.append(ss.str());
5545 } else {
5546 if (f)
5547 f->open_array_section("pools");
5548 for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
5549 it != osdmap.get_pools().end();
5550 ++it) {
5551 if (f) {
5552 if (detail == "detail") {
5553 f->open_object_section("pool");
eafe8130 5554 f->dump_int("pool_id", it->first);
7c673cae
FG
5555 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
5556 it->second.dump(f.get());
5557 f->close_section();
5558 } else {
5559 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
5560 }
5561 } else {
5562 rdata.append(osdmap.get_pool_name(it->first) + "\n");
5563 }
5564 }
5565 if (f) {
5566 f->close_section();
5567 f->flush(rdata);
5568 }
5569 }
5570
5571 } else if (prefix == "osd crush get-tunable") {
5572 string tunable;
11fdf7f2 5573 cmd_getval(cct, cmdmap, "tunable", tunable);
7c673cae
FG
5574 ostringstream rss;
5575 if (f)
5576 f->open_object_section("tunable");
5577 if (tunable == "straw_calc_version") {
5578 if (f)
5579 f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
5580 else
5581 rss << osdmap.crush->get_straw_calc_version() << "\n";
5582 } else {
5583 r = -EINVAL;
5584 goto reply;
5585 }
5586 if (f) {
5587 f->close_section();
5588 f->flush(rdata);
5589 } else {
5590 rdata.append(rss.str());
5591 }
5592 r = 0;
5593
5594 } else if (prefix == "osd pool get") {
5595 string poolstr;
11fdf7f2 5596 cmd_getval(cct, cmdmap, "pool", poolstr);
7c673cae
FG
5597 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5598 if (pool < 0) {
5599 ss << "unrecognized pool '" << poolstr << "'";
5600 r = -ENOENT;
5601 goto reply;
5602 }
5603
5604 const pg_pool_t *p = osdmap.get_pg_pool(pool);
5605 string var;
11fdf7f2 5606 cmd_getval(cct, cmdmap, "var", var);
7c673cae
FG
5607
5608 typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
5609 const choices_map_t ALL_CHOICES = {
5610 {"size", SIZE},
5611 {"min_size", MIN_SIZE},
7c673cae 5612 {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
28e407b8
AA
5613 {"crush_rule", CRUSH_RULE}, {"hashpspool", HASHPSPOOL},
5614 {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
7c673cae
FG
5615 {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
5616 {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
5617 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
5618 {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
5619 {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
5620 {"use_gmt_hitset", USE_GMT_HITSET},
11fdf7f2 5621 {"target_max_objects", TARGET_MAX_OBJECTS},
7c673cae
FG
5622 {"target_max_bytes", TARGET_MAX_BYTES},
5623 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
5624 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
5625 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
5626 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
5627 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
5628 {"erasure_code_profile", ERASURE_CODE_PROFILE},
5629 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
5630 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
5631 {"fast_read", FAST_READ},
5632 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
5633 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
5634 {"scrub_min_interval", SCRUB_MIN_INTERVAL},
5635 {"scrub_max_interval", SCRUB_MAX_INTERVAL},
5636 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
5637 {"recovery_priority", RECOVERY_PRIORITY},
5638 {"recovery_op_priority", RECOVERY_OP_PRIORITY},
5639 {"scrub_priority", SCRUB_PRIORITY},
5640 {"compression_mode", COMPRESSION_MODE},
5641 {"compression_algorithm", COMPRESSION_ALGORITHM},
5642 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
5643 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
5644 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
5645 {"csum_type", CSUM_TYPE},
5646 {"csum_max_block", CSUM_MAX_BLOCK},
5647 {"csum_min_block", CSUM_MIN_BLOCK},
11fdf7f2
TL
5648 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM},
5649 {"pg_autoscale_mode", PG_AUTOSCALE_MODE},
5650 {"pg_num_min", PG_NUM_MIN},
5651 {"target_size_bytes", TARGET_SIZE_BYTES},
5652 {"target_size_ratio", TARGET_SIZE_RATIO},
5653 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS},
7c673cae
FG
5654 };
5655
5656 typedef std::set<osd_pool_get_choices> choices_set_t;
5657
5658 const choices_set_t ONLY_TIER_CHOICES = {
5659 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
5660 TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
5661 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
5662 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
5663 MIN_READ_RECENCY_FOR_PROMOTE,
c07f9fc5 5664 MIN_WRITE_RECENCY_FOR_PROMOTE,
7c673cae
FG
5665 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
5666 };
5667 const choices_set_t ONLY_ERASURE_CHOICES = {
28e407b8 5668 EC_OVERWRITES, ERASURE_CODE_PROFILE
7c673cae
FG
5669 };
5670
5671 choices_set_t selected_choices;
5672 if (var == "all") {
5673 for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
5674 it != ALL_CHOICES.end(); ++it) {
5675 selected_choices.insert(it->second);
5676 }
5677
5678 if(!p->is_tier()) {
5679 selected_choices = subtract_second_from_first(selected_choices,
5680 ONLY_TIER_CHOICES);
5681 }
5682
5683 if(!p->is_erasure()) {
5684 selected_choices = subtract_second_from_first(selected_choices,
5685 ONLY_ERASURE_CHOICES);
5686 }
5687 } else /* var != "all" */ {
5688 choices_map_t::const_iterator found = ALL_CHOICES.find(var);
5689 osd_pool_get_choices selected = found->second;
5690
5691 if (!p->is_tier() &&
5692 ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
5693 ss << "pool '" << poolstr
5694 << "' is not a tier pool: variable not applicable";
5695 r = -EACCES;
5696 goto reply;
5697 }
5698
5699 if (!p->is_erasure() &&
5700 ONLY_ERASURE_CHOICES.find(selected)
5701 != ONLY_ERASURE_CHOICES.end()) {
5702 ss << "pool '" << poolstr
5703 << "' is not a erasure pool: variable not applicable";
5704 r = -EACCES;
5705 goto reply;
5706 }
5707
94b18763
FG
5708 if (pool_opts_t::is_opt_name(var) &&
5709 !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
5710 ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
5711 r = -ENOENT;
5712 goto reply;
5713 }
5714
7c673cae
FG
5715 selected_choices.insert(selected);
5716 }
5717
5718 if (f) {
94b18763
FG
5719 f->open_object_section("pool");
5720 f->dump_string("pool", poolstr);
5721 f->dump_int("pool_id", pool);
7c673cae
FG
5722 for(choices_set_t::const_iterator it = selected_choices.begin();
5723 it != selected_choices.end(); ++it) {
5724 choices_map_t::const_iterator i;
c07f9fc5
FG
5725 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
5726 if (i->second == *it) {
5727 break;
5728 }
5729 }
11fdf7f2 5730 ceph_assert(i != ALL_CHOICES.end());
7c673cae
FG
5731 switch(*it) {
5732 case PG_NUM:
5733 f->dump_int("pg_num", p->get_pg_num());
5734 break;
5735 case PGP_NUM:
5736 f->dump_int("pgp_num", p->get_pgp_num());
5737 break;
7c673cae
FG
5738 case SIZE:
5739 f->dump_int("size", p->get_size());
5740 break;
5741 case MIN_SIZE:
5742 f->dump_int("min_size", p->get_min_size());
5743 break;
7c673cae 5744 case CRUSH_RULE:
31f18b77 5745 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
7c673cae 5746 f->dump_string("crush_rule", osdmap.crush->get_rule_name(
31f18b77 5747 p->get_crush_rule()));
7c673cae 5748 } else {
31f18b77 5749 f->dump_string("crush_rule", stringify(p->get_crush_rule()));
7c673cae
FG
5750 }
5751 break;
28e407b8
AA
5752 case EC_OVERWRITES:
5753 f->dump_bool("allow_ec_overwrites",
5754 p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
5755 break;
11fdf7f2
TL
5756 case PG_AUTOSCALE_MODE:
5757 f->dump_string("pg_autoscale_mode",
5758 pg_pool_t::get_pg_autoscale_mode_name(
5759 p->pg_autoscale_mode));
5760 break;
7c673cae
FG
5761 case HASHPSPOOL:
5762 case NODELETE:
5763 case NOPGCHANGE:
5764 case NOSIZECHANGE:
5765 case WRITE_FADVISE_DONTNEED:
5766 case NOSCRUB:
5767 case NODEEP_SCRUB:
94b18763
FG
5768 f->dump_bool(i->first.c_str(),
5769 p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
7c673cae
FG
5770 break;
5771 case HIT_SET_PERIOD:
5772 f->dump_int("hit_set_period", p->hit_set_period);
5773 break;
5774 case HIT_SET_COUNT:
5775 f->dump_int("hit_set_count", p->hit_set_count);
5776 break;
5777 case HIT_SET_TYPE:
5778 f->dump_string("hit_set_type",
5779 HitSet::get_type_name(p->hit_set_params.get_type()));
5780 break;
5781 case HIT_SET_FPP:
5782 {
5783 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
5784 BloomHitSet::Params *bloomp =
5785 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
5786 f->dump_float("hit_set_fpp", bloomp->get_fpp());
5787 } else if(var != "all") {
5788 f->close_section();
5789 ss << "hit set is not of type Bloom; " <<
5790 "invalid to get a false positive rate!";
5791 r = -EINVAL;
5792 goto reply;
5793 }
5794 }
5795 break;
5796 case USE_GMT_HITSET:
5797 f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
5798 break;
5799 case TARGET_MAX_OBJECTS:
5800 f->dump_unsigned("target_max_objects", p->target_max_objects);
5801 break;
5802 case TARGET_MAX_BYTES:
5803 f->dump_unsigned("target_max_bytes", p->target_max_bytes);
5804 break;
5805 case CACHE_TARGET_DIRTY_RATIO:
5806 f->dump_unsigned("cache_target_dirty_ratio_micro",
5807 p->cache_target_dirty_ratio_micro);
5808 f->dump_float("cache_target_dirty_ratio",
5809 ((float)p->cache_target_dirty_ratio_micro/1000000));
5810 break;
5811 case CACHE_TARGET_DIRTY_HIGH_RATIO:
5812 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
5813 p->cache_target_dirty_high_ratio_micro);
5814 f->dump_float("cache_target_dirty_high_ratio",
5815 ((float)p->cache_target_dirty_high_ratio_micro/1000000));
5816 break;
5817 case CACHE_TARGET_FULL_RATIO:
5818 f->dump_unsigned("cache_target_full_ratio_micro",
5819 p->cache_target_full_ratio_micro);
5820 f->dump_float("cache_target_full_ratio",
5821 ((float)p->cache_target_full_ratio_micro/1000000));
5822 break;
5823 case CACHE_MIN_FLUSH_AGE:
5824 f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
5825 break;
5826 case CACHE_MIN_EVICT_AGE:
5827 f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
5828 break;
5829 case ERASURE_CODE_PROFILE:
5830 f->dump_string("erasure_code_profile", p->erasure_code_profile);
5831 break;
5832 case MIN_READ_RECENCY_FOR_PROMOTE:
5833 f->dump_int("min_read_recency_for_promote",
5834 p->min_read_recency_for_promote);
5835 break;
5836 case MIN_WRITE_RECENCY_FOR_PROMOTE:
5837 f->dump_int("min_write_recency_for_promote",
5838 p->min_write_recency_for_promote);
5839 break;
5840 case FAST_READ:
5841 f->dump_int("fast_read", p->fast_read);
5842 break;
5843 case HIT_SET_GRADE_DECAY_RATE:
5844 f->dump_int("hit_set_grade_decay_rate",
5845 p->hit_set_grade_decay_rate);
5846 break;
5847 case HIT_SET_SEARCH_LAST_N:
5848 f->dump_int("hit_set_search_last_n",
5849 p->hit_set_search_last_n);
5850 break;
5851 case SCRUB_MIN_INTERVAL:
5852 case SCRUB_MAX_INTERVAL:
5853 case DEEP_SCRUB_INTERVAL:
5854 case RECOVERY_PRIORITY:
5855 case RECOVERY_OP_PRIORITY:
5856 case SCRUB_PRIORITY:
5857 case COMPRESSION_MODE:
5858 case COMPRESSION_ALGORITHM:
5859 case COMPRESSION_REQUIRED_RATIO:
5860 case COMPRESSION_MAX_BLOB_SIZE:
5861 case COMPRESSION_MIN_BLOB_SIZE:
5862 case CSUM_TYPE:
5863 case CSUM_MAX_BLOCK:
5864 case CSUM_MIN_BLOCK:
11fdf7f2
TL
5865 case FINGERPRINT_ALGORITHM:
5866 case PG_NUM_MIN:
5867 case TARGET_SIZE_BYTES:
5868 case TARGET_SIZE_RATIO:
5869 case PG_AUTOSCALE_BIAS:
c07f9fc5
FG
5870 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
5871 if (p->opts.is_set(key)) {
c07f9fc5 5872 if(*it == CSUM_TYPE) {
11fdf7f2 5873 int64_t val;
c07f9fc5
FG
5874 p->opts.get(pool_opts_t::CSUM_TYPE, &val);
5875 f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
5876 } else {
5877 p->opts.dump(i->first, f.get());
5878 }
94b18763 5879 }
7c673cae
FG
5880 break;
5881 }
7c673cae 5882 }
94b18763
FG
5883 f->close_section();
5884 f->flush(rdata);
7c673cae
FG
5885 } else /* !f */ {
5886 for(choices_set_t::const_iterator it = selected_choices.begin();
5887 it != selected_choices.end(); ++it) {
5888 choices_map_t::const_iterator i;
5889 switch(*it) {
5890 case PG_NUM:
5891 ss << "pg_num: " << p->get_pg_num() << "\n";
5892 break;
5893 case PGP_NUM:
5894 ss << "pgp_num: " << p->get_pgp_num() << "\n";
5895 break;
7c673cae
FG
5896 case SIZE:
5897 ss << "size: " << p->get_size() << "\n";
5898 break;
5899 case MIN_SIZE:
5900 ss << "min_size: " << p->get_min_size() << "\n";
5901 break;
7c673cae 5902 case CRUSH_RULE:
31f18b77 5903 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
7c673cae 5904 ss << "crush_rule: " << osdmap.crush->get_rule_name(
31f18b77 5905 p->get_crush_rule()) << "\n";
7c673cae 5906 } else {
31f18b77 5907 ss << "crush_rule: " << p->get_crush_rule() << "\n";
7c673cae
FG
5908 }
5909 break;
11fdf7f2
TL
5910 case PG_AUTOSCALE_MODE:
5911 ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
5912 p->pg_autoscale_mode) <<"\n";
5913 break;
7c673cae
FG
5914 case HIT_SET_PERIOD:
5915 ss << "hit_set_period: " << p->hit_set_period << "\n";
5916 break;
5917 case HIT_SET_COUNT:
5918 ss << "hit_set_count: " << p->hit_set_count << "\n";
5919 break;
5920 case HIT_SET_TYPE:
5921 ss << "hit_set_type: " <<
5922 HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
5923 break;
5924 case HIT_SET_FPP:
5925 {
5926 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
5927 BloomHitSet::Params *bloomp =
5928 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
5929 ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
5930 } else if(var != "all") {
5931 ss << "hit set is not of type Bloom; " <<
5932 "invalid to get a false positive rate!";
5933 r = -EINVAL;
5934 goto reply;
5935 }
5936 }
5937 break;
5938 case USE_GMT_HITSET:
5939 ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
5940 break;
5941 case TARGET_MAX_OBJECTS:
5942 ss << "target_max_objects: " << p->target_max_objects << "\n";
5943 break;
5944 case TARGET_MAX_BYTES:
5945 ss << "target_max_bytes: " << p->target_max_bytes << "\n";
5946 break;
5947 case CACHE_TARGET_DIRTY_RATIO:
5948 ss << "cache_target_dirty_ratio: "
5949 << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
5950 break;
5951 case CACHE_TARGET_DIRTY_HIGH_RATIO:
5952 ss << "cache_target_dirty_high_ratio: "
5953 << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
5954 break;
5955 case CACHE_TARGET_FULL_RATIO:
5956 ss << "cache_target_full_ratio: "
5957 << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
5958 break;
5959 case CACHE_MIN_FLUSH_AGE:
5960 ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
5961 break;
5962 case CACHE_MIN_EVICT_AGE:
5963 ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
5964 break;
5965 case ERASURE_CODE_PROFILE:
5966 ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
5967 break;
5968 case MIN_READ_RECENCY_FOR_PROMOTE:
5969 ss << "min_read_recency_for_promote: " <<
5970 p->min_read_recency_for_promote << "\n";
5971 break;
5972 case HIT_SET_GRADE_DECAY_RATE:
5973 ss << "hit_set_grade_decay_rate: " <<
5974 p->hit_set_grade_decay_rate << "\n";
5975 break;
5976 case HIT_SET_SEARCH_LAST_N:
5977 ss << "hit_set_search_last_n: " <<
5978 p->hit_set_search_last_n << "\n";
5979 break;
28e407b8
AA
5980 case EC_OVERWRITES:
5981 ss << "allow_ec_overwrites: " <<
5982 (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
5983 "\n";
5984 break;
7c673cae
FG
5985 case HASHPSPOOL:
5986 case NODELETE:
5987 case NOPGCHANGE:
5988 case NOSIZECHANGE:
5989 case WRITE_FADVISE_DONTNEED:
5990 case NOSCRUB:
5991 case NODEEP_SCRUB:
5992 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
5993 if (i->second == *it)
5994 break;
5995 }
11fdf7f2 5996 ceph_assert(i != ALL_CHOICES.end());
7c673cae
FG
5997 ss << i->first << ": " <<
5998 (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
5999 "true" : "false") << "\n";
6000 break;
6001 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6002 ss << "min_write_recency_for_promote: " <<
6003 p->min_write_recency_for_promote << "\n";
6004 break;
6005 case FAST_READ:
6006 ss << "fast_read: " << p->fast_read << "\n";
6007 break;
6008 case SCRUB_MIN_INTERVAL:
6009 case SCRUB_MAX_INTERVAL:
6010 case DEEP_SCRUB_INTERVAL:
6011 case RECOVERY_PRIORITY:
6012 case RECOVERY_OP_PRIORITY:
6013 case SCRUB_PRIORITY:
6014 case COMPRESSION_MODE:
6015 case COMPRESSION_ALGORITHM:
6016 case COMPRESSION_REQUIRED_RATIO:
6017 case COMPRESSION_MAX_BLOB_SIZE:
6018 case COMPRESSION_MIN_BLOB_SIZE:
6019 case CSUM_TYPE:
6020 case CSUM_MAX_BLOCK:
6021 case CSUM_MIN_BLOCK:
11fdf7f2
TL
6022 case FINGERPRINT_ALGORITHM:
6023 case PG_NUM_MIN:
6024 case TARGET_SIZE_BYTES:
6025 case TARGET_SIZE_RATIO:
6026 case PG_AUTOSCALE_BIAS:
7c673cae
FG
6027 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6028 if (i->second == *it)
6029 break;
6030 }
11fdf7f2 6031 ceph_assert(i != ALL_CHOICES.end());
7c673cae
FG
6032 {
6033 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6034 if (p->opts.is_set(key)) {
6035 if(key == pool_opts_t::CSUM_TYPE) {
11fdf7f2 6036 int64_t val;
7c673cae
FG
6037 p->opts.get(key, &val);
6038 ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
6039 } else {
6040 ss << i->first << ": " << p->opts.get(key) << "\n";
6041 }
6042 }
6043 }
6044 break;
6045 }
6046 rdata.append(ss.str());
6047 ss.str("");
6048 }
6049 }
6050 r = 0;
7c673cae
FG
6051 } else if (prefix == "osd pool get-quota") {
6052 string pool_name;
11fdf7f2 6053 cmd_getval(cct, cmdmap, "pool", pool_name);
7c673cae
FG
6054
6055 int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
6056 if (poolid < 0) {
11fdf7f2 6057 ceph_assert(poolid == -ENOENT);
7c673cae
FG
6058 ss << "unrecognized pool '" << pool_name << "'";
6059 r = -ENOENT;
6060 goto reply;
6061 }
6062 const pg_pool_t *p = osdmap.get_pg_pool(poolid);
6063
6064 if (f) {
6065 f->open_object_section("pool_quotas");
6066 f->dump_string("pool_name", pool_name);
6067 f->dump_unsigned("pool_id", poolid);
6068 f->dump_unsigned("quota_max_objects", p->quota_max_objects);
6069 f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
6070 f->close_section();
6071 f->flush(rdata);
6072 } else {
6073 stringstream rs;
6074 rs << "quotas for pool '" << pool_name << "':\n"
6075 << " max objects: ";
6076 if (p->quota_max_objects == 0)
6077 rs << "N/A";
6078 else
1adf2230 6079 rs << si_u_t(p->quota_max_objects) << " objects";
7c673cae
FG
6080 rs << "\n"
6081 << " max bytes : ";
6082 if (p->quota_max_bytes == 0)
6083 rs << "N/A";
6084 else
1adf2230 6085 rs << byte_u_t(p->quota_max_bytes);
7c673cae
FG
6086 rdata.append(rs.str());
6087 }
6088 rdata.append("\n");
6089 r = 0;
6090 } else if (prefix == "osd crush rule list" ||
6091 prefix == "osd crush rule ls") {
c07f9fc5
FG
6092 if (f) {
6093 f->open_array_section("rules");
6094 osdmap.crush->list_rules(f.get());
6095 f->close_section();
6096 f->flush(rdata);
6097 } else {
6098 ostringstream ss;
6099 osdmap.crush->list_rules(&ss);
6100 rdata.append(ss.str());
6101 }
b5b8bbf5
FG
6102 } else if (prefix == "osd crush rule ls-by-class") {
6103 string class_name;
11fdf7f2 6104 cmd_getval(cct, cmdmap, "class", class_name);
b5b8bbf5
FG
6105 if (class_name.empty()) {
6106 ss << "no class specified";
6107 r = -EINVAL;
6108 goto reply;
6109 }
6110 set<int> rules;
6111 r = osdmap.crush->get_rules_by_class(class_name, &rules);
6112 if (r < 0) {
6113 ss << "failed to get rules by class '" << class_name << "'";
6114 goto reply;
6115 }
6116 if (f) {
6117 f->open_array_section("rules");
6118 for (auto &rule: rules) {
6119 f->dump_string("name", osdmap.crush->get_rule_name(rule));
6120 }
6121 f->close_section();
6122 f->flush(rdata);
6123 } else {
6124 ostringstream rs;
6125 for (auto &rule: rules) {
6126 rs << osdmap.crush->get_rule_name(rule) << "\n";
6127 }
6128 rdata.append(rs.str());
6129 }
7c673cae
FG
6130 } else if (prefix == "osd crush rule dump") {
6131 string name;
11fdf7f2 6132 cmd_getval(cct, cmdmap, "name", name);
7c673cae 6133 string format;
11fdf7f2 6134 cmd_getval(cct, cmdmap, "format", format);
7c673cae
FG
6135 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6136 if (name == "") {
6137 f->open_array_section("rules");
6138 osdmap.crush->dump_rules(f.get());
6139 f->close_section();
6140 } else {
6141 int ruleno = osdmap.crush->get_rule_id(name);
6142 if (ruleno < 0) {
31f18b77 6143 ss << "unknown crush rule '" << name << "'";
7c673cae
FG
6144 r = ruleno;
6145 goto reply;
6146 }
6147 osdmap.crush->dump_rule(ruleno, f.get());
6148 }
6149 ostringstream rs;
6150 f->flush(rs);
6151 rs << "\n";
6152 rdata.append(rs.str());
6153 } else if (prefix == "osd crush dump") {
6154 string format;
11fdf7f2 6155 cmd_getval(cct, cmdmap, "format", format);
7c673cae
FG
6156 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6157 f->open_object_section("crush_map");
6158 osdmap.crush->dump(f.get());
6159 f->close_section();
6160 ostringstream rs;
6161 f->flush(rs);
6162 rs << "\n";
6163 rdata.append(rs.str());
6164 } else if (prefix == "osd crush show-tunables") {
6165 string format;
11fdf7f2 6166 cmd_getval(cct, cmdmap, "format", format);
7c673cae
FG
6167 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6168 f->open_object_section("crush_map_tunables");
6169 osdmap.crush->dump_tunables(f.get());
6170 f->close_section();
6171 ostringstream rs;
6172 f->flush(rs);
6173 rs << "\n";
6174 rdata.append(rs.str());
6175 } else if (prefix == "osd crush tree") {
c07f9fc5 6176 string shadow;
11fdf7f2 6177 cmd_getval(cct, cmdmap, "shadow", shadow);
c07f9fc5
FG
6178 bool show_shadow = shadow == "--show-shadow";
6179 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6180 if (f) {
91327a77 6181 f->open_object_section("crush_tree");
c07f9fc5
FG
6182 osdmap.crush->dump_tree(nullptr,
6183 f.get(),
6184 osdmap.get_pool_names(),
6185 show_shadow);
91327a77 6186 f->close_section();
c07f9fc5
FG
6187 f->flush(rdata);
6188 } else {
6189 ostringstream ss;
6190 osdmap.crush->dump_tree(&ss,
6191 nullptr,
6192 osdmap.get_pool_names(),
6193 show_shadow);
6194 rdata.append(ss.str());
6195 }
d2e6a577
FG
6196 } else if (prefix == "osd crush ls") {
6197 string name;
11fdf7f2 6198 if (!cmd_getval(cct, cmdmap, "node", name)) {
d2e6a577
FG
6199 ss << "no node specified";
6200 r = -EINVAL;
6201 goto reply;
6202 }
6203 if (!osdmap.crush->name_exists(name)) {
6204 ss << "node '" << name << "' does not exist";
6205 r = -ENOENT;
6206 goto reply;
6207 }
6208 int id = osdmap.crush->get_item_id(name);
6209 list<int> result;
6210 if (id >= 0) {
6211 result.push_back(id);
6212 } else {
6213 int num = osdmap.crush->get_bucket_size(id);
6214 for (int i = 0; i < num; ++i) {
6215 result.push_back(osdmap.crush->get_bucket_item(id, i));
6216 }
6217 }
6218 if (f) {
6219 f->open_array_section("items");
6220 for (auto i : result) {
6221 f->dump_string("item", osdmap.crush->get_item_name(i));
6222 }
6223 f->close_section();
6224 f->flush(rdata);
6225 } else {
6226 ostringstream ss;
6227 for (auto i : result) {
6228 ss << osdmap.crush->get_item_name(i) << "\n";
6229 }
6230 rdata.append(ss.str());
6231 }
6232 r = 0;
7c673cae
FG
6233 } else if (prefix == "osd crush class ls") {
6234 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6235 f->open_array_section("crush_classes");
6236 for (auto i : osdmap.crush->class_name)
6237 f->dump_string("class", i.second);
6238 f->close_section();
6239 f->flush(rdata);
224ce89b
WB
6240 } else if (prefix == "osd crush class ls-osd") {
6241 string name;
11fdf7f2 6242 cmd_getval(cct, cmdmap, "class", name);
224ce89b
WB
6243 set<int> osds;
6244 osdmap.crush->get_devices_by_class(name, &osds);
b5b8bbf5
FG
6245 if (f) {
6246 f->open_array_section("osds");
6247 for (auto &osd: osds)
6248 f->dump_int("osd", osd);
6249 f->close_section();
6250 f->flush(rdata);
6251 } else {
6252 bool first = true;
6253 for (auto &osd : osds) {
6254 if (!first)
6255 ds << "\n";
6256 first = false;
6257 ds << osd;
6258 }
6259 rdata.append(ds);
6260 }
11fdf7f2
TL
6261 } else if (prefix == "osd crush get-device-class") {
6262 vector<string> idvec;
6263 cmd_getval(cct, cmdmap, "ids", idvec);
6264 map<int, string> class_by_osd;
6265 for (auto& id : idvec) {
6266 ostringstream ts;
6267 long osd = parse_osd_id(id.c_str(), &ts);
6268 if (osd < 0) {
6269 ss << "unable to parse osd id:'" << id << "'";
6270 r = -EINVAL;
6271 goto reply;
6272 }
6273 auto device_class = osdmap.crush->get_item_class(osd);
6274 if (device_class)
6275 class_by_osd[osd] = device_class;
6276 else
6277 class_by_osd[osd] = ""; // no class
6278 }
6279 if (f) {
6280 f->open_array_section("osd_device_classes");
6281 for (auto& i : class_by_osd) {
6282 f->open_object_section("osd_device_class");
6283 f->dump_int("osd", i.first);
6284 f->dump_string("device_class", i.second);
6285 f->close_section();
6286 }
6287 f->close_section();
6288 f->flush(rdata);
6289 } else {
6290 if (class_by_osd.size() == 1) {
6291 // for single input, make a clean output
6292 ds << class_by_osd.begin()->second;
6293 } else {
6294 // note that we do not group osds by class here
6295 for (auto it = class_by_osd.begin();
6296 it != class_by_osd.end();
6297 it++) {
6298 ds << "osd." << it->first << ' ' << it->second;
6299 if (next(it) != class_by_osd.end())
6300 ds << '\n';
6301 }
6302 }
6303 rdata.append(ds);
6304 }
7c673cae
FG
6305 } else if (prefix == "osd erasure-code-profile ls") {
6306 const auto &profiles = osdmap.get_erasure_code_profiles();
6307 if (f)
6308 f->open_array_section("erasure-code-profiles");
6309 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
6310 if (f)
6311 f->dump_string("profile", i->first.c_str());
6312 else
6313 rdata.append(i->first + "\n");
6314 }
6315 if (f) {
6316 f->close_section();
6317 ostringstream rs;
6318 f->flush(rs);
6319 rs << "\n";
6320 rdata.append(rs.str());
6321 }
c07f9fc5
FG
6322 } else if (prefix == "osd crush weight-set ls") {
6323 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6324 if (f) {
6325 f->open_array_section("weight_sets");
6326 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6327 f->dump_string("pool", "(compat)");
6328 }
6329 for (auto& i : osdmap.crush->choose_args) {
6330 if (i.first >= 0) {
6331 f->dump_string("pool", osdmap.get_pool_name(i.first));
6332 }
6333 }
6334 f->close_section();
6335 f->flush(rdata);
6336 } else {
6337 ostringstream rs;
6338 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6339 rs << "(compat)\n";
6340 }
6341 for (auto& i : osdmap.crush->choose_args) {
6342 if (i.first >= 0) {
6343 rs << osdmap.get_pool_name(i.first) << "\n";
6344 }
6345 }
6346 rdata.append(rs.str());
6347 }
6348 } else if (prefix == "osd crush weight-set dump") {
6349 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6350 "json-pretty"));
6351 osdmap.crush->dump_choose_args(f.get());
6352 f->flush(rdata);
7c673cae
FG
6353 } else if (prefix == "osd erasure-code-profile get") {
6354 string name;
11fdf7f2 6355 cmd_getval(cct, cmdmap, "name", name);
7c673cae
FG
6356 if (!osdmap.has_erasure_code_profile(name)) {
6357 ss << "unknown erasure code profile '" << name << "'";
6358 r = -ENOENT;
6359 goto reply;
6360 }
6361 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
6362 if (f)
6363 f->open_object_section("profile");
6364 for (map<string,string>::const_iterator i = profile.begin();
6365 i != profile.end();
6366 ++i) {
6367 if (f)
6368 f->dump_string(i->first.c_str(), i->second.c_str());
6369 else
6370 rdata.append(i->first + "=" + i->second + "\n");
6371 }
6372 if (f) {
6373 f->close_section();
6374 ostringstream rs;
6375 f->flush(rs);
6376 rs << "\n";
6377 rdata.append(rs.str());
6378 }
181888fb
FG
6379 } else if (prefix == "osd pool application get") {
6380 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6381 "json-pretty"));
6382 string pool_name;
11fdf7f2 6383 cmd_getval(cct, cmdmap, "pool", pool_name);
181888fb 6384 string app;
11fdf7f2 6385 cmd_getval(cct, cmdmap, "app", app);
181888fb 6386 string key;
11fdf7f2 6387 cmd_getval(cct, cmdmap, "key", key);
181888fb
FG
6388
6389 if (pool_name.empty()) {
6390 // all
6391 f->open_object_section("pools");
6392 for (const auto &pool : osdmap.pools) {
6393 std::string name("<unknown>");
6394 const auto &pni = osdmap.pool_name.find(pool.first);
6395 if (pni != osdmap.pool_name.end())
6396 name = pni->second;
6397 f->open_object_section(name.c_str());
6398 for (auto &app_pair : pool.second.application_metadata) {
6399 f->open_object_section(app_pair.first.c_str());
6400 for (auto &kv_pair : app_pair.second) {
6401 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6402 }
6403 f->close_section();
6404 }
6405 f->close_section(); // name
6406 }
6407 f->close_section(); // pools
6408 f->flush(rdata);
6409 } else {
6410 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6411 if (pool < 0) {
6412 ss << "unrecognized pool '" << pool_name << "'";
6413 r = -ENOENT;
6414 goto reply;
6415 }
6416 auto p = osdmap.get_pg_pool(pool);
6417 // filter by pool
6418 if (app.empty()) {
6419 f->open_object_section(pool_name.c_str());
6420 for (auto &app_pair : p->application_metadata) {
6421 f->open_object_section(app_pair.first.c_str());
6422 for (auto &kv_pair : app_pair.second) {
6423 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6424 }
6425 f->close_section(); // application
6426 }
6427 f->close_section(); // pool_name
6428 f->flush(rdata);
6429 goto reply;
6430 }
6431
6432 auto app_it = p->application_metadata.find(app);
6433 if (app_it == p->application_metadata.end()) {
6434 ss << "pool '" << pool_name << "' has no application '" << app << "'";
6435 r = -ENOENT;
6436 goto reply;
6437 }
6438 // filter by pool + app
6439 if (key.empty()) {
6440 f->open_object_section(app_it->first.c_str());
6441 for (auto &kv_pair : app_it->second) {
6442 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6443 }
6444 f->close_section(); // application
6445 f->flush(rdata);
6446 goto reply;
6447 }
6448 // filter by pool + app + key
6449 auto key_it = app_it->second.find(key);
6450 if (key_it == app_it->second.end()) {
6451 ss << "application '" << app << "' on pool '" << pool_name
6452 << "' does not have key '" << key << "'";
6453 r = -ENOENT;
6454 goto reply;
6455 }
6456 ss << key_it->second << "\n";
6457 rdata.append(ss.str());
6458 ss.str("");
6459 }
11fdf7f2
TL
6460 } else if (prefix == "osd get-require-min-compat-client") {
6461 ss << ceph_release_name(osdmap.require_min_compat_client) << std::endl;
6462 rdata.append(ss.str());
6463 ss.str("");
6464 goto reply;
6465 } else if (prefix == "osd pool application enable" ||
6466 prefix == "osd pool application disable" ||
6467 prefix == "osd pool application set" ||
6468 prefix == "osd pool application rm") {
6469 bool changed = false;
6470 r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed);
6471 if (r != 0) {
6472 // Error, reply.
6473 goto reply;
6474 } else if (changed) {
6475 // Valid mutation, proceed to prepare phase
6476 return false;
6477 } else {
6478 // Idempotent case, reply
6479 goto reply;
6480 }
7c673cae
FG
6481 } else {
6482 // try prepare update
6483 return false;
6484 }
6485
6486 reply:
6487 string rs;
6488 getline(ss, rs);
6489 mon->reply_command(op, r, rs, rdata, get_last_committed());
6490 return true;
6491}
6492
3efd9988
FG
6493void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
6494{
6495 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6496 osdmap.get_pg_pool(pool_id));
11fdf7f2 6497 ceph_assert(pool);
3efd9988
FG
6498 pool->set_flag(flags);
6499}
6500
6501void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
7c673cae 6502{
3efd9988
FG
6503 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6504 osdmap.get_pg_pool(pool_id));
11fdf7f2 6505 ceph_assert(pool);
3efd9988 6506 pool->unset_flag(flags);
7c673cae
FG
6507}
6508
11fdf7f2
TL
6509string OSDMonitor::make_snap_epoch_key(int64_t pool, epoch_t epoch)
6510{
6511 char k[80];
6512 snprintf(k, sizeof(k), "removed_epoch_%llu_%08lx",
6513 (unsigned long long)pool, (unsigned long)epoch);
6514 return k;
6515}
6516
6517string OSDMonitor::make_snap_key(int64_t pool, snapid_t snap)
6518{
6519 char k[80];
6520 snprintf(k, sizeof(k), "removed_snap_%llu_%016llx",
6521 (unsigned long long)pool, (unsigned long long)snap);
6522 return k;
6523}
6524
6525
6526string OSDMonitor::make_snap_key_value(
6527 int64_t pool, snapid_t snap, snapid_t num,
6528 epoch_t epoch, bufferlist *v)
6529{
6530 // encode the *last* epoch in the key so that we can use forward
6531 // iteration only to search for an epoch in an interval.
6532 encode(snap, *v);
6533 encode(snap + num, *v);
6534 encode(epoch, *v);
6535 return make_snap_key(pool, snap + num - 1);
6536}
6537
6538string OSDMonitor::make_snap_purged_key(int64_t pool, snapid_t snap)
6539{
6540 char k[80];
6541 snprintf(k, sizeof(k), "purged_snap_%llu_%016llx",
6542 (unsigned long long)pool, (unsigned long long)snap);
6543 return k;
6544}
6545string OSDMonitor::make_snap_purged_key_value(
6546 int64_t pool, snapid_t snap, snapid_t num,
6547 epoch_t epoch, bufferlist *v)
6548{
6549 // encode the *last* epoch in the key so that we can use forward
6550 // iteration only to search for an epoch in an interval.
6551 encode(snap, *v);
6552 encode(snap + num, *v);
6553 encode(epoch, *v);
6554 return make_snap_purged_key(pool, snap + num - 1);
6555}
6556
// Look up the already-pruned snap interval that contains @snap for @pool.
// On success returns 0 and fills *begin/*end with the recorded [begin, end)
// interval; returns -ENOENT when no recorded interval covers @snap.
//
// Records are keyed on the interval's *last* snap (see make_snap_key_value),
// so lower_bound on the query snap's key lands on the first interval whose
// end is at or beyond @snap.
int OSDMonitor::lookup_pruned_snap(int64_t pool, snapid_t snap,
				   snapid_t *begin, snapid_t *end)
{
  string k = make_snap_key(pool, snap);
  auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
  it->lower_bound(k);
  if (!it->valid()) {
    // no key at or after ours: nothing recorded this far out
    return -ENOENT;
  }
  if (it->key().find(OSD_SNAP_PREFIX) != 0) {
    // NOTE(review): guards against the iterator running past our key
    // namespace; presumably iterator keys carry the prefix here — confirm
    // against MonitorDBStore iterator semantics.
    return -ENOENT;
  }
  bufferlist v = it->value();
  auto p = v.cbegin();
  decode(*begin, p);
  decode(*end, p);
  if (snap < *begin || snap >= *end) {
    // the interval we landed on does not actually contain @snap
    // NOTE(review): the decoded record is not checked against @pool; this
    // assumes key ordering keeps lower_bound within the pool's records —
    // confirm (pool ids are not fixed-width in the key).
    return -ENOENT;
  }
  return 0;
}
6578
// Fold OSD-reported purged snaps (from the mgr's stat digest) into
// pending_inc.new_purged_snaps so they can be pruned from the osdmap's
// removed_snaps_queue, bounded by mon_max_snap_prune_per_epoch per epoch.
// Returns true if anything was queued for pruning in the pending map.
bool OSDMonitor::try_prune_purged_snaps()
{
  if (!mon->mgrstatmon()->is_readable()) {
    // the purged-snap digest comes from the mgr stat monitor
    return false;
  }
  if (osdmap.require_osd_release < CEPH_RELEASE_MIMIC) {
    // pre-mimic OSDs don't participate in this purged-snap protocol
    return false;
  }
  if (!pending_inc.new_purged_snaps.empty()) {
    return false; // we already pruned for this epoch
  }

  // cap work per epoch; 0 in config means "effectively unlimited"
  unsigned max_prune = cct->_conf.get_val<uint64_t>(
    "mon_max_snap_prune_per_epoch");
  if (!max_prune) {
    max_prune = 100000;
  }
  dout(10) << __func__ << " max_prune " << max_prune << dendl;

  unsigned actually_pruned = 0;
  auto& purged_snaps = mon->mgrstatmon()->get_digest().purged_snaps;
  for (auto& p : osdmap.get_pools()) {
    auto q = purged_snaps.find(p.first);
    if (q == purged_snaps.end()) {
      continue;
    }
    auto& purged = q->second;
    if (purged.empty()) {
      dout(20) << __func__ << " " << p.first << " nothing purged" << dendl;
      continue;
    }
    dout(20) << __func__ << " pool " << p.first << " purged " << purged << dendl;
    OSDMap::snap_interval_set_t to_prune;
    // count tentatively; only intervals surviving the intersection below
    // are charged to actually_pruned
    unsigned maybe_pruned = actually_pruned;
    for (auto i = purged.begin(); i != purged.end(); ++i) {
      snapid_t begin = i.get_start();
      auto end = i.get_start() + i.get_len();
      snapid_t pbegin = 0, pend = 0;
      int r = lookup_pruned_snap(p.first, begin, &pbegin, &pend);
      if (r == 0) {
	// already purged.
	// be a bit aggressive about backing off here, because the mon may
	// do a lot of work going through this set, and if we know the
	// purged set from the OSDs is at least *partly* stale we may as
	// well wait for it to be fresh.
	dout(20) << __func__ << " we've already pruned " << pbegin
		 << "~" << (pend - pbegin) << dendl;
	break; // next pool
      }
      if (pbegin && pbegin < end) {
	// the tail of [begin,end) is purged; shorten the range
	ceph_assert(pbegin > begin);
	end = pbegin;
      }
      to_prune.insert(begin, end - begin);
      maybe_pruned += end - begin;
      if (maybe_pruned >= max_prune) {
	break;
      }
    }
    if (!to_prune.empty()) {
      // PGs may still be reporting things as purged that we have already
      // pruned from removed_snaps_queue.
      OSDMap::snap_interval_set_t actual;
      auto r = osdmap.removed_snaps_queue.find(p.first);
      if (r != osdmap.removed_snaps_queue.end()) {
	// only prune what is actually still queued for removal
	actual.intersection_of(to_prune, r->second);
      }
      actually_pruned += actual.size();
      dout(10) << __func__ << " pool " << p.first << " reports pruned " << to_prune
	       << ", actual pruned " << actual << dendl;
      if (!actual.empty()) {
	pending_inc.new_purged_snaps[p.first].swap(actual);
      }
    }
    if (actually_pruned >= max_prune) {
      break;
    }
  }
  dout(10) << __func__ << " actually pruned " << actually_pruned << dendl;
  return !!actually_pruned;
}
6661
7c673cae
FG
6662bool OSDMonitor::update_pools_status()
6663{
11fdf7f2 6664 if (!mon->mgrstatmon()->is_readable())
7c673cae
FG
6665 return false;
6666
6667 bool ret = false;
6668
6669 auto& pools = osdmap.get_pools();
6670 for (auto it = pools.begin(); it != pools.end(); ++it) {
11fdf7f2 6671 const pool_stat_t *pstat = mon->mgrstatmon()->get_pool_stat(it->first);
31f18b77 6672 if (!pstat)
7c673cae 6673 continue;
31f18b77 6674 const object_stat_sum_t& sum = pstat->stats.sum;
7c673cae
FG
6675 const pg_pool_t &pool = it->second;
6676 const string& pool_name = osdmap.get_pool_name(it->first);
6677
6678 bool pool_is_full =
6679 (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
6680 (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);
6681
11fdf7f2 6682 if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
7c673cae
FG
6683 if (pool_is_full)
6684 continue;
6685
6686 mon->clog->info() << "pool '" << pool_name
3efd9988
FG
6687 << "' no longer out of quota; removing NO_QUOTA flag";
6688 // below we cancel FLAG_FULL too, we'll set it again in
6689 // OSDMonitor::encode_pending if it still fails the osd-full checking.
6690 clear_pool_flags(it->first,
11fdf7f2 6691 pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
7c673cae
FG
6692 ret = true;
6693 } else {
6694 if (!pool_is_full)
6695 continue;
6696
6697 if (pool.quota_max_bytes > 0 &&
6698 (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
6699 mon->clog->warn() << "pool '" << pool_name << "' is full"
6700 << " (reached quota's max_bytes: "
1adf2230 6701 << byte_u_t(pool.quota_max_bytes) << ")";
7c673cae
FG
6702 }
6703 if (pool.quota_max_objects > 0 &&
6704 (uint64_t)sum.num_objects >= pool.quota_max_objects) {
6705 mon->clog->warn() << "pool '" << pool_name << "' is full"
6706 << " (reached quota's max_objects: "
6707 << pool.quota_max_objects << ")";
6708 }
11fdf7f2 6709 // set both FLAG_FULL_QUOTA and FLAG_FULL
3efd9988
FG
6710 // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
6711 // since FLAG_FULL should always take precedence
6712 set_pool_flags(it->first,
11fdf7f2 6713 pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
3efd9988
FG
6714 clear_pool_flags(it->first,
6715 pg_pool_t::FLAG_NEARFULL |
6716 pg_pool_t::FLAG_BACKFILLFULL);
7c673cae
FG
6717 ret = true;
6718 }
6719 }
6720 return ret;
6721}
6722
7c673cae
FG
6723int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
6724{
6725 op->mark_osdmon_event(__func__);
6726 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
6727 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
11fdf7f2 6728 MonSession *session = op->get_session();
7c673cae
FG
6729 if (!session)
6730 return -EPERM;
6731 string erasure_code_profile;
6732 stringstream ss;
31f18b77 6733 string rule_name;
94b18763 6734 int ret = 0;
11fdf7f2
TL
6735 ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
6736 0, 0, 0, 0, 0, 0.0,
6737 erasure_code_profile,
6738 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, &ss);
94b18763
FG
6739
6740 if (ret < 0) {
6741 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
6742 }
6743 return ret;
7c673cae
FG
6744}
6745
6746int OSDMonitor::crush_rename_bucket(const string& srcname,
6747 const string& dstname,
6748 ostream *ss)
6749{
6750 int ret;
6751 //
6752 // Avoid creating a pending crush if it does not already exists and
6753 // the rename would fail.
6754 //
6755 if (!_have_pending_crush()) {
6756 ret = _get_stable_crush().can_rename_bucket(srcname,
6757 dstname,
6758 ss);
6759 if (ret)
6760 return ret;
6761 }
6762
6763 CrushWrapper newcrush;
6764 _get_pending_crush(newcrush);
6765
6766 ret = newcrush.rename_bucket(srcname,
6767 dstname,
6768 ss);
6769 if (ret)
6770 return ret;
6771
6772 pending_inc.crush.clear();
6773 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
6774 *ss << "renamed bucket " << srcname << " into " << dstname;
6775 return 0;
6776}
6777
6778void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
6779{
6780 string replacement = "";
6781
6782 if (plugin == "jerasure_generic" ||
6783 plugin == "jerasure_sse3" ||
6784 plugin == "jerasure_sse4" ||
6785 plugin == "jerasure_neon") {
6786 replacement = "jerasure";
6787 } else if (plugin == "shec_generic" ||
6788 plugin == "shec_sse3" ||
6789 plugin == "shec_sse4" ||
6790 plugin == "shec_neon") {
6791 replacement = "shec";
6792 }
6793
6794 if (replacement != "") {
6795 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
6796 << plugin << " that has been deprecated. Please use "
6797 << replacement << " instead." << dendl;
6798 }
6799}
6800
6801int OSDMonitor::normalize_profile(const string& profilename,
6802 ErasureCodeProfile &profile,
6803 bool force,
6804 ostream *ss)
6805{
6806 ErasureCodeInterfaceRef erasure_code;
6807 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
6808 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
6809 check_legacy_ec_plugin(plugin->second, profilename);
6810 int err = instance.factory(plugin->second,
11fdf7f2 6811 g_conf().get_val<std::string>("erasure_code_dir"),
7c673cae
FG
6812 profile, &erasure_code, ss);
6813 if (err) {
6814 return err;
6815 }
6816
6817 err = erasure_code->init(profile, ss);
6818 if (err) {
6819 return err;
6820 }
6821
6822 auto it = profile.find("stripe_unit");
6823 if (it != profile.end()) {
6824 string err_str;
1adf2230 6825 uint32_t stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
7c673cae
FG
6826 if (!err_str.empty()) {
6827 *ss << "could not parse stripe_unit '" << it->second
6828 << "': " << err_str << std::endl;
6829 return -EINVAL;
6830 }
6831 uint32_t data_chunks = erasure_code->get_data_chunk_count();
6832 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
6833 if (chunk_size != stripe_unit) {
6834 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
6835 << "alignment. Would be padded to " << chunk_size
6836 << std::endl;
6837 return -EINVAL;
6838 }
6839 if ((stripe_unit % 4096) != 0 && !force) {
6840 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
6841 << "use --force to override this check" << std::endl;
6842 return -EINVAL;
6843 }
6844 }
6845 return 0;
6846}
6847
31f18b77 6848int OSDMonitor::crush_rule_create_erasure(const string &name,
7c673cae 6849 const string &profile,
31f18b77 6850 int *rule,
7c673cae
FG
6851 ostream *ss)
6852{
6853 int ruleid = osdmap.crush->get_rule_id(name);
6854 if (ruleid != -ENOENT) {
31f18b77 6855 *rule = osdmap.crush->get_rule_mask_ruleset(ruleid);
7c673cae
FG
6856 return -EEXIST;
6857 }
6858
6859 CrushWrapper newcrush;
6860 _get_pending_crush(newcrush);
6861
6862 ruleid = newcrush.get_rule_id(name);
6863 if (ruleid != -ENOENT) {
31f18b77 6864 *rule = newcrush.get_rule_mask_ruleset(ruleid);
7c673cae
FG
6865 return -EALREADY;
6866 } else {
6867 ErasureCodeInterfaceRef erasure_code;
6868 int err = get_erasure_code(profile, &erasure_code, ss);
6869 if (err) {
6870 *ss << "failed to load plugin using profile " << profile << std::endl;
6871 return err;
6872 }
6873
224ce89b 6874 err = erasure_code->create_rule(name, newcrush, ss);
7c673cae
FG
6875 erasure_code.reset();
6876 if (err < 0)
6877 return err;
31f18b77 6878 *rule = err;
7c673cae
FG
6879 pending_inc.crush.clear();
6880 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
6881 return 0;
6882 }
6883}
6884
6885int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
6886 ErasureCodeInterfaceRef *erasure_code,
6887 ostream *ss) const
6888{
6889 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
6890 return -EAGAIN;
6891 ErasureCodeProfile profile =
6892 osdmap.get_erasure_code_profile(erasure_code_profile);
6893 ErasureCodeProfile::const_iterator plugin =
6894 profile.find("plugin");
6895 if (plugin == profile.end()) {
6896 *ss << "cannot determine the erasure code plugin"
6897 << " because there is no 'plugin' entry in the erasure_code_profile "
6898 << profile << std::endl;
6899 return -EINVAL;
6900 }
6901 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
6902 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
6903 return instance.factory(plugin->second,
11fdf7f2 6904 g_conf().get_val<std::string>("erasure_code_dir"),
7c673cae
FG
6905 profile, erasure_code, ss);
6906}
6907
6908int OSDMonitor::check_cluster_features(uint64_t features,
6909 stringstream &ss)
6910{
6911 stringstream unsupported_ss;
6912 int unsupported_count = 0;
6913 if ((mon->get_quorum_con_features() & features) != features) {
6914 unsupported_ss << "the monitor cluster";
6915 ++unsupported_count;
6916 }
6917
6918 set<int32_t> up_osds;
6919 osdmap.get_up_osds(up_osds);
6920 for (set<int32_t>::iterator it = up_osds.begin();
6921 it != up_osds.end(); ++it) {
6922 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
6923 if ((xi.features & features) != features) {
6924 if (unsupported_count > 0)
6925 unsupported_ss << ", ";
6926 unsupported_ss << "osd." << *it;
6927 unsupported_count ++;
6928 }
6929 }
6930
6931 if (unsupported_count > 0) {
6932 ss << "features " << features << " unsupported by: "
6933 << unsupported_ss.str();
6934 return -ENOTSUP;
6935 }
6936
6937 // check pending osd state, too!
6938 for (map<int32_t,osd_xinfo_t>::const_iterator p =
6939 pending_inc.new_xinfo.begin();
6940 p != pending_inc.new_xinfo.end(); ++p) {
6941 const osd_xinfo_t &xi = p->second;
6942 if ((xi.features & features) != features) {
6943 dout(10) << __func__ << " pending osd." << p->first
6944 << " features are insufficient; retry" << dendl;
6945 return -EAGAIN;
6946 }
6947 }
6948
6949 return 0;
6950}
6951
6952bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
6953 stringstream& ss)
6954{
6955 OSDMap::Incremental new_pending = pending_inc;
11fdf7f2 6956 encode(*newcrush, new_pending.crush, mon->get_quorum_con_features());
7c673cae
FG
6957 OSDMap newmap;
6958 newmap.deepish_copy_from(osdmap);
6959 newmap.apply_incremental(new_pending);
6960
6961 // client compat
31f18b77 6962 if (newmap.require_min_compat_client > 0) {
7c673cae 6963 auto mv = newmap.get_min_compat_client();
31f18b77
FG
6964 if (mv > newmap.require_min_compat_client) {
6965 ss << "new crush map requires client version " << ceph_release_name(mv)
7c673cae 6966 << " but require_min_compat_client is "
31f18b77 6967 << ceph_release_name(newmap.require_min_compat_client);
7c673cae
FG
6968 return false;
6969 }
6970 }
6971
6972 // osd compat
6973 uint64_t features =
6974 newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
6975 newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
6976 stringstream features_ss;
6977 int r = check_cluster_features(features, features_ss);
6978 if (r) {
6979 ss << "Could not change CRUSH: " << features_ss.str();
6980 return false;
6981 }
6982
6983 return true;
6984}
6985
6986bool OSDMonitor::erasure_code_profile_in_use(
6987 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
6988 const string &profile,
6989 ostream *ss)
6990{
6991 bool found = false;
6992 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
6993 p != pools.end();
6994 ++p) {
11fdf7f2 6995 if (p->second.erasure_code_profile == profile && p->second.is_erasure()) {
7c673cae
FG
6996 *ss << osdmap.pool_name[p->first] << " ";
6997 found = true;
6998 }
6999 }
7000 if (found) {
7001 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
7002 }
7003 return found;
7004}
7005
7006int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
7007 map<string,string> *erasure_code_profile_map,
7008 ostream *ss)
7009{
11fdf7f2
TL
7010 int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile",
7011 get_json_str_map,
7012 *ss,
7013 erasure_code_profile_map,
7014 true);
7c673cae
FG
7015 if (r)
7016 return r;
11fdf7f2 7017 ceph_assert((*erasure_code_profile_map).count("plugin"));
7c673cae
FG
7018 string default_plugin = (*erasure_code_profile_map)["plugin"];
7019 map<string,string> user_map;
7020 for (vector<string>::const_iterator i = erasure_code_profile.begin();
7021 i != erasure_code_profile.end();
7022 ++i) {
7023 size_t equal = i->find('=');
7024 if (equal == string::npos) {
7025 user_map[*i] = string();
7026 (*erasure_code_profile_map)[*i] = string();
7027 } else {
11fdf7f2 7028 const string key = i->substr(0, equal);
7c673cae
FG
7029 equal++;
7030 const string value = i->substr(equal);
11fdf7f2
TL
7031 if (key.find("ruleset-") == 0) {
7032 *ss << "property '" << key << "' is no longer supported; try "
7033 << "'crush-" << key.substr(8) << "' instead";
7034 return -EINVAL;
3efd9988 7035 }
7c673cae
FG
7036 user_map[key] = value;
7037 (*erasure_code_profile_map)[key] = value;
7038 }
7039 }
7040
7041 if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
7042 (*erasure_code_profile_map) = user_map;
7043
7044 return 0;
7045}
7046
7047int OSDMonitor::prepare_pool_size(const unsigned pool_type,
7048 const string &erasure_code_profile,
11fdf7f2 7049 uint8_t repl_size,
7c673cae
FG
7050 unsigned *size, unsigned *min_size,
7051 ostream *ss)
7052{
7053 int err = 0;
7054 switch (pool_type) {
7055 case pg_pool_t::TYPE_REPLICATED:
11fdf7f2
TL
7056 if (repl_size == 0) {
7057 repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
7058 }
7059 *size = repl_size;
7060 *min_size = g_conf().get_osd_pool_default_min_size(repl_size);
7c673cae
FG
7061 break;
7062 case pg_pool_t::TYPE_ERASURE:
7063 {
7064 ErasureCodeInterfaceRef erasure_code;
7065 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
7066 if (err == 0) {
7067 *size = erasure_code->get_chunk_count();
11fdf7f2
TL
7068 *min_size =
7069 erasure_code->get_data_chunk_count() +
7070 std::min<int>(1, erasure_code->get_coding_chunk_count() - 1);
7071 assert(*min_size <= *size);
7072 assert(*min_size >= erasure_code->get_data_chunk_count());
7c673cae
FG
7073 }
7074 }
7075 break;
7076 default:
7077 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
7078 err = -EINVAL;
7079 break;
7080 }
7081 return err;
7082}
7083
7084int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
7085 const string &erasure_code_profile,
7086 uint32_t *stripe_width,
7087 ostream *ss)
7088{
7089 int err = 0;
7090 switch (pool_type) {
7091 case pg_pool_t::TYPE_REPLICATED:
7092 // ignored
7093 break;
7094 case pg_pool_t::TYPE_ERASURE:
7095 {
7096 ErasureCodeProfile profile =
7097 osdmap.get_erasure_code_profile(erasure_code_profile);
7098 ErasureCodeInterfaceRef erasure_code;
7099 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
7100 if (err)
7101 break;
7102 uint32_t data_chunks = erasure_code->get_data_chunk_count();
11fdf7f2 7103 uint32_t stripe_unit = g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit");
7c673cae
FG
7104 auto it = profile.find("stripe_unit");
7105 if (it != profile.end()) {
7106 string err_str;
1adf2230 7107 stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
11fdf7f2 7108 ceph_assert(err_str.empty());
7c673cae
FG
7109 }
7110 *stripe_width = data_chunks *
7111 erasure_code->get_chunk_size(stripe_unit * data_chunks);
7112 }
7113 break;
7114 default:
7115 *ss << "prepare_pool_stripe_width: "
7116 << pool_type << " is not a known pool type";
7117 err = -EINVAL;
7118 break;
7119 }
7120 return err;
7121}
7122
31f18b77 7123int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
224ce89b
WB
7124 const string &erasure_code_profile,
7125 const string &rule_name,
7126 int *crush_rule,
7127 ostream *ss)
7c673cae
FG
7128{
7129
31f18b77 7130 if (*crush_rule < 0) {
7c673cae
FG
7131 switch (pool_type) {
7132 case pg_pool_t::TYPE_REPLICATED:
7133 {
31f18b77 7134 if (rule_name == "") {
224ce89b 7135 // Use default rule
11fdf7f2 7136 *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(cct);
31f18b77
FG
7137 if (*crush_rule < 0) {
7138 // Errors may happen e.g. if no valid rule is available
7139 *ss << "No suitable CRUSH rule exists, check "
7c673cae
FG
7140 << "'osd pool default crush *' config options";
7141 return -ENOENT;
7142 }
7143 } else {
31f18b77 7144 return get_crush_rule(rule_name, crush_rule, ss);
7c673cae
FG
7145 }
7146 }
7147 break;
7148 case pg_pool_t::TYPE_ERASURE:
7149 {
31f18b77 7150 int err = crush_rule_create_erasure(rule_name,
7c673cae 7151 erasure_code_profile,
31f18b77 7152 crush_rule, ss);
7c673cae
FG
7153 switch (err) {
7154 case -EALREADY:
31f18b77
FG
7155 dout(20) << "prepare_pool_crush_rule: rule "
7156 << rule_name << " try again" << dendl;
7c673cae
FG
7157 // fall through
7158 case 0:
7159 // need to wait for the crush rule to be proposed before proceeding
7160 err = -EAGAIN;
7161 break;
7162 case -EEXIST:
7163 err = 0;
7164 break;
7165 }
7166 return err;
7167 }
7168 break;
7169 default:
31f18b77 7170 *ss << "prepare_pool_crush_rule: " << pool_type
7c673cae
FG
7171 << " is not a known pool type";
7172 return -EINVAL;
7173 break;
7174 }
7175 } else {
31f18b77
FG
7176 if (!osdmap.crush->ruleset_exists(*crush_rule)) {
7177 *ss << "CRUSH rule " << *crush_rule << " not found";
7c673cae
FG
7178 return -ENOENT;
7179 }
7180 }
7181
7182 return 0;
7183}
7184
31f18b77 7185int OSDMonitor::get_crush_rule(const string &rule_name,
224ce89b
WB
7186 int *crush_rule,
7187 ostream *ss)
7c673cae
FG
7188{
7189 int ret;
31f18b77 7190 ret = osdmap.crush->get_rule_id(rule_name);
7c673cae
FG
7191 if (ret != -ENOENT) {
7192 // found it, use it
31f18b77 7193 *crush_rule = ret;
7c673cae
FG
7194 } else {
7195 CrushWrapper newcrush;
7196 _get_pending_crush(newcrush);
7197
31f18b77 7198 ret = newcrush.get_rule_id(rule_name);
7c673cae
FG
7199 if (ret != -ENOENT) {
7200 // found it, wait for it to be proposed
31f18b77 7201 dout(20) << __func__ << ": rule " << rule_name
7c673cae
FG
7202 << " try again" << dendl;
7203 return -EAGAIN;
7204 } else {
224ce89b 7205 // Cannot find it , return error
31f18b77 7206 *ss << "specified rule " << rule_name << " doesn't exist";
7c673cae
FG
7207 return ret;
7208 }
7209 }
7210 return 0;
7211}
7212
3efd9988
FG
7213int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, ostream *ss)
7214{
11fdf7f2 7215 auto max_pgs_per_osd = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
3efd9988
FG
7216 auto num_osds = std::max(osdmap.get_num_in_osds(), 3u); // assume min cluster size 3
7217 auto max_pgs = max_pgs_per_osd * num_osds;
7218 uint64_t projected = 0;
7219 if (pool < 0) {
7220 projected += pg_num * size;
7221 }
7222 for (const auto& i : osdmap.get_pools()) {
7223 if (i.first == pool) {
7224 projected += pg_num * size;
7225 } else {
11fdf7f2 7226 projected += i.second.get_pg_num_target() * i.second.get_size();
3efd9988
FG
7227 }
7228 }
7229 if (projected > max_pgs) {
7230 if (pool >= 0) {
7231 *ss << "pool id " << pool;
7232 }
7233 *ss << " pg_num " << pg_num << " size " << size
7234 << " would mean " << projected
7235 << " total pgs, which exceeds max " << max_pgs
7236 << " (mon_max_pg_per_osd " << max_pgs_per_osd
7237 << " * num_in_osds " << num_osds << ")";
7238 return -ERANGE;
7239 }
7240 return 0;
7241}
7242
7c673cae
FG
/**
 * @param name The name of the new pool
 * @param crush_rule The crush rule to use. If <0, will use the system default
 * @param crush_rule_name The crush rule to use, if crush_rule <0
 * @param pg_num The pg_num to use. If set to 0, will use the system default
 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
 * @param repl_size Replication factor, or 0 for default
 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
 * @param pool_type TYPE_ERASURE, or TYPE_REP
 * @param expected_num_objects expected number of objects on the pool
 * @param fast_read fast read type.
 * @param ss human readable error message, if any.
 *
 * @return 0 on success, negative errno on failure.
 */
11fdf7f2 7258int OSDMonitor::prepare_new_pool(string& name,
31f18b77
FG
7259 int crush_rule,
7260 const string &crush_rule_name,
7c673cae 7261 unsigned pg_num, unsigned pgp_num,
11fdf7f2
TL
7262 unsigned pg_num_min,
7263 const uint64_t repl_size,
7264 const uint64_t target_size_bytes,
7265 const float target_size_ratio,
7c673cae
FG
7266 const string &erasure_code_profile,
7267 const unsigned pool_type,
7268 const uint64_t expected_num_objects,
7269 FastReadType fast_read,
7270 ostream *ss)
7271{
7272 if (name.length() == 0)
7273 return -EINVAL;
7274 if (pg_num == 0)
11fdf7f2 7275 pg_num = g_conf().get_val<uint64_t>("osd_pool_default_pg_num");
7c673cae 7276 if (pgp_num == 0)
11fdf7f2
TL
7277 pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num");
7278 if (!pgp_num)
7279 pgp_num = pg_num;
7280 if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
7c673cae 7281 *ss << "'pg_num' must be greater than 0 and less than or equal to "
11fdf7f2 7282 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
7c673cae
FG
7283 << " (you may adjust 'mon max pool pg num' for higher values)";
7284 return -ERANGE;
7285 }
7286 if (pgp_num > pg_num) {
7287 *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
7288 << ", which in this case is " << pg_num;
7289 return -ERANGE;
7290 }
7291 if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
7292 *ss << "'fast_read' can only apply to erasure coding pool";
7293 return -EINVAL;
7294 }
7295 int r;
31f18b77
FG
7296 r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
7297 crush_rule_name, &crush_rule, ss);
7c673cae 7298 if (r) {
94b18763 7299 dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
7c673cae
FG
7300 return r;
7301 }
11fdf7f2 7302 if (g_conf()->mon_osd_crush_smoke_test) {
224ce89b
WB
7303 CrushWrapper newcrush;
7304 _get_pending_crush(newcrush);
7305 ostringstream err;
7306 CrushTester tester(newcrush, err);
b5b8bbf5 7307 tester.set_min_x(0);
224ce89b
WB
7308 tester.set_max_x(50);
7309 tester.set_rule(crush_rule);
b5b8bbf5 7310 auto start = ceph::coarse_mono_clock::now();
11fdf7f2 7311 r = tester.test_with_fork(g_conf()->mon_lease);
b5b8bbf5 7312 auto duration = ceph::coarse_mono_clock::now() - start;
224ce89b 7313 if (r < 0) {
94b18763 7314 dout(10) << "tester.test_with_fork returns " << r
224ce89b
WB
7315 << ": " << err.str() << dendl;
7316 *ss << "crush test failed with " << r << ": " << err.str();
7317 return r;
7318 }
181888fb 7319 dout(10) << __func__ << " crush smoke test duration: "
b5b8bbf5 7320 << duration << dendl;
7c673cae
FG
7321 }
7322 unsigned size, min_size;
11fdf7f2
TL
7323 r = prepare_pool_size(pool_type, erasure_code_profile, repl_size,
7324 &size, &min_size, ss);
7c673cae 7325 if (r) {
94b18763 7326 dout(10) << "prepare_pool_size returns " << r << dendl;
7c673cae
FG
7327 return r;
7328 }
3efd9988
FG
7329 r = check_pg_num(-1, pg_num, size, ss);
7330 if (r) {
94b18763 7331 dout(10) << "check_pg_num returns " << r << dendl;
3efd9988
FG
7332 return r;
7333 }
7c673cae 7334
31f18b77 7335 if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) {
7c673cae
FG
7336 return -EINVAL;
7337 }
7338
7339 uint32_t stripe_width = 0;
7340 r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
7341 if (r) {
94b18763 7342 dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
7c673cae
FG
7343 return r;
7344 }
7345
7346 bool fread = false;
7347 if (pool_type == pg_pool_t::TYPE_ERASURE) {
7348 switch (fast_read) {
7349 case FAST_READ_OFF:
7350 fread = false;
7351 break;
7352 case FAST_READ_ON:
7353 fread = true;
7354 break;
7355 case FAST_READ_DEFAULT:
11fdf7f2 7356 fread = g_conf()->osd_pool_default_ec_fast_read;
7c673cae
FG
7357 break;
7358 default:
7359 *ss << "invalid fast_read setting: " << fast_read;
7360 return -EINVAL;
7361 }
7362 }
7363
7364 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
7365 p != pending_inc.new_pool_names.end();
7366 ++p) {
7367 if (p->second == name)
7368 return 0;
7369 }
7370
7371 if (-1 == pending_inc.new_pool_max)
7372 pending_inc.new_pool_max = osdmap.pool_max;
7373 int64_t pool = ++pending_inc.new_pool_max;
7374 pg_pool_t empty;
7375 pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
11fdf7f2 7376 pi->create_time = ceph_clock_now();
7c673cae
FG
7377 pi->type = pool_type;
7378 pi->fast_read = fread;
11fdf7f2
TL
7379 pi->flags = g_conf()->osd_pool_default_flags;
7380 if (g_conf()->osd_pool_default_flag_hashpspool)
7c673cae 7381 pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
11fdf7f2 7382 if (g_conf()->osd_pool_default_flag_nodelete)
7c673cae 7383 pi->set_flag(pg_pool_t::FLAG_NODELETE);
11fdf7f2 7384 if (g_conf()->osd_pool_default_flag_nopgchange)
7c673cae 7385 pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
11fdf7f2 7386 if (g_conf()->osd_pool_default_flag_nosizechange)
7c673cae 7387 pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
11fdf7f2
TL
7388 pi->set_flag(pg_pool_t::FLAG_CREATING);
7389 if (g_conf()->osd_pool_use_gmt_hitset)
7c673cae
FG
7390 pi->use_gmt_hitset = true;
7391 else
7392 pi->use_gmt_hitset = false;
7393
7394 pi->size = size;
7395 pi->min_size = min_size;
31f18b77 7396 pi->crush_rule = crush_rule;
7c673cae
FG
7397 pi->expected_num_objects = expected_num_objects;
7398 pi->object_hash = CEPH_STR_HASH_RJENKINS;
11fdf7f2
TL
7399
7400 {
7401 auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
7402 g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode"));
7403 pi->pg_autoscale_mode = m >= 0 ? m : 0;
7404 }
7405 auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs");
7406 pi->set_pg_num(
7407 max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max))
7408 : pg_num);
7409 pi->set_pg_num_pending(pi->get_pg_num());
7410 pi->set_pg_num_target(pg_num);
7411 pi->set_pgp_num(pi->get_pg_num());
7412 pi->set_pgp_num_target(pgp_num);
7413 if (osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS &&
7414 pg_num_min) {
7415 pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min));
7416 }
7417
7c673cae 7418 pi->last_change = pending_inc.epoch;
11fdf7f2
TL
7419 pi->auid = 0;
7420
7421 if (pool_type == pg_pool_t::TYPE_ERASURE) {
7422 pi->erasure_code_profile = erasure_code_profile;
7423 } else {
7424 pi->erasure_code_profile = "";
7425 }
7c673cae 7426 pi->stripe_width = stripe_width;
11fdf7f2
TL
7427
7428 if (osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS &&
7429 target_size_bytes) {
7430 // only store for nautilus+ because TARGET_SIZE_BYTES may be
7431 // larger than int32_t max.
7432 pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes));
7433 }
7434 if (target_size_ratio > 0.0 &&
7435 osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS) {
7436 // only store for nautilus+, just to be consistent and tidy.
7437 pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio);
7438 }
7439
7c673cae 7440 pi->cache_target_dirty_ratio_micro =
11fdf7f2 7441 g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000;
7c673cae 7442 pi->cache_target_dirty_high_ratio_micro =
11fdf7f2 7443 g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
7c673cae 7444 pi->cache_target_full_ratio_micro =
11fdf7f2
TL
7445 g_conf()->osd_pool_default_cache_target_full_ratio * 1000000;
7446 pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
7447 pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;
7448
7c673cae
FG
7449 pending_inc.new_pool_names[pool] = name;
7450 return 0;
7451}
7452
7453bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
7454{
7455 op->mark_osdmon_event(__func__);
7456 ostringstream ss;
7457 if (pending_inc.new_flags < 0)
7458 pending_inc.new_flags = osdmap.get_flags();
7459 pending_inc.new_flags |= flag;
7460 ss << OSDMap::get_flag_string(flag) << " is set";
7461 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7462 get_last_committed() + 1));
7463 return true;
7464}
7465
7466bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
7467{
7468 op->mark_osdmon_event(__func__);
7469 ostringstream ss;
7470 if (pending_inc.new_flags < 0)
7471 pending_inc.new_flags = osdmap.get_flags();
7472 pending_inc.new_flags &= ~flag;
7473 ss << OSDMap::get_flag_string(flag) << " is unset";
7474 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7475 get_last_committed() + 1));
7476 return true;
7477}
7478
/**
 * Handle "ceph osd pool set <pool> <var> <val>" by staging the change
 * into pending_inc.
 *
 * @param cmdmap parsed command arguments ("pool", "var", "val", and
 *               optionally "yes_i_really_mean_it")
 * @param ss     error / status text returned to the client
 * @return 0 on success (including idempotent no-ops), negative errno
 *         on rejection; on success the updated pool is recorded in
 *         pending_inc.new_pools[pool].
 */
int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
					 stringstream& ss)
{
  // Resolve the target pool by name.
  string poolstr;
  cmd_getval(cct, cmdmap, "pool", poolstr);
  int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
  if (pool < 0) {
    ss << "unrecognized pool '" << poolstr << "'";
    return -ENOENT;
  }
  // The pool property being set ("size", "pg_num", ...).
  string var;
  cmd_getval(cct, cmdmap, "var", var);

  // Start from the committed pool state, but fold in any change already
  // staged in this proposal so repeated sets within one epoch compose.
  pg_pool_t p = *osdmap.get_pg_pool(pool);
  if (pending_inc.new_pools.count(pool))
    p = pending_inc.new_pools[pool];

  // accept val as a json string in the normal case (current
  // generation monitor). parse out int or float values from the
  // string as needed. however, if it is not a string, try to pull
  // out an int, in case an older monitor with an older json schema is
  // forwarding a request.
  string val;
  string interr, floaterr;   // parse-error text; empty means parse succeeded
  int64_t n = 0;
  double f = 0;
  int64_t uf = 0; // micro-f
  cmd_getval(cct, cmdmap, "val", val);

  // parse string as both int and float; different fields use different types.
  n = strict_strtoll(val.c_str(), 10, &interr);
  f = strict_strtod(val.c_str(), &floaterr);
  uf = llrintl(f * (double)1000000.0);  // fixed-point micro units for ratios

  // Cache-tiering properties may only be set on a pool that is a tier.
  if (!p.is_tier() &&
      (var == "hit_set_type" || var == "hit_set_period" ||
       var == "hit_set_count" || var == "hit_set_fpp" ||
       var == "target_max_objects" || var == "target_max_bytes" ||
       var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
       var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
       var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
       var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
       var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
    return -EACCES;
  }

  if (var == "size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (p.type == pg_pool_t::TYPE_ERASURE) {
      ss << "can not change the size of an erasure-coded pool";
      return -ENOTSUP;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0 || n > 10) {
      ss << "pool size must be between 1 and 10";
      return -EINVAL;
    }
    // The pool's crush rule must be able to produce n replicas.
    if (!osdmap.crush->check_crush_rule(p.get_crush_rule(), p.type, n, ss)) {
      return -EINVAL;
    }
    // Enforce the per-OSD PG budget at the new replica count.
    int r = check_pg_num(pool, p.get_pg_num(), n, &ss);
    if (r < 0) {
      return r;
    }
    p.size = n;
    // Keep the invariant min_size <= size.
    if (n < p.min_size)
      p.min_size = n;
  } else if (var == "min_size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }

    if (p.type != pg_pool_t::TYPE_ERASURE) {
      if (n < 1 || n > p.size) {
	ss << "pool min_size must be between 1 and size, which is set to " << (int)p.size;
	return -EINVAL;
      }
    } else {
      // For EC pools the floor is k (the data-chunk count): fewer
      // than k available shards cannot serve reads at all.
      ErasureCodeInterfaceRef erasure_code;
      int k;
      stringstream tmp;
      int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
      if (err == 0) {
	k = erasure_code->get_data_chunk_count();
      } else {
	ss << __func__ << " get_erasure_code failed: " << tmp.str();
	return err;
      }

      if (n < k || n > p.size) {
	ss << "pool min_size must be between " << k << " and size, which is set to " << (int)p.size;
	return -EINVAL;
      }
    }
    p.min_size = n;
  } else if (var == "pg_num_actual") {
    // Directly adjust the effective pg_num (normally the mgr drives
    // this toward pg_num_target).
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n == (int)p.get_pg_num()) {
      return 0;   // idempotent no-op
    }
    if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
      ss << "'pg_num' must be greater than 0 and less than or equal to "
	 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
	 << " (you may adjust 'mon max pool pg num' for higher values)";
      return -ERANGE;
    }
    if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
      ss << "cannot adjust pg_num while initial PGs are being created";
      return -EBUSY;
    }
    if (n > (int)p.get_pg_num()) {
      // Split: increase pg_num immediately.
      if (p.get_pg_num() != p.get_pg_num_pending()) {
	// force pre-nautilus clients to resend their ops, since they
	// don't understand pg_num_pending changes form a new interval
	p.last_force_op_resend_prenautilus = pending_inc.epoch;
      }
      p.set_pg_num(n);
    } else {
      // Merge: only stage pg_num_pending; actual decrease happens via
      // the ready-to-merge protocol.
      if (osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS) {
	ss << "nautilus OSDs are required to adjust pg_num_pending";
	return -EPERM;
      }
      if (n < (int)p.get_pgp_num()) {
	ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
	return -EINVAL;
      }
      if (n < (int)p.get_pg_num() - 1) {
	ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
	   << ") - 1; only single pg decrease is currently supported";
	return -EINVAL;
      }
      p.set_pg_num_pending(n);
      // force pre-nautilus clients to resend their ops, since they
      // don't understand pg_num_pending changes form a new interval
      p.last_force_op_resend_prenautilus = pending_inc.epoch;
    }
    // force pre-luminous clients to resend their ops, since they
    // don't understand that split PGs now form a new interval.
    p.last_force_op_resend_preluminous = pending_inc.epoch;
  } else if (var == "pg_num") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n == (int)p.get_pg_num_target()) {
      return 0;   // idempotent no-op
    }
    if (n <= 0 || static_cast<uint64_t>(n) >
                  g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
      ss << "'pg_num' must be greater than 0 and less than or equal to "
	 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
	 << " (you may adjust 'mon max pool pg num' for higher values)";
      return -ERANGE;
    }
    if (n > (int)p.get_pg_num_target()) {
      // Split request: check the PG budget, and require explicit
      // confirmation when splitting a cache pool.
      int r = check_pg_num(pool, n, p.get_size(), &ss);
      if (r) {
	return r;
      }
      bool force = false;
      cmd_getval(cct,cmdmap, "yes_i_really_mean_it", force);
      if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
	ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
	return -EPERM;
      }
    } else {
      // Merge request: only supported once the whole cluster is nautilus.
      if (osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS) {
	ss << "nautilus OSDs are required to decrease pg_num";
	return -EPERM;
      }
    }
    if (osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS) {
      // pre-nautilus osdmap format; increase pg_num directly
      assert(n > (int)p.get_pg_num());
      // force pre-nautilus clients to resend their ops, since they
      // don't understand pg_num_target changes form a new interval
      p.last_force_op_resend_prenautilus = pending_inc.epoch;
      // force pre-luminous clients to resend their ops, since they
      // don't understand that split PGs now form a new interval.
      p.last_force_op_resend_preluminous = pending_inc.epoch;
      p.set_pg_num(n);
    } else {
      // set targets; mgr will adjust pg_num_actual and pgp_num later.
      // make pgp_num track pg_num if it already matches. if it is set
      // differently, leave it different and let the user control it
      // manually.
      if (p.get_pg_num_target() == p.get_pgp_num_target()) {
	p.set_pgp_num_target(n);
      }
      p.set_pg_num_target(n);
    }
  } else if (var == "pgp_num_actual") {
    // Directly adjust the effective pgp_num; must stay <= pg_num and
    // <= pg_num_pending.
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num_pending()) {
      ss << "specified pgp_num " << n
	 << " > pg_num_pending " << p.get_pg_num_pending();
      return -EINVAL;
    }
    p.set_pgp_num(n);
  } else if (var == "pgp_num") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num_target()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target();
      return -EINVAL;
    }
    if (osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS) {
      // pre-nautilus osdmap format; increase pgp_num directly
      p.set_pgp_num(n);
    } else {
      // nautilus+: set the target and let the mgr converge pgp_num.
      p.set_pgp_num_target(n);
    }
  } else if (var == "pg_autoscale_mode") {
    n = pg_pool_t::get_pg_autoscale_mode_by_name(val);
    if (n < 0) {
      ss << "specified invalid mode " << val;
      return -EINVAL;
    }
    if (osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS) {
      ss << "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
      return -EINVAL;
    }
    p.pg_autoscale_mode = n;
  } else if (var == "crush_rule") {
    int id = osdmap.crush->get_rule_id(val);
    if (id == -ENOENT) {
      ss << "crush rule " << val << " does not exist";
      return -ENOENT;
    }
    if (id < 0) {
      ss << cpp_strerror(id);
      return -ENOENT;
    }
    // The rule must be compatible with this pool's type and size.
    if (!osdmap.crush->check_crush_rule(id, p.get_type(), p.get_size(), ss)) {
      return -EINVAL;
    }
    p.crush_rule = id;
  } else if (var == "nodelete" || var == "nopgchange" ||
	     var == "nosizechange" || var == "write_fadvise_dontneed" ||
	     var == "noscrub" || var == "nodeep-scrub") {
    // Simple boolean pool flags.
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hashpspool") {
    // Toggling hashpspool remaps every PG in the pool, so require
    // explicit confirmation.
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    bool force = false;
    cmd_getval(cct, cmdmap, "yes_i_really_mean_it", force);

    if (!force) {
      ss << "are you SURE? this will remap all placement groups in this pool,"
	    " this triggers large data movement,"
	    " pass --yes-i-really-mean-it if you really do.";
      return -EPERM;
    }
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hit_set_type") {
    if (val == "none")
      p.hit_set_params = HitSet::Params();
    else {
      int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
      if (err)
	return err;
      if (val == "bloom") {
	BloomHitSet::Params *bsp = new BloomHitSet::Params;
	bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
	p.hit_set_params = HitSet::Params(bsp);
      } else if (val == "explicit_hash")
	p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
      else if (val == "explicit_object")
	p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
      else {
	ss << "unrecognized hit_set type '" << val << "'";
	return -EINVAL;
      }
    }
  } else if (var == "hit_set_period") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    } else if (n < 0) {
      ss << "hit_set_period should be non-negative";
      return -EINVAL;
    }
    p.hit_set_period = n;
  } else if (var == "hit_set_count") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    } else if (n < 0) {
      ss << "hit_set_count should be non-negative";
      return -EINVAL;
    }
    p.hit_set_count = n;
  } else if (var == "hit_set_fpp") {
    if (floaterr.length()) {
      ss << "error parsing floating point value '" << val << "': " << floaterr;
      return -EINVAL;
    } else if (f < 0 || f > 1.0) {
      ss << "hit_set_fpp should be in the range 0..1";
      return -EINVAL;
    }
    // A false-positive probability only applies to a bloom hit set.
    if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
      ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
      return -EINVAL;
    }
    BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
    bloomp->set_fpp(f);
  } else if (var == "use_gmt_hitset") {
    // NOTE: this can only be enabled, never disabled.
    if (val == "true" || (interr.empty() && n == 1)) {
      p.use_gmt_hitset = true;
    } else {
      ss << "expecting value 'true' or '1'";
      return -EINVAL;
    }
  } else if (var == "allow_ec_overwrites") {
    if (!p.is_erasure()) {
      ss << "ec overwrites can only be enabled for an erasure coded pool";
      return -EINVAL;
    }
    stringstream err;
    if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites &&
	!is_pool_currently_all_bluestore(pool, p, &err)) {
      ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
      return -EINVAL;
    }
    if (val == "true" || (interr.empty() && n == 1)) {
      p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      // one-way flag: clearing it would break objects written with overwrites
      ss << "ec overwrites cannot be disabled once enabled";
      return -EINVAL;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "target_max_objects") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_objects = n;
  } else if (var == "target_max_bytes") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_bytes = n;
  } else if (var == "cache_target_dirty_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_dirty_ratio_micro = uf;
  } else if (var == "cache_target_dirty_high_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_dirty_high_ratio_micro = uf;
  } else if (var == "cache_target_full_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_full_ratio_micro = uf;
  } else if (var == "cache_min_flush_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_flush_age = n;
  } else if (var == "cache_min_evict_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_evict_age = n;
  } else if (var == "min_read_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_read_recency_for_promote = n;
  } else if (var == "hit_set_grade_decay_rate") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > 100 || n < 0) {
      ss << "value out of range,valid range is 0 - 100";
      return -EINVAL;
    }
    p.hit_set_grade_decay_rate = n;
  } else if (var == "hit_set_search_last_n") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > p.hit_set_count || n < 0) {
      ss << "value out of range,valid range is 0 - hit_set_count";
      return -EINVAL;
    }
    p.hit_set_search_last_n = n;
  } else if (var == "min_write_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_write_recency_for_promote = n;
  } else if (var == "fast_read") {
    if (p.is_replicated()) {
      ss << "fast read is not supported in replication pool";
      return -EINVAL;
    }
    if (val == "true" || (interr.empty() && n == 1)) {
      p.fast_read = true;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.fast_read = false;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (pool_opts_t::is_opt_name(var)) {
    // Generic per-pool options stored in p.opts; "unset" removes them.
    bool unset = val == "unset";
    if (var == "compression_mode") {
      if (!unset) {
	auto cmode = Compressor::get_comp_mode_type(val);
	if (!cmode) {
	  ss << "unrecognized compression mode '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "compression_algorithm") {
      if (!unset) {
	auto alg = Compressor::get_comp_alg_type(val);
	if (!alg) {
	  ss << "unrecognized compression_algorithm '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "compression_required_ratio") {
      if (floaterr.length()) {
	ss << "error parsing float value '" << val << "': " << floaterr;
	return -EINVAL;
      }
      if (f < 0 || f > 1) {
	ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
	return -EINVAL;
      }
    } else if (var == "csum_type") {
      auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
      if (t < 0 ) {
	ss << "unrecognized csum_type '" << val << "'";
	return -EINVAL;
      }
      //preserve csum_type numeric value
      n = t;
      interr.clear();
    } else if (var == "compression_max_blob_size" ||
	       var == "compression_min_blob_size" ||
	       var == "csum_max_block" ||
	       var == "csum_min_block") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
    } else if (var == "fingerprint_algorithm") {
      if (!unset) {
	auto alg = pg_pool_t::get_fingerprint_from_str(val);
	if (!alg) {
	  ss << "unrecognized fingerprint_algorithm '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "pg_num_min") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (n > (int)p.get_pg_num_target()) {
	ss << "specified pg_num_min " << n
	   << " > pg_num " << p.get_pg_num_target();
	return -EINVAL;
      }
    } else if (var == "recovery_priority") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (!g_conf()->debug_allow_any_pool_priority) {
	if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
	  ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
	     << " and " << OSD_POOL_PRIORITY_MAX;
	  return -EINVAL;
	}
      }
    } else if (var == "pg_autoscale_bias") {
      // NOTE(review): this branch does not check floaterr before using
      // f, unlike the other float-valued options above — confirm
      // non-numeric input is rejected elsewhere.
      if (f < 0.0 || f > 1000.0) {
	ss << "pg_autoscale_bias must be between 0 and 1000";
	return -EINVAL;
      }
    }

    // Store the validated value under the option's declared type.
    pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
    switch (desc.type) {
    case pool_opts_t::STR:
      if (unset) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<std::string>(val));
      }
      break;
    case pool_opts_t::INT:
      if (interr.length()) {
	ss << "error parsing integer value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (n == 0) {
	// zero unsets an integer option
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<int64_t>(n));
      }
      break;
    case pool_opts_t::DOUBLE:
      if (floaterr.length()) {
	ss << "error parsing floating point value '" << val << "': " << floaterr;
	return -EINVAL;
      }
      if (f == 0) {
	// zero unsets a float option
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<double>(f));
      }
      break;
    default:
      ceph_assert(!"unknown type");
    }
  } else {
    ss << "unrecognized variable '" << var << "'";
    return -EINVAL;
  }
  // Report and stage the modified pool for the next committed epoch.
  if (val != "unset") {
    ss << "set pool " << pool << " " << var << " to " << val;
  } else {
    ss << "unset pool " << pool << " " << var;
  }
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool] = p;
  return 0;
}
8098
c07f9fc5 8099int OSDMonitor::prepare_command_pool_application(const string &prefix,
11fdf7f2 8100 const cmdmap_t& cmdmap,
c07f9fc5 8101 stringstream& ss)
11fdf7f2
TL
8102{
8103 return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
8104}
8105
8106int OSDMonitor::preprocess_command_pool_application(const string &prefix,
8107 const cmdmap_t& cmdmap,
8108 stringstream& ss,
8109 bool *modified)
8110{
8111 return _command_pool_application(prefix, cmdmap, ss, modified, false);
8112}
8113
/**
 * Common logic for preprocess and prepare phases of pool application
 * tag commands. In preprocess mode we're only detecting invalid
 * commands, and determining whether it was a modification or a no-op.
 * In prepare mode we're actually updating the pending state.
 *
 * @param prefix    full command prefix; the suffix ("enable",
 *                  "disable", "set", "rm") selects the operation
 * @param cmdmap    parsed arguments ("pool", "app", "key", "value")
 * @param ss        status / error text for the client
 * @param modified  out: set to true when the pool would be changed
 *                  (may be nullptr)
 * @param preparing true in the prepare phase (mutates pending_inc)
 * @return 0 on success or idempotent no-op, negative errno otherwise
 */
int OSDMonitor::_command_pool_application(const string &prefix,
					  const cmdmap_t& cmdmap,
					  stringstream& ss,
					  bool *modified,
					  bool preparing)
{
  string pool_name;
  cmd_getval(cct, cmdmap, "pool", pool_name);
  int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
  if (pool < 0) {
    ss << "unrecognized pool '" << pool_name << "'";
    return -ENOENT;
  }

  // Work on a copy; in prepare mode start from any already-staged change.
  pg_pool_t p = *osdmap.get_pg_pool(pool);
  if (preparing) {
    if (pending_inc.new_pools.count(pool)) {
      p = pending_inc.new_pools[pool];
    }
  }

  string app;
  cmd_getval(cct, cmdmap, "app", app);
  bool app_exists = (p.application_metadata.count(app) > 0);

  // "all" is reserved (used as a wildcard by cap matching), so it may
  // not be used as a literal key or value.
  string key;
  cmd_getval(cct, cmdmap, "key", key);
  if (key == "all") {
    ss << "key cannot be 'all'";
    return -EINVAL;
  }

  string value;
  cmd_getval(cct, cmdmap, "value", value);
  if (value == "all") {
    ss << "value cannot be 'all'";
    return -EINVAL;
  }

  if (boost::algorithm::ends_with(prefix, "enable")) {
    if (app.empty()) {
      ss << "application name must be provided";
      return -EINVAL;
    }

    if (p.is_tier()) {
      ss << "application must be enabled on base tier";
      return -EINVAL;
    }

    // Enabling a second application is allowed but requires explicit
    // confirmation.
    bool force = false;
    cmd_getval(cct, cmdmap, "yes_i_really_mean_it", force);

    if (!app_exists && !p.application_metadata.empty() && !force) {
      ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
	 << "application; pass --yes-i-really-mean-it to proceed anyway";
      return -EPERM;
    }

    if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
      ss << "too many enabled applications on pool '" << pool_name << "'; "
	 << "max " << MAX_POOL_APPLICATIONS;
      return -EINVAL;
    }

    if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
      ss << "application name '" << app << "' too long; max length "
	 << MAX_POOL_APPLICATION_LENGTH;
      return -EINVAL;
    }

    if (!app_exists) {
      p.application_metadata[app] = {};
    }
    ss << "enabled application '" << app << "' on pool '" << pool_name << "'";

  } else if (boost::algorithm::ends_with(prefix, "disable")) {
    bool force = false;
    cmd_getval(cct, cmdmap, "yes_i_really_mean_it", force);

    if (!force) {
      ss << "Are you SURE? Disabling an application within a pool might result "
	 << "in loss of application functionality; pass "
	 << "--yes-i-really-mean-it to proceed anyway";
      return -EPERM;
    }

    if (!app_exists) {
      ss << "application '" << app << "' is not enabled on pool '" << pool_name
	 << "'";
      return 0; // idempotent
    }

    p.application_metadata.erase(app);
    ss << "disable application '" << app << "' on pool '" << pool_name << "'";

  } else if (boost::algorithm::ends_with(prefix, "set")) {
    if (p.is_tier()) {
      ss << "application metadata must be set on base tier";
      return -EINVAL;
    }

    if (!app_exists) {
      ss << "application '" << app << "' is not enabled on pool '" << pool_name
	 << "'";
      return -ENOENT;
    }

    // NOTE(review): this shadows the 'key' fetched above and re-reads
    // the same cmdmap field; value is identical to the outer copy.
    string key;
    cmd_getval(cct, cmdmap, "key", key);

    if (key.empty()) {
      ss << "key must be provided";
      return -EINVAL;
    }

    auto &app_keys = p.application_metadata[app];
    if (app_keys.count(key) == 0 &&
	app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
      ss << "too many keys set for application '" << app << "' on pool '"
	 << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
      return -EINVAL;
    }

    if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
      ss << "key '" << app << "' too long; max length "
	 << MAX_POOL_APPLICATION_LENGTH;
      return -EINVAL;
    }

    // NOTE(review): likewise shadows the outer 'value'.
    string value;
    cmd_getval(cct, cmdmap, "value", value);
    if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
      ss << "value '" << value << "' too long; max length "
	 << MAX_POOL_APPLICATION_LENGTH;
      return -EINVAL;
    }

    p.application_metadata[app][key] = value;
    ss << "set application '" << app << "' key '" << key << "' to '"
       << value << "' on pool '" << pool_name << "'";
  } else if (boost::algorithm::ends_with(prefix, "rm")) {
    if (!app_exists) {
      ss << "application '" << app << "' is not enabled on pool '" << pool_name
	 << "'";
      return -ENOENT;
    }

    // NOTE(review): shadows the outer 'key' (same fetched value).
    string key;
    cmd_getval(cct, cmdmap, "key", key);
    auto it = p.application_metadata[app].find(key);
    if (it == p.application_metadata[app].end()) {
      ss << "application '" << app << "' on pool '" << pool_name
	 << "' does not have key '" << key << "'";
      return 0; // idempotent
    }

    p.application_metadata[app].erase(it);
    ss << "removed application '" << app << "' key '" << key << "' on pool '"
       << pool_name << "'";
  } else {
    // the command dispatcher only routes the four suffixes above here
    ceph_abort();
  }

  if (preparing) {
    p.last_change = pending_inc.epoch;
    pending_inc.new_pools[pool] = p;
  }

  // Because we fell through this far, we didn't hit no-op cases,
  // so pool was definitely modified
  if (modified != nullptr) {
    *modified = true;
  }

  return 0;
}
8298
31f18b77
FG
8299int OSDMonitor::_prepare_command_osd_crush_remove(
8300 CrushWrapper &newcrush,
8301 int32_t id,
8302 int32_t ancestor,
8303 bool has_ancestor,
8304 bool unlink_only)
8305{
8306 int err = 0;
8307
8308 if (has_ancestor) {
11fdf7f2 8309 err = newcrush.remove_item_under(cct, id, ancestor,
31f18b77
FG
8310 unlink_only);
8311 } else {
11fdf7f2 8312 err = newcrush.remove_item(cct, id, unlink_only);
31f18b77
FG
8313 }
8314 return err;
8315}
8316
// Stage the modified crush map in the pending incremental: drop any
// previously staged encoding and re-encode with the feature bits
// common to the current monitor quorum.
void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
{
  pending_inc.crush.clear();
  newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
}
8322
8323int OSDMonitor::prepare_command_osd_crush_remove(
8324 CrushWrapper &newcrush,
8325 int32_t id,
8326 int32_t ancestor,
8327 bool has_ancestor,
8328 bool unlink_only)
8329{
8330 int err = _prepare_command_osd_crush_remove(
8331 newcrush, id, ancestor,
8332 has_ancestor, unlink_only);
8333
8334 if (err < 0)
8335 return err;
8336
11fdf7f2 8337 ceph_assert(err == 0);
31f18b77
FG
8338 do_osd_crush_remove(newcrush);
8339
8340 return 0;
8341}
8342
8343int OSDMonitor::prepare_command_osd_remove(int32_t id)
8344{
8345 if (osdmap.is_up(id)) {
8346 return -EBUSY;
8347 }
8348
8349 pending_inc.new_state[id] = osdmap.get_state(id);
8350 pending_inc.new_uuid[id] = uuid_d();
8351 pending_metadata_rm.insert(id);
8352 pending_metadata.erase(id);
8353
8354 return 0;
8355}
8356
8357int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
8358{
11fdf7f2 8359 ceph_assert(existing_id);
31f18b77
FG
8360 *existing_id = -1;
8361
8362 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
8363 if (!osdmap.exists(i) &&
8364 pending_inc.new_up_client.count(i) == 0 &&
8365 (pending_inc.new_state.count(i) == 0 ||
8366 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
8367 *existing_id = i;
8368 return -1;
8369 }
8370 }
8371
8372 if (pending_inc.new_max_osd < 0) {
8373 return osdmap.get_max_osd();
8374 }
8375 return pending_inc.new_max_osd;
8376}
8377
/**
 * Stage creation of an OSD in the pending incremental.
 *
 * Resolves the new OSD's id (reusing the id bound to 'uuid' if one
 * already exists, honoring an explicitly requested 'id', or allocating
 * one), optionally records a crush device class, and marks the id as
 * EXISTS|NEW in pending_inc.
 *
 * @param id           requested osd id, or negative to auto-pick
 * @param uuid         osd uuid; may be zero (legacy "osd create")
 * @param device_class crush device class to assign, or "" for none
 * @param new_id       out: the id actually chosen (never null)
 */
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const string& device_class,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  ceph_assert(new_id);

  // We presume validation has been performed prior to calling this
  // function. We assert with prejudice.

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    // If this uuid already maps to an osd id, reuse it (idempotent
    // replay of "osd new").
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      ceph_assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
	   << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    // Reusing a hole below max_osd; _allocate_osd_id returns exactly
    // one of (existing_id >= 0, allocated_id >= 0).
    ceph_assert(existing_id < osdmap.get_max_osd());
    ceph_assert(allocated_id < 0);
    pending_inc.new_weight[existing_id] = CEPH_OSD_OUT;
    *new_id = existing_id;
  } else if (allocated_id >= 0) {
    ceph_assert(existing_id < 0);
    // raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    ceph_assert(*new_id == allocated_id);
  } else {
    ceph_abort_msg("unexpected condition");
  }

out:
  if (device_class.size()) {
    // Record the device class for the new id in a scratch crush map
    // and stage it.
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (newcrush.get_max_devices() < *new_id + 1) {
      newcrush.set_max_devices(*new_id + 1);
    }
    string name = string("osd.") + stringify(*new_id);
    if (!newcrush.item_exists(*new_id)) {
      newcrush.set_item_name(*new_id, name);
    }
    ostringstream ss;
    int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
    if (r < 0) {
      derr << __func__ << " failed to set " << name << " device_class "
	   << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
	   << dendl;
      // non-fatal... this might be a replay and we want to be idempotent.
    } else {
      dout(20) << __func__ << " set " << name << " device_class " << device_class
	       << dendl;
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
  } else {
    dout(20) << __func__ << " no device_class" << dendl;
  }

  dout(10) << __func__ << " using id " << *new_id << dendl;
  // An explicitly requested id may exceed the current max; grow it.
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  pending_inc.new_state[*new_id] |= CEPH_OSD_EXISTS | CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
8466
8467int OSDMonitor::validate_osd_create(
8468 const int32_t id,
8469 const uuid_d& uuid,
8470 const bool check_osd_exists,
8471 int32_t* existing_id,
8472 stringstream& ss)
8473{
8474
8475 dout(10) << __func__ << " id " << id << " uuid " << uuid
8476 << " check_osd_exists " << check_osd_exists << dendl;
8477
11fdf7f2 8478 ceph_assert(existing_id);
31f18b77
FG
8479
8480 if (id < 0 && uuid.is_zero()) {
8481 // we have nothing to validate
8482 *existing_id = -1;
8483 return 0;
8484 } else if (uuid.is_zero()) {
8485 // we have an id but we will ignore it - because that's what
8486 // `osd create` does.
8487 return 0;
8488 }
8489
8490 /*
8491 * This function will be used to validate whether we are able to
8492 * create a new osd when the `uuid` is specified.
8493 *
8494 * It will be used by both `osd create` and `osd new`, as the checks
8495 * are basically the same when it pertains to osd id and uuid validation.
8496 * However, `osd create` presumes an `uuid` is optional, for legacy
8497 * reasons, while `osd new` requires the `uuid` to be provided. This
8498 * means that `osd create` will not be idempotent if an `uuid` is not
8499 * provided, but we will always guarantee the idempotency of `osd new`.
8500 */
8501
11fdf7f2 8502 ceph_assert(!uuid.is_zero());
31f18b77
FG
8503 if (pending_inc.identify_osd(uuid) >= 0) {
8504 // osd is about to exist
8505 return -EAGAIN;
8506 }
8507
8508 int32_t i = osdmap.identify_osd(uuid);
8509 if (i >= 0) {
8510 // osd already exists
8511 if (id >= 0 && i != id) {
8512 ss << "uuid " << uuid << " already in use for different id " << i;
8513 return -EEXIST;
8514 }
8515 // return a positive errno to distinguish between a blocking error
8516 // and an error we consider to not be a problem (i.e., this would be
8517 // an idempotent operation).
8518 *existing_id = i;
8519 return EEXIST;
8520 }
8521 // i < 0
8522 if (id >= 0) {
8523 if (pending_inc.new_state.count(id)) {
8524 // osd is about to exist
8525 return -EAGAIN;
8526 }
8527 // we may not care if an osd exists if we are recreating a previously
8528 // destroyed osd.
8529 if (check_osd_exists && osdmap.exists(id)) {
8530 ss << "id " << id << " already in use and does not match uuid "
8531 << uuid;
8532 return -EINVAL;
8533 }
8534 }
8535 return 0;
8536}
8537
8538int OSDMonitor::prepare_command_osd_create(
8539 const int32_t id,
8540 const uuid_d& uuid,
8541 int32_t* existing_id,
8542 stringstream& ss)
8543{
8544 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
11fdf7f2 8545 ceph_assert(existing_id);
b5b8bbf5
FG
8546 if (osdmap.is_destroyed(id)) {
8547 ss << "ceph osd create has been deprecated. Please use ceph osd new "
8548 "instead.";
8549 return -EINVAL;
8550 }
31f18b77
FG
8551
8552 if (uuid.is_zero()) {
8553 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
8554 }
8555
8556 return validate_osd_create(id, uuid, true, existing_id, ss);
8557}
8558
// Handle the `osd new` command: create a brand-new osd, or recreate a
// previously destroyed one, optionally registering cephx/lockbox/dm-crypt
// secrets supplied via `params`.  Returns 0 on success, EEXIST (positive)
// when the request is a no-op replay, or a negative errno on failure.
// NOTE(review): caller is expected to have plugged paxos (asserted below)
// and to propose the accumulated pending changes afterwards.
int OSDMonitor::prepare_command_osd_new(
    MonOpRequestRef op,
    const cmdmap_t& cmdmap,
    const map<string,string>& params,
    stringstream &ss,
    Formatter *f)
{
  uuid_d uuid;
  string uuidstr;
  int64_t id = -1;

  // multiple services (osdmon, authmon, config-key) may be updated in one
  // pass; paxos must be plugged so nothing proposes mid-way.
  ceph_assert(paxos->is_plugged());

  dout(10) << __func__ << " " << op << dendl;

  /* validate command. abort now if something's wrong. */

  /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
   *
   * If `id` is not specified, we will identify any existing osd based
   * on `uuid`. Operation will be idempotent iff secrets match.
   *
   * If `id` is specified, we will identify any existing osd based on
   * `uuid` and match against `id`. If they match, operation will be
   * idempotent iff secrets match.
   *
   * `-i secrets.json` will be optional. If supplied, will be used
   * to check for idempotency when `id` and `uuid` match.
   *
   * If `id` is not specified, and `uuid` does not exist, an id will
   * be found or allocated for the osd.
   *
   * If `id` is specified, and the osd has been previously marked
   * as destroyed, then the `id` will be reused.
   */
  if (!cmd_getval(cct, cmdmap, "uuid", uuidstr)) {
    ss << "requires the OSD's UUID to be specified.";
    return -EINVAL;
  } else if (!uuid.parse(uuidstr.c_str())) {
    ss << "invalid UUID value '" << uuidstr << "'.";
    return -EINVAL;
  }

  if (cmd_getval(cct, cmdmap, "id", id) &&
      (id < 0)) {
    ss << "invalid OSD id; must be greater or equal than zero.";
    return -EINVAL;
  }

  // are we running an `osd create`-like command, or recreating
  // a previously destroyed osd?

  bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));

  // we will care about `id` to assess whether osd is `destroyed`, or
  // to create a new osd.
  // we will need an `id` by the time we reach auth.

  int32_t existing_id = -1;
  // skip the "id already exists" check when recreating a destroyed osd,
  // since in that case the id is *expected* to exist.
  int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
				&existing_id, ss);

  bool may_be_idempotent = false;
  if (err == EEXIST) {
    // this is idempotent from the osdmon's point-of-view
    may_be_idempotent = true;
    ceph_assert(existing_id >= 0);
    id = existing_id;
  } else if (err < 0) {
    return err;
  }

  if (!may_be_idempotent) {
    // idempotency is out of the window. We are either creating a new
    // osd or recreating a destroyed osd.
    //
    // We now need to figure out if we have an `id` (and if it's valid),
    // of find an `id` if we don't have one.

    // NOTE: we need to consider the case where the `id` is specified for
    // `osd create`, and we must honor it. So this means checking if
    // the `id` is destroyed, and if so assume the destroy; otherwise,
    // check if it `exists` - in which case we complain about not being
    // `destroyed`. In the end, if nothing fails, we must allow the
    // creation, so that we are compatible with `create`.
    if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
      ss << "OSD " << id << " has not yet been destroyed";
      return -EINVAL;
    } else if (id < 0) {
      // find an `id`: _allocate_osd_id either hands back a previously
      // freed id (via existing_id) or a brand new one.
      id = _allocate_osd_id(&existing_id);
      if (id < 0) {
	ceph_assert(existing_id >= 0);
	id = existing_id;
      }
      dout(10) << __func__ << " found id " << id << " to use" << dendl;
    } else if (id >= 0 && osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " recreating osd." << id << dendl;
    } else {
      dout(10) << __func__ << " creating new osd." << id << dendl;
    }
  } else {
    // idempotent path: the osd must already be fully present.
    ceph_assert(id >= 0);
    ceph_assert(osdmap.exists(id));
  }

  // we are now able to either create a brand new osd or reuse an existing
  // osd that has been previously destroyed.

  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;

  if (may_be_idempotent && params.empty()) {
    // nothing to do, really.
    dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    // positive EEXIST tells the caller this was a harmless replay.
    return EEXIST;
  }

  string device_class;
  auto p = params.find("crush_device_class");
  if (p != params.end()) {
    device_class = p->second;
    dout(20) << __func__ << " device_class will be " << device_class << dendl;
  }
  string cephx_secret, lockbox_secret, dmcrypt_key;
  bool has_lockbox = false;
  bool has_secrets = params.count("cephx_secret")
    || params.count("cephx_lockbox_secret")
    || params.count("dmcrypt_key");

  ConfigKeyService *svc = nullptr;
  AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;

  if (has_secrets) {
    // any secret implies a cephx secret must be present.
    if (params.count("cephx_secret") == 0) {
      ss << "requires a cephx secret.";
      return -EINVAL;
    }
    cephx_secret = params.at("cephx_secret");

    bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
    bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);

    dout(10) << __func__ << " has lockbox " << has_lockbox_secret
	     << " dmcrypt " << has_dmcrypt_key << dendl;

    // lockbox secret and dm-crypt key come as a pair: both or neither.
    if (has_lockbox_secret && has_dmcrypt_key) {
      has_lockbox = true;
      lockbox_secret = params.at("cephx_lockbox_secret");
      dmcrypt_key = params.at("dmcrypt_key");
    } else if (!has_lockbox_secret != !has_dmcrypt_key) {
      ss << "requires both a cephx lockbox secret and a dm-crypt key.";
      return -EINVAL;
    }

    dout(10) << __func__ << " validate secrets using osd id " << id << dendl;

    err = mon->authmon()->validate_osd_new(id, uuid,
					   cephx_secret,
					   lockbox_secret,
					   cephx_entity,
					   lockbox_entity,
					   ss);
    if (err < 0) {
      return err;
    } else if (may_be_idempotent && err != EEXIST) {
      // for this to be idempotent, `id` should already be >= 0; no need
      // to use validate_id.
      ceph_assert(id >= 0);
      ss << "osd." << id << " exists but secrets do not match";
      return -EEXIST;
    }

    if (has_lockbox) {
      svc = (ConfigKeyService*)mon->config_key_service;
      err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
      if (err < 0) {
	return err;
      } else if (may_be_idempotent && err != EEXIST) {
	ceph_assert(id >= 0);
	ss << "osd." << id << " exists but dm-crypt key does not match.";
	return -EEXIST;
      }
    }
  }
  ceph_assert(!has_secrets || !cephx_secret.empty());
  ceph_assert(!has_lockbox || !lockbox_secret.empty());

  if (may_be_idempotent) {
    // we have nothing to do for either the osdmon or the authmon,
    // and we have no lockbox - so the config key service will not be
    // touched. This is therefore an idempotent operation, and we can
    // just return right away.
    dout(10) << __func__ << " idempotent -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }
  ceph_assert(!may_be_idempotent);

  // perform updates.
  if (has_secrets) {
    ceph_assert(!cephx_secret.empty());
    ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
		(!lockbox_secret.empty() && !dmcrypt_key.empty()));

    // from here on updates must not fail; entities were validated above.
    err = mon->authmon()->do_osd_new(cephx_entity,
				     lockbox_entity,
				     has_lockbox);
    ceph_assert(0 == err);

    if (has_lockbox) {
      ceph_assert(nullptr != svc);
      svc->do_osd_new(uuid, dmcrypt_key);
    }
  }

  if (is_recreate_destroyed) {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.is_destroyed(id));
    pending_inc.new_weight[id] = CEPH_OSD_OUT;
    // NOTE(review): new_state bits appear to be XORed into the osd's
    // state when the incremental is applied -- setting DESTROYED here
    // clears the flag on a currently-destroyed osd; confirm against
    // OSDMap::Incremental application.
    pending_inc.new_state[id] |= CEPH_OSD_DESTROYED;
    if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
      pending_inc.new_state[id] |= CEPH_OSD_NEW;
    }
    if (osdmap.get_state(id) & CEPH_OSD_UP) {
      // due to http://tracker.ceph.com/issues/20751 some clusters may
      // have UP set for non-existent OSDs; make sure it is cleared
      // for a newly created osd.
      pending_inc.new_state[id] |= CEPH_OSD_UP;
    }
    pending_inc.new_uuid[id] = uuid;
  } else {
    ceph_assert(id >= 0);
    int32_t new_id = -1;
    do_osd_create(id, uuid, device_class, &new_id);
    ceph_assert(new_id >= 0);
    ceph_assert(id == new_id);
  }

  // report the chosen id to the client (structured or plain).
  if (f) {
    f->open_object_section("created_osd");
    f->dump_int("osdid", id);
    f->close_section();
  } else {
    ss << id;
  }

  return 0;
}
8823
7c673cae
FG
8824bool OSDMonitor::prepare_command(MonOpRequestRef op)
8825{
8826 op->mark_osdmon_event(__func__);
8827 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
8828 stringstream ss;
11fdf7f2 8829 cmdmap_t cmdmap;
7c673cae
FG
8830 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
8831 string rs = ss.str();
8832 mon->reply_command(op, -EINVAL, rs, get_last_committed());
8833 return true;
8834 }
8835
11fdf7f2 8836 MonSession *session = op->get_session();
7c673cae 8837 if (!session) {
11fdf7f2 8838 derr << __func__ << " no session" << dendl;
7c673cae
FG
8839 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
8840 return true;
8841 }
8842
8843 return prepare_command_impl(op, cmdmap);
8844}
8845
8846static int parse_reweights(CephContext *cct,
11fdf7f2 8847 const cmdmap_t& cmdmap,
7c673cae
FG
8848 const OSDMap& osdmap,
8849 map<int32_t, uint32_t>* weights)
8850{
8851 string weights_str;
11fdf7f2 8852 if (!cmd_getval(cct, cmdmap, "weights", weights_str)) {
7c673cae
FG
8853 return -EINVAL;
8854 }
8855 std::replace(begin(weights_str), end(weights_str), '\'', '"');
8856 json_spirit::mValue json_value;
8857 if (!json_spirit::read(weights_str, json_value)) {
8858 return -EINVAL;
8859 }
8860 if (json_value.type() != json_spirit::obj_type) {
8861 return -EINVAL;
8862 }
8863 const auto obj = json_value.get_obj();
8864 try {
8865 for (auto& osd_weight : obj) {
8866 auto osd_id = std::stoi(osd_weight.first);
8867 if (!osdmap.exists(osd_id)) {
8868 return -ENOENT;
8869 }
8870 if (osd_weight.second.type() != json_spirit::str_type) {
8871 return -EINVAL;
8872 }
8873 auto weight = std::stoul(osd_weight.second.get_str());
8874 weights->insert({osd_id, weight});
8875 }
8876 } catch (const std::logic_error& e) {
8877 return -EINVAL;
8878 }
8879 return 0;
8880}
8881
31f18b77
FG
8882int OSDMonitor::prepare_command_osd_destroy(
8883 int32_t id,
8884 stringstream& ss)
8885{
11fdf7f2 8886 ceph_assert(paxos->is_plugged());
31f18b77
FG
8887
8888 // we check if the osd exists for the benefit of `osd purge`, which may
8889 // have previously removed the osd. If the osd does not exist, return
8890 // -ENOENT to convey this, and let the caller deal with it.
8891 //
8892 // we presume that all auth secrets and config keys were removed prior
8893 // to this command being called. if they exist by now, we also assume
8894 // they must have been created by some other command and do not pertain
8895 // to this non-existent osd.
8896 if (!osdmap.exists(id)) {
8897 dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
8898 return -ENOENT;
8899 }
8900
8901 uuid_d uuid = osdmap.get_uuid(id);
8902 dout(10) << __func__ << " destroying osd." << id
8903 << " uuid " << uuid << dendl;
8904
8905 // if it has been destroyed, we assume our work here is done.
8906 if (osdmap.is_destroyed(id)) {
8907 ss << "destroyed osd." << id;
8908 return 0;
8909 }
8910
8911 EntityName cephx_entity, lockbox_entity;
8912 bool idempotent_auth = false, idempotent_cks = false;
8913
8914 int err = mon->authmon()->validate_osd_destroy(id, uuid,
8915 cephx_entity,
8916 lockbox_entity,
8917 ss);
8918 if (err < 0) {
8919 if (err == -ENOENT) {
8920 idempotent_auth = true;
31f18b77
FG
8921 } else {
8922 return err;
8923 }
8924 }
8925
8926 ConfigKeyService *svc = (ConfigKeyService*)mon->config_key_service;
8927 err = svc->validate_osd_destroy(id, uuid);
8928 if (err < 0) {
11fdf7f2 8929 ceph_assert(err == -ENOENT);
31f18b77
FG
8930 err = 0;
8931 idempotent_cks = true;
8932 }
8933
8934 if (!idempotent_auth) {
8935 err = mon->authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
11fdf7f2 8936 ceph_assert(0 == err);
31f18b77
FG
8937 }
8938
8939 if (!idempotent_cks) {
8940 svc->do_osd_destroy(id, uuid);
8941 }
8942
8943 pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
8944 pending_inc.new_uuid[id] = uuid_d();
8945
8946 // we can only propose_pending() once per service, otherwise we'll be
8947 // defying PaxosService and all laws of nature. Therefore, as we may
8948 // be used during 'osd purge', let's keep the caller responsible for
8949 // proposing.
11fdf7f2 8950 ceph_assert(err == 0);
31f18b77
FG
8951 return 0;
8952}
8953
// Fully remove osd.`id`: crush removal, destroy (auth + config keys),
// and removal from the osdmap.  Returns 0 on success, -ENOENT when the
// osd is already gone and every step was a no-op, or a negative errno
// from an early validation failure.  Caller proposes the pending changes.
int OSDMonitor::prepare_command_osd_purge(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos->is_plugged());
  dout(10) << __func__ << " purging osd." << id << dendl;

  // an up osd must never be purged; callers enforce this before we run.
  ceph_assert(!osdmap.is_up(id));

  /*
   * This may look a bit weird, but this is what's going to happen:
   *
   *  1. we make sure that removing from crush works
   *  2. we call `prepare_command_osd_destroy()`. If it returns an
   *     error, then we abort the whole operation, as no updates
   *     have been made. However, we this function will have
   *     side-effects, thus we need to make sure that all operations
   *     performed henceforth will *always* succeed.
   *  3. we call `prepare_command_osd_remove()`. Although this
   *     function can return an error, it currently only checks if the
   *     osd is up - and we have made sure that it is not so, so there
   *     is no conflict, and it is effectively an update.
   *  4. finally, we call `do_osd_crush_remove()`, which will perform
   *     the crush update we delayed from before.
   */

  CrushWrapper newcrush;
  _get_pending_crush(newcrush);

  bool may_be_idempotent = false;

  // step 1: dry-run the crush removal on our private copy; the actual
  // commit is deferred to step 4.
  int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
  if (err == -ENOENT) {
    // not in crush -- possibly a replay of an earlier purge.
    err = 0;
    may_be_idempotent = true;
  } else if (err < 0) {
    ss << "error removing osd." << id << " from crush";
    return err;
  }

  // step 2:
  // no point destroying the osd again if it has already been marked destroyed
  if (!osdmap.is_destroyed(id)) {
    err = prepare_command_osd_destroy(id, ss);
    if (err < 0) {
      if (err == -ENOENT) {
	err = 0;
      } else {
	return err;
      }
    } else {
      // destroy actually did work, so this run is not a pure replay.
      may_be_idempotent = false;
    }
  }
  ceph_assert(0 == err);

  if (may_be_idempotent && !osdmap.exists(id)) {
    // every step was a no-op and the osd is gone: report as such.
    dout(10) << __func__ << " osd." << id << " does not exist and "
	     << "we are idempotent." << dendl;
    return -ENOENT;
  }

  // step 3: drop the osd from the map.
  err = prepare_command_osd_remove(id);
  // we should not be busy, as we should have made sure this id is not up.
  ceph_assert(0 == err);

  // step 4: commit the crush change validated in step 1.
  do_osd_crush_remove(newcrush);
  return 0;
}
9022
7c673cae 9023bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
11fdf7f2 9024 const cmdmap_t& cmdmap)
7c673cae
FG
9025{
9026 op->mark_osdmon_event(__func__);
9027 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
9028 bool ret = false;
9029 stringstream ss;
9030 string rs;
9031 bufferlist rdata;
9032 int err = 0;
9033
9034 string format;
11fdf7f2 9035 cmd_getval(cct, cmdmap, "format", format, string("plain"));
7c673cae
FG
9036 boost::scoped_ptr<Formatter> f(Formatter::create(format));
9037
9038 string prefix;
11fdf7f2 9039 cmd_getval(cct, cmdmap, "prefix", prefix);
7c673cae
FG
9040
9041 int64_t osdid;
11fdf7f2 9042 string osd_name;
b32b8144
FG
9043 bool osdid_present = false;
9044 if (prefix != "osd pg-temp" &&
9045 prefix != "osd pg-upmap" &&
9046 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
11fdf7f2 9047 osdid_present = cmd_getval(cct, cmdmap, "id", osdid);
b32b8144 9048 }
7c673cae
FG
9049 if (osdid_present) {
9050 ostringstream oss;
9051 oss << "osd." << osdid;
11fdf7f2 9052 osd_name = oss.str();
7c673cae
FG
9053 }
9054
9055 // Even if there's a pending state with changes that could affect
9056 // a command, considering that said state isn't yet committed, we
9057 // just don't care about those changes if the command currently being
9058 // handled acts as a no-op against the current committed state.
9059 // In a nutshell, we assume this command happens *before*.
9060 //
9061 // Let me make this clearer:
9062 //
9063 // - If we have only one client, and that client issues some
9064 // operation that would conflict with this operation but is
9065 // still on the pending state, then we would be sure that said
9066 // operation wouldn't have returned yet, so the client wouldn't
9067 // issue this operation (unless the client didn't wait for the
9068 // operation to finish, and that would be the client's own fault).
9069 //
9070 // - If we have more than one client, each client will observe
9071 // whatever is the state at the moment of the commit. So, if we
9072 // have two clients, one issuing an unlink and another issuing a
9073 // link, and if the link happens while the unlink is still on the
9074 // pending state, from the link's point-of-view this is a no-op.
9075 // If different clients are issuing conflicting operations and
9076 // they care about that, then the clients should make sure they
9077 // enforce some kind of concurrency mechanism -- from our
9078 // perspective that's what Douglas Adams would call an SEP.
9079 //
9080 // This should be used as a general guideline for most commands handled
9081 // in this function. Adapt as you see fit, but please bear in mind that
9082 // this is the expected behavior.
9083
9084
9085 if (prefix == "osd setcrushmap" ||
9086 (prefix == "osd crush set" && !osdid_present)) {
31f18b77
FG
9087 if (pending_inc.crush.length()) {
9088 dout(10) << __func__ << " waiting for pending crush update " << dendl;
9089 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9090 return true;
9091 }
7c673cae
FG
9092 dout(10) << "prepare_command setting new crush map" << dendl;
9093 bufferlist data(m->get_data());
9094 CrushWrapper crush;
9095 try {
11fdf7f2 9096 auto bl = data.cbegin();
7c673cae
FG
9097 crush.decode(bl);
9098 }
9099 catch (const std::exception &e) {
9100 err = -EINVAL;
9101 ss << "Failed to parse crushmap: " << e.what();
9102 goto reply;
9103 }
31f18b77
FG
9104
9105 int64_t prior_version = 0;
11fdf7f2 9106 if (cmd_getval(cct, cmdmap, "prior_version", prior_version)) {
31f18b77
FG
9107 if (prior_version == osdmap.get_crush_version() - 1) {
9108 // see if we are a resend of the last update. this is imperfect
9109 // (multiple racing updaters may not both get reliable success)
9110 // but we expect crush updaters (via this interface) to be rare-ish.
9111 bufferlist current, proposed;
9112 osdmap.crush->encode(current, mon->get_quorum_con_features());
9113 crush.encode(proposed, mon->get_quorum_con_features());
9114 if (current.contents_equal(proposed)) {
9115 dout(10) << __func__
9116 << " proposed matches current and version equals previous"
9117 << dendl;
9118 err = 0;
9119 ss << osdmap.get_crush_version();
9120 goto reply;
9121 }
9122 }
9123 if (prior_version != osdmap.get_crush_version()) {
9124 err = -EPERM;
9125 ss << "prior_version " << prior_version << " != crush version "
9126 << osdmap.get_crush_version();
9127 goto reply;
9128 }
9129 }
7c673cae 9130
3efd9988 9131 if (crush.has_legacy_rule_ids()) {
31f18b77
FG
9132 err = -EINVAL;
9133 ss << "crush maps with ruleset != ruleid are no longer allowed";
9134 goto reply;
9135 }
7c673cae
FG
9136 if (!validate_crush_against_features(&crush, ss)) {
9137 err = -EINVAL;
9138 goto reply;
9139 }
31f18b77 9140
3efd9988
FG
9141 err = osdmap.validate_crush_rules(&crush, &ss);
9142 if (err < 0) {
9143 goto reply;
7c673cae
FG
9144 }
9145
11fdf7f2 9146 if (g_conf()->mon_osd_crush_smoke_test) {
224ce89b
WB
9147 // sanity check: test some inputs to make sure this map isn't
9148 // totally broken
9149 dout(10) << " testing map" << dendl;
9150 stringstream ess;
9151 CrushTester tester(crush, ess);
b5b8bbf5 9152 tester.set_min_x(0);
224ce89b 9153 tester.set_max_x(50);
b5b8bbf5 9154 auto start = ceph::coarse_mono_clock::now();
11fdf7f2 9155 int r = tester.test_with_fork(g_conf()->mon_lease);
b5b8bbf5 9156 auto duration = ceph::coarse_mono_clock::now() - start;
224ce89b
WB
9157 if (r < 0) {
9158 dout(10) << " tester.test_with_fork returns " << r
9159 << ": " << ess.str() << dendl;
9160 ss << "crush smoke test failed with " << r << ": " << ess.str();
9161 err = r;
9162 goto reply;
9163 }
b5b8bbf5
FG
9164 dout(10) << __func__ << " crush somke test duration: "
9165 << duration << ", result: " << ess.str() << dendl;
7c673cae
FG
9166 }
9167
7c673cae 9168 pending_inc.crush = data;
31f18b77 9169 ss << osdmap.get_crush_version() + 1;
7c673cae
FG
9170 goto update;
9171
3efd9988
FG
9172 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
9173 CrushWrapper newcrush;
9174 _get_pending_crush(newcrush);
9175 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
9176 int bid = -1 - b;
9177 if (newcrush.bucket_exists(bid) &&
11fdf7f2 9178 newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
3efd9988
FG
9179 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
9180 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
9181 }
9182 }
9183 if (!validate_crush_against_features(&newcrush, ss)) {
9184 err = -EINVAL;
9185 goto reply;
9186 }
9187 pending_inc.crush.clear();
9188 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9189 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9190 get_last_committed() + 1));
9191 return true;
7c673cae 9192 } else if (prefix == "osd crush set-device-class") {
7c673cae 9193 string device_class;
11fdf7f2 9194 if (!cmd_getval(cct, cmdmap, "class", device_class)) {
7c673cae
FG
9195 err = -EINVAL; // no value!
9196 goto reply;
9197 }
9198
224ce89b
WB
9199 bool stop = false;
9200 vector<string> idvec;
11fdf7f2 9201 cmd_getval(cct, cmdmap, "ids", idvec);
7c673cae
FG
9202 CrushWrapper newcrush;
9203 _get_pending_crush(newcrush);
224ce89b
WB
9204 set<int> updated;
9205 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9206 set<int> osds;
9207 // wildcard?
9208 if (j == 0 &&
9209 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9210 osdmap.get_all_osds(osds);
9211 stop = true;
9212 } else {
9213 // try traditional single osd way
9214 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9215 if (osd < 0) {
9216 // ss has reason for failure
9217 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9218 err = -EINVAL;
9219 continue;
9220 }
9221 osds.insert(osd);
9222 }
7c673cae 9223
224ce89b
WB
9224 for (auto &osd : osds) {
9225 if (!osdmap.exists(osd)) {
9226 ss << "osd." << osd << " does not exist. ";
9227 continue;
9228 }
7c673cae 9229
224ce89b
WB
9230 ostringstream oss;
9231 oss << "osd." << osd;
9232 string name = oss.str();
7c673cae 9233
3a9019d9
FG
9234 if (newcrush.get_max_devices() < osd + 1) {
9235 newcrush.set_max_devices(osd + 1);
9236 }
224ce89b
WB
9237 string action;
9238 if (newcrush.item_exists(osd)) {
9239 action = "updating";
9240 } else {
9241 action = "creating";
9242 newcrush.set_item_name(osd, name);
9243 }
7c673cae 9244
224ce89b
WB
9245 dout(5) << action << " crush item id " << osd << " name '" << name
9246 << "' device_class '" << device_class << "'"
9247 << dendl;
9248 err = newcrush.update_device_class(osd, device_class, name, &ss);
9249 if (err < 0) {
9250 goto reply;
9251 }
9252 if (err == 0 && !_have_pending_crush()) {
9253 if (!stop) {
9254 // for single osd only, wildcard makes too much noise
9255 ss << "set-device-class item id " << osd << " name '" << name
11fdf7f2 9256 << "' device_class '" << device_class << "': no change. ";
224ce89b
WB
9257 }
9258 } else {
9259 updated.insert(osd);
9260 }
9261 }
7c673cae
FG
9262 }
9263
224ce89b
WB
9264 if (!updated.empty()) {
9265 pending_inc.crush.clear();
9266 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9267 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
9268 getline(ss, rs);
9269 wait_for_finished_proposal(op,
9270 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
9271 return true;
9272 }
7c673cae 9273
c07f9fc5
FG
9274 } else if (prefix == "osd crush rm-device-class") {
9275 bool stop = false;
9276 vector<string> idvec;
11fdf7f2 9277 cmd_getval(cct, cmdmap, "ids", idvec);
c07f9fc5
FG
9278 CrushWrapper newcrush;
9279 _get_pending_crush(newcrush);
9280 set<int> updated;
9281
9282 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9283 set<int> osds;
9284
9285 // wildcard?
9286 if (j == 0 &&
9287 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9288 osdmap.get_all_osds(osds);
9289 stop = true;
9290 } else {
9291 // try traditional single osd way
9292 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9293 if (osd < 0) {
9294 // ss has reason for failure
9295 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9296 err = -EINVAL;
9297 goto reply;
9298 }
9299 osds.insert(osd);
9300 }
9301
9302 for (auto &osd : osds) {
9303 if (!osdmap.exists(osd)) {
9304 ss << "osd." << osd << " does not exist. ";
9305 continue;
9306 }
9307
9308 auto class_name = newcrush.get_item_class(osd);
c07f9fc5
FG
9309 if (!class_name) {
9310 ss << "osd." << osd << " belongs to no class, ";
9311 continue;
9312 }
9313 // note that we do not verify if class_is_in_use here
9314 // in case the device is misclassified and user wants
9315 // to overridely reset...
9316
11fdf7f2 9317 err = newcrush.remove_device_class(cct, osd, &ss);
c07f9fc5
FG
9318 if (err < 0) {
9319 // ss has reason for failure
9320 goto reply;
9321 }
9322 updated.insert(osd);
9323 }
9324 }
9325
9326 if (!updated.empty()) {
9327 pending_inc.crush.clear();
9328 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9329 ss << "done removing class of osd(s): " << updated;
9330 getline(ss, rs);
9331 wait_for_finished_proposal(op,
9332 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
9333 return true;
9334 }
11fdf7f2
TL
9335 } else if (prefix == "osd crush class create") {
9336 string device_class;
9337 if (!cmd_getval(g_ceph_context, cmdmap, "class", device_class)) {
9338 err = -EINVAL; // no value!
9339 goto reply;
9340 }
9341 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
9342 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
9343 << "luminous' before using crush device classes";
9344 err = -EPERM;
9345 goto reply;
9346 }
9347 if (!_have_pending_crush() &&
9348 _get_stable_crush().class_exists(device_class)) {
9349 ss << "class '" << device_class << "' already exists";
9350 goto reply;
9351 }
9352 CrushWrapper newcrush;
9353 _get_pending_crush(newcrush);
9354 if (newcrush.class_exists(device_class)) {
9355 ss << "class '" << device_class << "' already exists";
9356 goto update;
9357 }
9358 int class_id = newcrush.get_or_create_class_id(device_class);
9359 pending_inc.crush.clear();
9360 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9361 ss << "created class " << device_class << " with id " << class_id
9362 << " to crush map";
9363 goto update;
9364 } else if (prefix == "osd crush class rm") {
9365 string device_class;
9366 if (!cmd_getval(g_ceph_context, cmdmap, "class", device_class)) {
9367 err = -EINVAL; // no value!
9368 goto reply;
9369 }
9370 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
9371 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
9372 << "luminous' before using crush device classes";
9373 err = -EPERM;
9374 goto reply;
9375 }
9376
9377 if (!osdmap.crush->class_exists(device_class)) {
9378 err = 0;
9379 goto reply;
9380 }
9381
9382 CrushWrapper newcrush;
9383 _get_pending_crush(newcrush);
9384 if (!newcrush.class_exists(device_class)) {
9385 err = 0; // make command idempotent
9386 goto wait;
9387 }
9388 int class_id = newcrush.get_class_id(device_class);
9389 stringstream ts;
9390 if (newcrush.class_is_in_use(class_id, &ts)) {
9391 err = -EBUSY;
9392 ss << "class '" << device_class << "' " << ts.str();
9393 goto reply;
9394 }
9395
9396 // check if class is used by any erasure-code-profiles
9397 mempool::osdmap::map<string,map<string,string>> old_ec_profiles =
9398 osdmap.get_erasure_code_profiles();
9399 auto ec_profiles = pending_inc.get_erasure_code_profiles();
9400#ifdef HAVE_STDLIB_MAP_SPLICING
9401 ec_profiles.merge(old_ec_profiles);
9402#else
9403 ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)),
9404 make_move_iterator(end(old_ec_profiles)));
9405#endif
9406 list<string> referenced_by;
9407 for (auto &i: ec_profiles) {
9408 for (auto &j: i.second) {
9409 if ("crush-device-class" == j.first && device_class == j.second) {
9410 referenced_by.push_back(i.first);
9411 }
9412 }
9413 }
9414 if (!referenced_by.empty()) {
9415 err = -EBUSY;
9416 ss << "class '" << device_class
9417 << "' is still referenced by erasure-code-profile(s): " << referenced_by;
9418 goto reply;
9419 }
9420
9421 set<int> osds;
9422 newcrush.get_devices_by_class(device_class, &osds);
9423 for (auto& p: osds) {
9424 err = newcrush.remove_device_class(g_ceph_context, p, &ss);
9425 if (err < 0) {
9426 // ss has reason for failure
9427 goto reply;
9428 }
9429 }
9430
9431 if (osds.empty()) {
9432 // empty class, remove directly
9433 err = newcrush.remove_class_name(device_class);
9434 if (err < 0) {
9435 ss << "class '" << device_class << "' cannot be removed '"
9436 << cpp_strerror(err) << "'";
9437 goto reply;
9438 }
9439 }
9440
9441 pending_inc.crush.clear();
9442 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9443 ss << "removed class " << device_class << " with id " << class_id
9444 << " from crush map";
9445 goto update;
35e4c445
FG
9446 } else if (prefix == "osd crush class rename") {
9447 string srcname, dstname;
11fdf7f2 9448 if (!cmd_getval(cct, cmdmap, "srcname", srcname)) {
35e4c445
FG
9449 err = -EINVAL;
9450 goto reply;
9451 }
11fdf7f2 9452 if (!cmd_getval(cct, cmdmap, "dstname", dstname)) {
35e4c445
FG
9453 err = -EINVAL;
9454 goto reply;
9455 }
9456
9457 CrushWrapper newcrush;
9458 _get_pending_crush(newcrush);
181888fb
FG
9459 if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
9460 // suppose this is a replay and return success
9461 // so command is idempotent
9462 ss << "already renamed to '" << dstname << "'";
9463 err = 0;
35e4c445
FG
9464 goto reply;
9465 }
c07f9fc5 9466
35e4c445
FG
9467 err = newcrush.rename_class(srcname, dstname);
9468 if (err < 0) {
9469 ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
9470 << cpp_strerror(err);
9471 goto reply;
9472 }
9473
9474 pending_inc.crush.clear();
9475 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9476 ss << "rename class '" << srcname << "' to '" << dstname << "'";
9477 goto update;
7c673cae
FG
9478 } else if (prefix == "osd crush add-bucket") {
9479 // os crush add-bucket <name> <type>
9480 string name, typestr;
11fdf7f2
TL
9481 vector<string> argvec;
9482 cmd_getval(cct, cmdmap, "name", name);
9483 cmd_getval(cct, cmdmap, "type", typestr);
9484 cmd_getval(cct, cmdmap, "args", argvec);
9485 map<string,string> loc;
9486 if (!argvec.empty()) {
9487 CrushWrapper::parse_loc_map(argvec, &loc);
9488 dout(0) << "will create and move bucket '" << name
9489 << "' to location " << loc << dendl;
9490 }
7c673cae
FG
9491
9492 if (!_have_pending_crush() &&
9493 _get_stable_crush().name_exists(name)) {
9494 ss << "bucket '" << name << "' already exists";
9495 goto reply;
9496 }
9497
9498 CrushWrapper newcrush;
9499 _get_pending_crush(newcrush);
9500
9501 if (newcrush.name_exists(name)) {
9502 ss << "bucket '" << name << "' already exists";
9503 goto update;
9504 }
9505 int type = newcrush.get_type_id(typestr);
9506 if (type < 0) {
9507 ss << "type '" << typestr << "' does not exist";
9508 err = -EINVAL;
9509 goto reply;
9510 }
9511 if (type == 0) {
9512 ss << "type '" << typestr << "' is for devices, not buckets";
9513 err = -EINVAL;
9514 goto reply;
9515 }
9516 int bucketno;
9517 err = newcrush.add_bucket(0, 0,
9518 CRUSH_HASH_DEFAULT, type, 0, NULL,
9519 NULL, &bucketno);
9520 if (err < 0) {
9521 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
9522 goto reply;
9523 }
9524 err = newcrush.set_item_name(bucketno, name);
9525 if (err < 0) {
9526 ss << "error setting bucket name to '" << name << "'";
9527 goto reply;
9528 }
9529
11fdf7f2
TL
9530 if (!loc.empty()) {
9531 if (!newcrush.check_item_loc(cct, bucketno, loc,
9532 (int *)NULL)) {
9533 err = newcrush.move_bucket(cct, bucketno, loc);
9534 if (err < 0) {
9535 ss << "error moving bucket '" << name << "' to location " << loc;
9536 goto reply;
9537 }
9538 } else {
9539 ss << "no need to move item id " << bucketno << " name '" << name
9540 << "' to location " << loc << " in crush map";
9541 }
9542 }
9543
7c673cae
FG
9544 pending_inc.crush.clear();
9545 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
11fdf7f2
TL
9546 if (loc.empty()) {
9547 ss << "added bucket " << name << " type " << typestr
9548 << " to crush map";
9549 } else {
9550 ss << "added bucket " << name << " type " << typestr
9551 << " to location " << loc;
9552 }
7c673cae
FG
9553 goto update;
9554 } else if (prefix == "osd crush rename-bucket") {
9555 string srcname, dstname;
11fdf7f2
TL
9556 cmd_getval(cct, cmdmap, "srcname", srcname);
9557 cmd_getval(cct, cmdmap, "dstname", dstname);
7c673cae
FG
9558
9559 err = crush_rename_bucket(srcname, dstname, &ss);
9560 if (err == -EALREADY) // equivalent to success for idempotency
9561 err = 0;
9562 if (err)
9563 goto reply;
9564 else
9565 goto update;
c07f9fc5
FG
9566 } else if (prefix == "osd crush weight-set create" ||
9567 prefix == "osd crush weight-set create-compat") {
9568 CrushWrapper newcrush;
9569 _get_pending_crush(newcrush);
9570 int64_t pool;
9571 int positions;
9572 if (newcrush.has_non_straw2_buckets()) {
9573 ss << "crush map contains one or more bucket(s) that are not straw2";
224ce89b
WB
9574 err = -EPERM;
9575 goto reply;
9576 }
c07f9fc5
FG
9577 if (prefix == "osd crush weight-set create") {
9578 if (osdmap.require_min_compat_client > 0 &&
9579 osdmap.require_min_compat_client < CEPH_RELEASE_LUMINOUS) {
9580 ss << "require_min_compat_client "
9581 << ceph_release_name(osdmap.require_min_compat_client)
9582 << " < luminous, which is required for per-pool weight-sets. "
9583 << "Try 'ceph osd set-require-min-compat-client luminous' "
9584 << "before using the new interface";
9585 err = -EPERM;
9586 goto reply;
9587 }
9588 string poolname, mode;
11fdf7f2 9589 cmd_getval(cct, cmdmap, "pool", poolname);
c07f9fc5
FG
9590 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
9591 if (pool < 0) {
9592 ss << "pool '" << poolname << "' not found";
9593 err = -ENOENT;
9594 goto reply;
9595 }
11fdf7f2 9596 cmd_getval(cct, cmdmap, "mode", mode);
c07f9fc5
FG
9597 if (mode != "flat" && mode != "positional") {
9598 ss << "unrecognized weight-set mode '" << mode << "'";
9599 err = -EINVAL;
9600 goto reply;
9601 }
9602 positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
9603 } else {
9604 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
9605 positions = 1;
224ce89b 9606 }
11fdf7f2
TL
9607 if (!newcrush.create_choose_args(pool, positions)) {
9608 if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
9609 ss << "compat weight-set already created";
9610 } else {
9611 ss << "weight-set for pool '" << osdmap.get_pool_name(pool)
9612 << "' already created";
9613 }
9614 goto reply;
9615 }
c07f9fc5
FG
9616 pending_inc.crush.clear();
9617 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9618 goto update;
224ce89b 9619
c07f9fc5
FG
9620 } else if (prefix == "osd crush weight-set rm" ||
9621 prefix == "osd crush weight-set rm-compat") {
224ce89b
WB
9622 CrushWrapper newcrush;
9623 _get_pending_crush(newcrush);
c07f9fc5
FG
9624 int64_t pool;
9625 if (prefix == "osd crush weight-set rm") {
9626 string poolname;
11fdf7f2 9627 cmd_getval(cct, cmdmap, "pool", poolname);
c07f9fc5
FG
9628 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
9629 if (pool < 0) {
9630 ss << "pool '" << poolname << "' not found";
9631 err = -ENOENT;
9632 goto reply;
9633 }
9634 } else {
9635 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
224ce89b 9636 }
c07f9fc5
FG
9637 newcrush.rm_choose_args(pool);
9638 pending_inc.crush.clear();
9639 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9640 goto update;
224ce89b 9641
c07f9fc5
FG
9642 } else if (prefix == "osd crush weight-set reweight" ||
9643 prefix == "osd crush weight-set reweight-compat") {
9644 string poolname, item;
9645 vector<double> weight;
11fdf7f2
TL
9646 cmd_getval(cct, cmdmap, "pool", poolname);
9647 cmd_getval(cct, cmdmap, "item", item);
9648 cmd_getval(cct, cmdmap, "weight", weight);
c07f9fc5
FG
9649 CrushWrapper newcrush;
9650 _get_pending_crush(newcrush);
9651 int64_t pool;
9652 if (prefix == "osd crush weight-set reweight") {
9653 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
9654 if (pool < 0) {
9655 ss << "pool '" << poolname << "' not found";
9656 err = -ENOENT;
9657 goto reply;
9658 }
9659 if (!newcrush.have_choose_args(pool)) {
9660 ss << "no weight-set for pool '" << poolname << "'";
9661 err = -ENOENT;
9662 goto reply;
9663 }
9664 auto arg_map = newcrush.choose_args_get(pool);
9665 int positions = newcrush.get_choose_args_positions(arg_map);
9666 if (weight.size() != (size_t)positions) {
9667 ss << "must specify exact " << positions << " weight values";
9668 err = -EINVAL;
9669 goto reply;
9670 }
9671 } else {
9672 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
9673 if (!newcrush.have_choose_args(pool)) {
9674 ss << "no backward-compatible weight-set";
9675 err = -ENOENT;
9676 goto reply;
9677 }
224ce89b 9678 }
c07f9fc5
FG
9679 if (!newcrush.name_exists(item)) {
9680 ss << "item '" << item << "' does not exist";
9681 err = -ENOENT;
224ce89b
WB
9682 goto reply;
9683 }
c07f9fc5 9684 err = newcrush.choose_args_adjust_item_weightf(
11fdf7f2 9685 cct,
c07f9fc5
FG
9686 newcrush.choose_args_get(pool),
9687 newcrush.get_item_id(item),
9688 weight,
9689 &ss);
224ce89b 9690 if (err < 0) {
224ce89b
WB
9691 goto reply;
9692 }
c07f9fc5 9693 err = 0;
224ce89b
WB
9694 pending_inc.crush.clear();
9695 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
224ce89b 9696 goto update;
7c673cae
FG
9697 } else if (osdid_present &&
9698 (prefix == "osd crush set" || prefix == "osd crush add")) {
9699 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
9700 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
9701 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
9702
9703 if (!osdmap.exists(osdid)) {
9704 err = -ENOENT;
11fdf7f2
TL
9705 ss << osd_name
9706 << " does not exist. Create it before updating the crush map";
7c673cae
FG
9707 goto reply;
9708 }
9709
9710 double weight;
11fdf7f2 9711 if (!cmd_getval(cct, cmdmap, "weight", weight)) {
7c673cae 9712 ss << "unable to parse weight value '"
11fdf7f2 9713 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
9714 err = -EINVAL;
9715 goto reply;
9716 }
9717
9718 string args;
9719 vector<string> argvec;
11fdf7f2 9720 cmd_getval(cct, cmdmap, "args", argvec);
7c673cae
FG
9721 map<string,string> loc;
9722 CrushWrapper::parse_loc_map(argvec, &loc);
9723
9724 if (prefix == "osd crush set"
9725 && !_get_stable_crush().item_exists(osdid)) {
9726 err = -ENOENT;
11fdf7f2 9727 ss << "unable to set item id " << osdid << " name '" << osd_name
7c673cae
FG
9728 << "' weight " << weight << " at location " << loc
9729 << ": does not exist";
9730 goto reply;
9731 }
9732
9733 dout(5) << "adding/updating crush item id " << osdid << " name '"
11fdf7f2 9734 << osd_name << "' weight " << weight << " at location "
7c673cae
FG
9735 << loc << dendl;
9736 CrushWrapper newcrush;
9737 _get_pending_crush(newcrush);
9738
9739 string action;
9740 if (prefix == "osd crush set" ||
11fdf7f2 9741 newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) {
7c673cae 9742 action = "set";
11fdf7f2 9743 err = newcrush.update_item(cct, osdid, weight, osd_name, loc);
7c673cae
FG
9744 } else {
9745 action = "add";
11fdf7f2 9746 err = newcrush.insert_item(cct, osdid, weight, osd_name, loc);
7c673cae
FG
9747 if (err == 0)
9748 err = 1;
9749 }
9750
9751 if (err < 0)
9752 goto reply;
9753
9754 if (err == 0 && !_have_pending_crush()) {
11fdf7f2
TL
9755 ss << action << " item id " << osdid << " name '" << osd_name
9756 << "' weight " << weight << " at location " << loc << ": no change";
7c673cae
FG
9757 goto reply;
9758 }
9759
9760 pending_inc.crush.clear();
9761 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
11fdf7f2
TL
9762 ss << action << " item id " << osdid << " name '" << osd_name << "' weight "
9763 << weight << " at location " << loc << " to crush map";
7c673cae
FG
9764 getline(ss, rs);
9765 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9766 get_last_committed() + 1));
9767 return true;
9768
9769 } else if (prefix == "osd crush create-or-move") {
9770 do {
9771 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
9772 if (!osdmap.exists(osdid)) {
9773 err = -ENOENT;
11fdf7f2
TL
9774 ss << osd_name
9775 << " does not exist. create it before updating the crush map";
7c673cae
FG
9776 goto reply;
9777 }
9778
9779 double weight;
11fdf7f2 9780 if (!cmd_getval(cct, cmdmap, "weight", weight)) {
7c673cae 9781 ss << "unable to parse weight value '"
11fdf7f2 9782 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
9783 err = -EINVAL;
9784 goto reply;
9785 }
9786
9787 string args;
9788 vector<string> argvec;
11fdf7f2 9789 cmd_getval(cct, cmdmap, "args", argvec);
7c673cae
FG
9790 map<string,string> loc;
9791 CrushWrapper::parse_loc_map(argvec, &loc);
9792
11fdf7f2
TL
9793 dout(0) << "create-or-move crush item name '" << osd_name
9794 << "' initial_weight " << weight << " at location " << loc
9795 << dendl;
7c673cae
FG
9796
9797 CrushWrapper newcrush;
9798 _get_pending_crush(newcrush);
9799
11fdf7f2
TL
9800 err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc,
9801 g_conf()->osd_crush_update_weight_set);
7c673cae 9802 if (err == 0) {
11fdf7f2
TL
9803 ss << "create-or-move updated item name '" << osd_name
9804 << "' weight " << weight
7c673cae
FG
9805 << " at location " << loc << " to crush map";
9806 break;
9807 }
9808 if (err > 0) {
9809 pending_inc.crush.clear();
9810 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
11fdf7f2
TL
9811 ss << "create-or-move updating item name '" << osd_name
9812 << "' weight " << weight
7c673cae
FG
9813 << " at location " << loc << " to crush map";
9814 getline(ss, rs);
9815 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9816 get_last_committed() + 1));
9817 return true;
9818 }
9819 } while (false);
9820
9821 } else if (prefix == "osd crush move") {
9822 do {
9823 // osd crush move <name> <loc1> [<loc2> ...]
11fdf7f2 9824 string name;
7c673cae 9825 vector<string> argvec;
11fdf7f2
TL
9826 cmd_getval(cct, cmdmap, "name", name);
9827 cmd_getval(cct, cmdmap, "args", argvec);
7c673cae
FG
9828 map<string,string> loc;
9829 CrushWrapper::parse_loc_map(argvec, &loc);
9830
9831 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
9832 CrushWrapper newcrush;
9833 _get_pending_crush(newcrush);
9834
9835 if (!newcrush.name_exists(name)) {
9836 err = -ENOENT;
9837 ss << "item " << name << " does not exist";
9838 break;
9839 }
9840 int id = newcrush.get_item_id(name);
9841
11fdf7f2 9842 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
7c673cae 9843 if (id >= 0) {
11fdf7f2
TL
9844 err = newcrush.create_or_move_item(
9845 cct, id, 0, name, loc,
9846 g_conf()->osd_crush_update_weight_set);
7c673cae 9847 } else {
11fdf7f2 9848 err = newcrush.move_bucket(cct, id, loc);
7c673cae
FG
9849 }
9850 if (err >= 0) {
9851 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
9852 pending_inc.crush.clear();
9853 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9854 getline(ss, rs);
9855 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9856 get_last_committed() + 1));
9857 return true;
9858 }
9859 } else {
9860 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
9861 err = 0;
9862 }
9863 } while (false);
31f18b77 9864 } else if (prefix == "osd crush swap-bucket") {
11fdf7f2
TL
9865 string source, dest;
9866 cmd_getval(cct, cmdmap, "source", source);
9867 cmd_getval(cct, cmdmap, "dest", dest);
9868
9869 bool force = false;
9870 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", force);
9871
31f18b77
FG
9872 CrushWrapper newcrush;
9873 _get_pending_crush(newcrush);
9874 if (!newcrush.name_exists(source)) {
9875 ss << "source item " << source << " does not exist";
9876 err = -ENOENT;
9877 goto reply;
9878 }
9879 if (!newcrush.name_exists(dest)) {
9880 ss << "dest item " << dest << " does not exist";
9881 err = -ENOENT;
9882 goto reply;
9883 }
9884 int sid = newcrush.get_item_id(source);
9885 int did = newcrush.get_item_id(dest);
9886 int sparent;
11fdf7f2 9887 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) {
31f18b77
FG
9888 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
9889 err = -EPERM;
9890 goto reply;
9891 }
9892 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
11fdf7f2 9893 !force) {
31f18b77
FG
9894 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
9895 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
9896 << "; pass --yes-i-really-mean-it to proceed anyway";
9897 err = -EPERM;
9898 goto reply;
9899 }
11fdf7f2 9900 int r = newcrush.swap_bucket(cct, sid, did);
31f18b77
FG
9901 if (r < 0) {
9902 ss << "failed to swap bucket contents: " << cpp_strerror(r);
224ce89b 9903 err = r;
31f18b77
FG
9904 goto reply;
9905 }
9906 ss << "swapped bucket of " << source << " to " << dest;
9907 pending_inc.crush.clear();
9908 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9909 wait_for_finished_proposal(op,
9910 new Monitor::C_Command(mon, op, err, ss.str(),
9911 get_last_committed() + 1));
9912 return true;
9913 } else if (prefix == "osd crush link") {
9914 // osd crush link <name> <loc1> [<loc2> ...]
9915 string name;
11fdf7f2 9916 cmd_getval(cct, cmdmap, "name", name);
31f18b77 9917 vector<string> argvec;
11fdf7f2 9918 cmd_getval(cct, cmdmap, "args", argvec);
31f18b77
FG
9919 map<string,string> loc;
9920 CrushWrapper::parse_loc_map(argvec, &loc);
9921
9922 // Need an explicit check for name_exists because get_item_id returns
9923 // 0 on unfound.
9924 int id = osdmap.crush->get_item_id(name);
7c673cae
FG
9925 if (!osdmap.crush->name_exists(name)) {
9926 err = -ENOENT;
9927 ss << "item " << name << " does not exist";
9928 goto reply;
9929 } else {
9930 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
9931 }
11fdf7f2 9932 if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) {
7c673cae
FG
9933 ss << "no need to move item id " << id << " name '" << name
9934 << "' to location " << loc << " in crush map";
9935 err = 0;
9936 goto reply;
9937 }
9938
9939 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
9940 CrushWrapper newcrush;
9941 _get_pending_crush(newcrush);
9942
9943 if (!newcrush.name_exists(name)) {
9944 err = -ENOENT;
9945 ss << "item " << name << " does not exist";
9946 goto reply;
9947 } else {
9948 int id = newcrush.get_item_id(name);
11fdf7f2
TL
9949 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
9950 err = newcrush.link_bucket(cct, id, loc);
7c673cae
FG
9951 if (err >= 0) {
9952 ss << "linked item id " << id << " name '" << name
9953 << "' to location " << loc << " in crush map";
9954 pending_inc.crush.clear();
9955 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9956 } else {
9957 ss << "cannot link item id " << id << " name '" << name
9958 << "' to location " << loc;
9959 goto reply;
9960 }
9961 } else {
9962 ss << "no need to move item id " << id << " name '" << name
9963 << "' to location " << loc << " in crush map";
9964 err = 0;
9965 }
9966 }
9967 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
9968 get_last_committed() + 1));
9969 return true;
9970 } else if (prefix == "osd crush rm" ||
9971 prefix == "osd crush remove" ||
9972 prefix == "osd crush unlink") {
9973 do {
9974 // osd crush rm <id> [ancestor]
9975 CrushWrapper newcrush;
9976 _get_pending_crush(newcrush);
9977
9978 string name;
11fdf7f2 9979 cmd_getval(cct, cmdmap, "name", name);
7c673cae
FG
9980
9981 if (!osdmap.crush->name_exists(name)) {
9982 err = 0;
9983 ss << "device '" << name << "' does not appear in the crush map";
9984 break;
9985 }
9986 if (!newcrush.name_exists(name)) {
9987 err = 0;
9988 ss << "device '" << name << "' does not appear in the crush map";
9989 getline(ss, rs);
9990 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9991 get_last_committed() + 1));
9992 return true;
9993 }
9994 int id = newcrush.get_item_id(name);
31f18b77
FG
9995 int ancestor = 0;
9996
7c673cae
FG
9997 bool unlink_only = prefix == "osd crush unlink";
9998 string ancestor_str;
11fdf7f2 9999 if (cmd_getval(cct, cmdmap, "ancestor", ancestor_str)) {
7c673cae
FG
10000 if (!newcrush.name_exists(ancestor_str)) {
10001 err = -ENOENT;
10002 ss << "ancestor item '" << ancestor_str
10003 << "' does not appear in the crush map";
10004 break;
10005 }
31f18b77 10006 ancestor = newcrush.get_item_id(ancestor_str);
7c673cae 10007 }
31f18b77
FG
10008
10009 err = prepare_command_osd_crush_remove(
10010 newcrush,
10011 id, ancestor,
10012 (ancestor < 0), unlink_only);
10013
7c673cae
FG
10014 if (err == -ENOENT) {
10015 ss << "item " << id << " does not appear in that position";
10016 err = 0;
10017 break;
10018 }
10019 if (err == 0) {
81eedcae
TL
10020 if (!unlink_only)
10021 pending_inc.new_crush_node_flags[id] = 0;
7c673cae
FG
10022 ss << "removed item id " << id << " name '" << name << "' from crush map";
10023 getline(ss, rs);
10024 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10025 get_last_committed() + 1));
10026 return true;
10027 }
10028 } while (false);
10029
10030 } else if (prefix == "osd crush reweight-all") {
7c673cae
FG
10031 CrushWrapper newcrush;
10032 _get_pending_crush(newcrush);
10033
11fdf7f2 10034 newcrush.reweight(cct);
7c673cae
FG
10035 pending_inc.crush.clear();
10036 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10037 ss << "reweighted crush hierarchy";
10038 getline(ss, rs);
10039 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10040 get_last_committed() + 1));
10041 return true;
10042 } else if (prefix == "osd crush reweight") {
10043 // osd crush reweight <name> <weight>
10044 CrushWrapper newcrush;
10045 _get_pending_crush(newcrush);
10046
10047 string name;
11fdf7f2 10048 cmd_getval(cct, cmdmap, "name", name);
7c673cae
FG
10049 if (!newcrush.name_exists(name)) {
10050 err = -ENOENT;
10051 ss << "device '" << name << "' does not appear in the crush map";
10052 goto reply;
10053 }
10054
10055 int id = newcrush.get_item_id(name);
10056 if (id < 0) {
10057 ss << "device '" << name << "' is not a leaf in the crush map";
10058 err = -EINVAL;
10059 goto reply;
10060 }
10061 double w;
11fdf7f2 10062 if (!cmd_getval(cct, cmdmap, "weight", w)) {
7c673cae 10063 ss << "unable to parse weight value '"
11fdf7f2 10064 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
10065 err = -EINVAL;
10066 goto reply;
10067 }
10068
11fdf7f2
TL
10069 err = newcrush.adjust_item_weightf(cct, id, w,
10070 g_conf()->osd_crush_update_weight_set);
7c673cae
FG
10071 if (err < 0)
10072 goto reply;
10073 pending_inc.crush.clear();
10074 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10075 ss << "reweighted item id " << id << " name '" << name << "' to " << w
10076 << " in crush map";
10077 getline(ss, rs);
10078 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10079 get_last_committed() + 1));
10080 return true;
10081 } else if (prefix == "osd crush reweight-subtree") {
10082 // osd crush reweight <name> <weight>
10083 CrushWrapper newcrush;
10084 _get_pending_crush(newcrush);
10085
10086 string name;
11fdf7f2 10087 cmd_getval(cct, cmdmap, "name", name);
7c673cae
FG
10088 if (!newcrush.name_exists(name)) {
10089 err = -ENOENT;
10090 ss << "device '" << name << "' does not appear in the crush map";
10091 goto reply;
10092 }
10093
10094 int id = newcrush.get_item_id(name);
10095 if (id >= 0) {
10096 ss << "device '" << name << "' is not a subtree in the crush map";
10097 err = -EINVAL;
10098 goto reply;
10099 }
10100 double w;
11fdf7f2 10101 if (!cmd_getval(cct, cmdmap, "weight", w)) {
7c673cae 10102 ss << "unable to parse weight value '"
11fdf7f2 10103 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
10104 err = -EINVAL;
10105 goto reply;
10106 }
10107
11fdf7f2
TL
10108 err = newcrush.adjust_subtree_weightf(cct, id, w,
10109 g_conf()->osd_crush_update_weight_set);
7c673cae
FG
10110 if (err < 0)
10111 goto reply;
10112 pending_inc.crush.clear();
10113 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10114 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
10115 << " in crush map";
10116 getline(ss, rs);
10117 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10118 get_last_committed() + 1));
10119 return true;
10120 } else if (prefix == "osd crush tunables") {
10121 CrushWrapper newcrush;
10122 _get_pending_crush(newcrush);
10123
10124 err = 0;
10125 string profile;
11fdf7f2 10126 cmd_getval(cct, cmdmap, "profile", profile);
7c673cae
FG
10127 if (profile == "legacy" || profile == "argonaut") {
10128 newcrush.set_tunables_legacy();
10129 } else if (profile == "bobtail") {
10130 newcrush.set_tunables_bobtail();
10131 } else if (profile == "firefly") {
10132 newcrush.set_tunables_firefly();
10133 } else if (profile == "hammer") {
10134 newcrush.set_tunables_hammer();
10135 } else if (profile == "jewel") {
10136 newcrush.set_tunables_jewel();
10137 } else if (profile == "optimal") {
10138 newcrush.set_tunables_optimal();
10139 } else if (profile == "default") {
10140 newcrush.set_tunables_default();
10141 } else {
10142 ss << "unrecognized profile '" << profile << "'";
10143 err = -EINVAL;
10144 goto reply;
10145 }
10146
10147 if (!validate_crush_against_features(&newcrush, ss)) {
10148 err = -EINVAL;
10149 goto reply;
10150 }
10151
10152 pending_inc.crush.clear();
10153 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10154 ss << "adjusted tunables profile to " << profile;
10155 getline(ss, rs);
10156 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10157 get_last_committed() + 1));
10158 return true;
10159 } else if (prefix == "osd crush set-tunable") {
10160 CrushWrapper newcrush;
10161 _get_pending_crush(newcrush);
10162
10163 err = 0;
10164 string tunable;
11fdf7f2 10165 cmd_getval(cct, cmdmap, "tunable", tunable);
7c673cae
FG
10166
10167 int64_t value = -1;
11fdf7f2 10168 if (!cmd_getval(cct, cmdmap, "value", value)) {
7c673cae 10169 err = -EINVAL;
11fdf7f2
TL
10170 ss << "failed to parse integer value "
10171 << cmd_vartype_stringify(cmdmap.at("value"));
7c673cae
FG
10172 goto reply;
10173 }
10174
10175 if (tunable == "straw_calc_version") {
224ce89b 10176 if (value != 0 && value != 1) {
7c673cae
FG
10177 ss << "value must be 0 or 1; got " << value;
10178 err = -EINVAL;
10179 goto reply;
10180 }
10181 newcrush.set_straw_calc_version(value);
10182 } else {
10183 ss << "unrecognized tunable '" << tunable << "'";
10184 err = -EINVAL;
10185 goto reply;
10186 }
10187
10188 if (!validate_crush_against_features(&newcrush, ss)) {
10189 err = -EINVAL;
10190 goto reply;
10191 }
10192
10193 pending_inc.crush.clear();
10194 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10195 ss << "adjusted tunable " << tunable << " to " << value;
10196 getline(ss, rs);
10197 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10198 get_last_committed() + 1));
10199 return true;
10200
10201 } else if (prefix == "osd crush rule create-simple") {
10202 string name, root, type, mode;
11fdf7f2
TL
10203 cmd_getval(cct, cmdmap, "name", name);
10204 cmd_getval(cct, cmdmap, "root", root);
10205 cmd_getval(cct, cmdmap, "type", type);
10206 cmd_getval(cct, cmdmap, "mode", mode);
7c673cae
FG
10207 if (mode == "")
10208 mode = "firstn";
10209
10210 if (osdmap.crush->rule_exists(name)) {
31f18b77
FG
10211 // The name is uniquely associated to a ruleid and the rule it contains
10212 // From the user point of view, the rule is more meaningfull.
10213 ss << "rule " << name << " already exists";
7c673cae
FG
10214 err = 0;
10215 goto reply;
10216 }
10217
10218 CrushWrapper newcrush;
10219 _get_pending_crush(newcrush);
10220
10221 if (newcrush.rule_exists(name)) {
31f18b77
FG
10222 // The name is uniquely associated to a ruleid and the rule it contains
10223 // From the user point of view, the rule is more meaningfull.
10224 ss << "rule " << name << " already exists";
7c673cae
FG
10225 err = 0;
10226 } else {
224ce89b 10227 int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
7c673cae
FG
10228 pg_pool_t::TYPE_REPLICATED, &ss);
10229 if (ruleno < 0) {
10230 err = ruleno;
10231 goto reply;
10232 }
10233
10234 pending_inc.crush.clear();
10235 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10236 }
10237 getline(ss, rs);
10238 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10239 get_last_committed() + 1));
10240 return true;
10241
224ce89b
WB
10242 } else if (prefix == "osd crush rule create-replicated") {
10243 string name, root, type, device_class;
11fdf7f2
TL
10244 cmd_getval(cct, cmdmap, "name", name);
10245 cmd_getval(cct, cmdmap, "root", root);
10246 cmd_getval(cct, cmdmap, "type", type);
10247 cmd_getval(cct, cmdmap, "class", device_class);
224ce89b
WB
10248
10249 if (osdmap.crush->rule_exists(name)) {
10250 // The name is uniquely associated to a ruleid and the rule it contains
10251 // From the user point of view, the rule is more meaningfull.
10252 ss << "rule " << name << " already exists";
10253 err = 0;
10254 goto reply;
10255 }
10256
10257 CrushWrapper newcrush;
10258 _get_pending_crush(newcrush);
10259
10260 if (newcrush.rule_exists(name)) {
10261 // The name is uniquely associated to a ruleid and the rule it contains
10262 // From the user point of view, the rule is more meaningfull.
10263 ss << "rule " << name << " already exists";
10264 err = 0;
10265 } else {
10266 int ruleno = newcrush.add_simple_rule(
10267 name, root, type, device_class,
10268 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
10269 if (ruleno < 0) {
10270 err = ruleno;
10271 goto reply;
10272 }
10273
10274 pending_inc.crush.clear();
10275 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10276 }
10277 getline(ss, rs);
10278 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10279 get_last_committed() + 1));
10280 return true;
10281
7c673cae
FG
10282 } else if (prefix == "osd erasure-code-profile rm") {
10283 string name;
11fdf7f2 10284 cmd_getval(cct, cmdmap, "name", name);
7c673cae
FG
10285
10286 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
10287 goto wait;
10288
10289 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
10290 err = -EBUSY;
10291 goto reply;
10292 }
10293
10294 if (osdmap.has_erasure_code_profile(name) ||
10295 pending_inc.new_erasure_code_profiles.count(name)) {
10296 if (osdmap.has_erasure_code_profile(name)) {
10297 pending_inc.old_erasure_code_profiles.push_back(name);
10298 } else {
10299 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
10300 pending_inc.new_erasure_code_profiles.erase(name);
10301 }
10302
10303 getline(ss, rs);
10304 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10305 get_last_committed() + 1));
10306 return true;
10307 } else {
10308 ss << "erasure-code-profile " << name << " does not exist";
10309 err = 0;
10310 goto reply;
10311 }
10312
10313 } else if (prefix == "osd erasure-code-profile set") {
10314 string name;
11fdf7f2 10315 cmd_getval(cct, cmdmap, "name", name);
7c673cae 10316 vector<string> profile;
11fdf7f2
TL
10317 cmd_getval(cct, cmdmap, "profile", profile);
10318
10319 bool force = false;
10320 cmd_getval(cct, cmdmap, "force", force);
10321
7c673cae
FG
10322 map<string,string> profile_map;
10323 err = parse_erasure_code_profile(profile, &profile_map, &ss);
10324 if (err)
10325 goto reply;
10326 if (profile_map.find("plugin") == profile_map.end()) {
10327 ss << "erasure-code-profile " << profile_map
10328 << " must contain a plugin entry" << std::endl;
10329 err = -EINVAL;
10330 goto reply;
10331 }
10332 string plugin = profile_map["plugin"];
10333
10334 if (pending_inc.has_erasure_code_profile(name)) {
10335 dout(20) << "erasure code profile " << name << " try again" << dendl;
10336 goto wait;
10337 } else {
7c673cae
FG
10338 err = normalize_profile(name, profile_map, force, &ss);
10339 if (err)
10340 goto reply;
10341
10342 if (osdmap.has_erasure_code_profile(name)) {
10343 ErasureCodeProfile existing_profile_map =
10344 osdmap.get_erasure_code_profile(name);
10345 err = normalize_profile(name, existing_profile_map, force, &ss);
10346 if (err)
10347 goto reply;
10348
10349 if (existing_profile_map == profile_map) {
10350 err = 0;
10351 goto reply;
10352 }
10353 if (!force) {
10354 err = -EPERM;
10355 ss << "will not override erasure code profile " << name
10356 << " because the existing profile "
10357 << existing_profile_map
10358 << " is different from the proposed profile "
10359 << profile_map;
10360 goto reply;
10361 }
10362 }
10363
10364 dout(20) << "erasure code profile set " << name << "="
10365 << profile_map << dendl;
10366 pending_inc.set_erasure_code_profile(name, profile_map);
10367 }
10368
10369 getline(ss, rs);
10370 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10371 get_last_committed() + 1));
10372 return true;
10373
10374 } else if (prefix == "osd crush rule create-erasure") {
10375 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
10376 if (err == -EAGAIN)
10377 goto wait;
10378 if (err)
10379 goto reply;
10380 string name, poolstr;
11fdf7f2 10381 cmd_getval(cct, cmdmap, "name", name);
7c673cae 10382 string profile;
11fdf7f2 10383 cmd_getval(cct, cmdmap, "profile", profile);
7c673cae
FG
10384 if (profile == "")
10385 profile = "default";
10386 if (profile == "default") {
10387 if (!osdmap.has_erasure_code_profile(profile)) {
10388 if (pending_inc.has_erasure_code_profile(profile)) {
10389 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
10390 goto wait;
10391 }
10392
10393 map<string,string> profile_map;
11fdf7f2 10394 err = osdmap.get_erasure_code_profile_default(cct,
7c673cae
FG
10395 profile_map,
10396 &ss);
10397 if (err)
10398 goto reply;
10399 err = normalize_profile(name, profile_map, true, &ss);
10400 if (err)
10401 goto reply;
10402 dout(20) << "erasure code profile set " << profile << "="
10403 << profile_map << dendl;
10404 pending_inc.set_erasure_code_profile(profile, profile_map);
10405 goto wait;
10406 }
10407 }
10408
31f18b77
FG
10409 int rule;
10410 err = crush_rule_create_erasure(name, profile, &rule, &ss);
7c673cae
FG
10411 if (err < 0) {
10412 switch(err) {
10413 case -EEXIST: // return immediately
10414 ss << "rule " << name << " already exists";
10415 err = 0;
10416 goto reply;
10417 break;
10418 case -EALREADY: // wait for pending to be proposed
10419 ss << "rule " << name << " already exists";
10420 err = 0;
10421 break;
10422 default: // non recoverable error
10423 goto reply;
10424 break;
10425 }
10426 } else {
31f18b77 10427 ss << "created rule " << name << " at " << rule;
7c673cae
FG
10428 }
10429
10430 getline(ss, rs);
10431 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10432 get_last_committed() + 1));
10433 return true;
10434
10435 } else if (prefix == "osd crush rule rm") {
10436 string name;
11fdf7f2 10437 cmd_getval(cct, cmdmap, "name", name);
7c673cae
FG
10438
10439 if (!osdmap.crush->rule_exists(name)) {
10440 ss << "rule " << name << " does not exist";
10441 err = 0;
10442 goto reply;
10443 }
10444
10445 CrushWrapper newcrush;
10446 _get_pending_crush(newcrush);
10447
10448 if (!newcrush.rule_exists(name)) {
10449 ss << "rule " << name << " does not exist";
10450 err = 0;
10451 } else {
10452 int ruleno = newcrush.get_rule_id(name);
11fdf7f2 10453 ceph_assert(ruleno >= 0);
7c673cae
FG
10454
10455 // make sure it is not in use.
10456 // FIXME: this is ok in some situations, but let's not bother with that
10457 // complexity now.
10458 int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
3efd9988 10459 if (osdmap.crush_rule_in_use(ruleset)) {
7c673cae
FG
10460 ss << "crush ruleset " << name << " " << ruleset << " is in use";
10461 err = -EBUSY;
10462 goto reply;
10463 }
10464
10465 err = newcrush.remove_rule(ruleno);
10466 if (err < 0) {
10467 goto reply;
10468 }
10469
10470 pending_inc.crush.clear();
10471 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10472 }
10473 getline(ss, rs);
10474 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10475 get_last_committed() + 1));
10476 return true;
10477
b5b8bbf5
FG
10478 } else if (prefix == "osd crush rule rename") {
10479 string srcname;
10480 string dstname;
11fdf7f2
TL
10481 cmd_getval(cct, cmdmap, "srcname", srcname);
10482 cmd_getval(cct, cmdmap, "dstname", dstname);
b5b8bbf5
FG
10483 if (srcname.empty() || dstname.empty()) {
10484 ss << "must specify both source rule name and destination rule name";
10485 err = -EINVAL;
10486 goto reply;
10487 }
10488 if (srcname == dstname) {
10489 ss << "destination rule name is equal to source rule name";
10490 err = 0;
10491 goto reply;
10492 }
10493
10494 CrushWrapper newcrush;
10495 _get_pending_crush(newcrush);
181888fb
FG
10496 if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
10497 // srcname does not exist and dstname already exists
10498 // suppose this is a replay and return success
10499 // (so this command is idempotent)
10500 ss << "already renamed to '" << dstname << "'";
10501 err = 0;
10502 goto reply;
10503 }
10504
b5b8bbf5
FG
10505 err = newcrush.rename_rule(srcname, dstname, &ss);
10506 if (err < 0) {
10507 // ss has reason for failure
10508 goto reply;
10509 }
10510 pending_inc.crush.clear();
10511 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10512 getline(ss, rs);
10513 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10514 get_last_committed() + 1));
10515 return true;
10516
7c673cae
FG
10517 } else if (prefix == "osd setmaxosd") {
10518 int64_t newmax;
11fdf7f2 10519 if (!cmd_getval(cct, cmdmap, "newmax", newmax)) {
7c673cae 10520 ss << "unable to parse 'newmax' value '"
11fdf7f2 10521 << cmd_vartype_stringify(cmdmap.at("newmax")) << "'";
7c673cae
FG
10522 err = -EINVAL;
10523 goto reply;
10524 }
10525
11fdf7f2 10526 if (newmax > g_conf()->mon_max_osd) {
7c673cae
FG
10527 err = -ERANGE;
10528 ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
11fdf7f2 10529 << g_conf()->mon_max_osd << ")";
7c673cae
FG
10530 goto reply;
10531 }
10532
10533 // Don't allow shrinking OSD number as this will cause data loss
10534 // and may cause kernel crashes.
10535 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
10536 if (newmax < osdmap.get_max_osd()) {
10537 // Check if the OSDs exist between current max and new value.
10538 // If there are any OSDs exist, then don't allow shrinking number
10539 // of OSDs.
10540 for (int i = newmax; i < osdmap.get_max_osd(); i++) {
10541 if (osdmap.exists(i)) {
10542 err = -EBUSY;
10543 ss << "cannot shrink max_osd to " << newmax
10544 << " because osd." << i << " (and possibly others) still in use";
10545 goto reply;
10546 }
10547 }
10548 }
10549
10550 pending_inc.new_max_osd = newmax;
10551 ss << "set new max_osd = " << pending_inc.new_max_osd;
10552 getline(ss, rs);
10553 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10554 get_last_committed() + 1));
10555 return true;
10556
10557 } else if (prefix == "osd set-full-ratio" ||
10558 prefix == "osd set-backfillfull-ratio" ||
10559 prefix == "osd set-nearfull-ratio") {
7c673cae 10560 double n;
11fdf7f2 10561 if (!cmd_getval(cct, cmdmap, "ratio", n)) {
7c673cae 10562 ss << "unable to parse 'ratio' value '"
11fdf7f2 10563 << cmd_vartype_stringify(cmdmap.at("ratio")) << "'";
7c673cae
FG
10564 err = -EINVAL;
10565 goto reply;
10566 }
10567 if (prefix == "osd set-full-ratio")
10568 pending_inc.new_full_ratio = n;
10569 else if (prefix == "osd set-backfillfull-ratio")
10570 pending_inc.new_backfillfull_ratio = n;
10571 else if (prefix == "osd set-nearfull-ratio")
10572 pending_inc.new_nearfull_ratio = n;
10573 ss << prefix << " " << n;
10574 getline(ss, rs);
10575 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10576 get_last_committed() + 1));
10577 return true;
10578 } else if (prefix == "osd set-require-min-compat-client") {
7c673cae 10579 string v;
11fdf7f2 10580 cmd_getval(cct, cmdmap, "version", v);
31f18b77
FG
10581 int vno = ceph_release_from_name(v.c_str());
10582 if (vno <= 0) {
7c673cae
FG
10583 ss << "version " << v << " is not recognized";
10584 err = -EINVAL;
10585 goto reply;
10586 }
10587 OSDMap newmap;
10588 newmap.deepish_copy_from(osdmap);
10589 newmap.apply_incremental(pending_inc);
31f18b77
FG
10590 newmap.require_min_compat_client = vno;
10591 auto mvno = newmap.get_min_compat_client();
10592 if (vno < mvno) {
10593 ss << "osdmap current utilizes features that require "
10594 << ceph_release_name(mvno)
10595 << "; cannot set require_min_compat_client below that to "
10596 << ceph_release_name(vno);
7c673cae
FG
10597 err = -EPERM;
10598 goto reply;
10599 }
11fdf7f2
TL
10600 bool sure = false;
10601 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", sure);
10602 if (!sure) {
31f18b77
FG
10603 FeatureMap m;
10604 mon->get_combined_feature_map(&m);
10605 uint64_t features = ceph_release_features(vno);
10606 bool first = true;
10607 bool ok = true;
10608 for (int type : {
10609 CEPH_ENTITY_TYPE_CLIENT,
10610 CEPH_ENTITY_TYPE_MDS,
10611 CEPH_ENTITY_TYPE_MGR }) {
10612 auto p = m.m.find(type);
10613 if (p == m.m.end()) {
10614 continue;
10615 }
10616 for (auto& q : p->second) {
10617 uint64_t missing = ~q.first & features;
10618 if (missing) {
10619 if (first) {
10620 ss << "cannot set require_min_compat_client to " << v << ": ";
10621 } else {
10622 ss << "; ";
10623 }
10624 first = false;
10625 ss << q.second << " connected " << ceph_entity_type_name(type)
10626 << "(s) look like " << ceph_release_name(
10627 ceph_release_from_features(q.first))
10628 << " (missing 0x" << std::hex << missing << std::dec << ")";
10629 ok = false;
10630 }
10631 }
10632 }
10633 if (!ok) {
10634 ss << "; add --yes-i-really-mean-it to do it anyway";
10635 err = -EPERM;
10636 goto reply;
10637 }
10638 }
10639 ss << "set require_min_compat_client to " << ceph_release_name(vno);
10640 pending_inc.new_require_min_compat_client = vno;
7c673cae
FG
10641 getline(ss, rs);
10642 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10643 get_last_committed() + 1));
10644 return true;
10645 } else if (prefix == "osd pause") {
10646 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
10647
10648 } else if (prefix == "osd unpause") {
10649 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
10650
10651 } else if (prefix == "osd set") {
11fdf7f2
TL
10652 bool sure = false;
10653 cmd_getval(g_ceph_context, cmdmap, "yes_i_really_mean_it", sure);
10654
7c673cae 10655 string key;
11fdf7f2 10656 cmd_getval(cct, cmdmap, "key", key);
7c673cae
FG
10657 if (key == "full")
10658 return prepare_set_flag(op, CEPH_OSDMAP_FULL);
10659 else if (key == "pause")
10660 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
10661 else if (key == "noup")
10662 return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
10663 else if (key == "nodown")
10664 return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
10665 else if (key == "noout")
10666 return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
10667 else if (key == "noin")
10668 return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
10669 else if (key == "nobackfill")
10670 return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
10671 else if (key == "norebalance")
10672 return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
10673 else if (key == "norecover")
10674 return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
10675 else if (key == "noscrub")
10676 return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
10677 else if (key == "nodeep-scrub")
10678 return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
10679 else if (key == "notieragent")
10680 return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11fdf7f2
TL
10681 else if (key == "nosnaptrim")
10682 return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
10683 else if (key == "pglog_hardlimit") {
10684 if (!osdmap.get_num_up_osds() && !sure) {
f64942e4
AA
10685 ss << "Not advisable to continue since no OSDs are up. Pass "
10686 << "--yes-i-really-mean-it if you really wish to continue.";
10687 err = -EPERM;
10688 goto reply;
10689 }
10690 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
10691 // we are reusing a jewel feature bit that was retired in luminous.
10692 if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
10693 (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
11fdf7f2 10694 || sure)) {
f64942e4
AA
10695 return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
10696 } else {
10697 ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
10698 err = -EPERM;
10699 goto reply;
10700 }
7c673cae
FG
10701 } else {
10702 ss << "unrecognized flag '" << key << "'";
10703 err = -EINVAL;
10704 }
10705
10706 } else if (prefix == "osd unset") {
10707 string key;
11fdf7f2 10708 cmd_getval(cct, cmdmap, "key", key);
7c673cae
FG
10709 if (key == "full")
10710 return prepare_unset_flag(op, CEPH_OSDMAP_FULL);
10711 else if (key == "pause")
10712 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
10713 else if (key == "noup")
10714 return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
10715 else if (key == "nodown")
10716 return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
10717 else if (key == "noout")
10718 return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
10719 else if (key == "noin")
10720 return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
10721 else if (key == "nobackfill")
10722 return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
10723 else if (key == "norebalance")
10724 return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
10725 else if (key == "norecover")
10726 return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
10727 else if (key == "noscrub")
10728 return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
10729 else if (key == "nodeep-scrub")
10730 return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
10731 else if (key == "notieragent")
10732 return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11fdf7f2
TL
10733 else if (key == "nosnaptrim")
10734 return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
224ce89b 10735 else {
7c673cae
FG
10736 ss << "unrecognized flag '" << key << "'";
10737 err = -EINVAL;
10738 }
10739
31f18b77
FG
10740 } else if (prefix == "osd require-osd-release") {
10741 string release;
11fdf7f2
TL
10742 cmd_getval(cct, cmdmap, "release", release);
10743 bool sure = false;
10744 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", sure);
31f18b77
FG
10745 int rel = ceph_release_from_name(release.c_str());
10746 if (rel <= 0) {
10747 ss << "unrecognized release " << release;
10748 err = -EINVAL;
10749 goto reply;
10750 }
d2e6a577
FG
10751 if (rel == osdmap.require_osd_release) {
10752 // idempotent
10753 err = 0;
10754 goto reply;
10755 }
11fdf7f2
TL
10756 ceph_assert(osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS);
10757 if (!osdmap.get_num_up_osds() && !sure) {
10758 ss << "Not advisable to continue since no OSDs are up. Pass "
10759 << "--yes-i-really-mean-it if you really wish to continue.";
10760 err = -EPERM;
10761 goto reply;
10762 }
10763 if (rel == CEPH_RELEASE_MIMIC) {
10764 if (!mon->monmap->get_required_features().contains_all(
10765 ceph::features::mon::FEATURE_MIMIC)) {
10766 ss << "not all mons are mimic";
10767 err = -EPERM;
10768 goto reply;
10769 }
10770 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_MIMIC))
10771 && !sure) {
10772 ss << "not all up OSDs have CEPH_FEATURE_SERVER_MIMIC feature";
10773 err = -EPERM;
10774 goto reply;
10775 }
10776 } else if (rel == CEPH_RELEASE_NAUTILUS) {
10777 if (!mon->monmap->get_required_features().contains_all(
10778 ceph::features::mon::FEATURE_NAUTILUS)) {
10779 ss << "not all mons are nautilus";
10780 err = -EPERM;
10781 goto reply;
10782 }
10783 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_NAUTILUS))
10784 && !sure) {
10785 ss << "not all up OSDs have CEPH_FEATURE_SERVER_NAUTILUS feature";
31f18b77
FG
10786 err = -EPERM;
10787 goto reply;
10788 }
10789 } else {
10790 ss << "not supported for this release yet";
10791 err = -EPERM;
10792 goto reply;
10793 }
10794 if (rel < osdmap.require_osd_release) {
10795 ss << "require_osd_release cannot be lowered once it has been set";
10796 err = -EPERM;
10797 goto reply;
10798 }
10799 pending_inc.new_require_osd_release = rel;
10800 goto update;
7c673cae
FG
10801 } else if (prefix == "osd down" ||
10802 prefix == "osd out" ||
10803 prefix == "osd in" ||
10804 prefix == "osd rm") {
10805
10806 bool any = false;
31f18b77
FG
10807 bool stop = false;
10808 bool verbose = true;
7c673cae
FG
10809
10810 vector<string> idvec;
11fdf7f2 10811 cmd_getval(cct, cmdmap, "ids", idvec);
31f18b77
FG
10812 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
10813 set<int> osds;
10814
10815 // wildcard?
10816 if (j == 0 &&
10817 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
10818 if (prefix == "osd in") {
10819 // touch out osds only
81eedcae 10820 osdmap.get_out_existing_osds(osds);
31f18b77
FG
10821 } else {
10822 osdmap.get_all_osds(osds);
10823 }
10824 stop = true;
10825 verbose = false; // so the output is less noisy.
10826 } else {
10827 long osd = parse_osd_id(idvec[j].c_str(), &ss);
10828 if (osd < 0) {
10829 ss << "invalid osd id" << osd;
10830 err = -EINVAL;
10831 continue;
10832 } else if (!osdmap.exists(osd)) {
10833 ss << "osd." << osd << " does not exist. ";
10834 continue;
10835 }
10836
10837 osds.insert(osd);
7c673cae 10838 }
31f18b77
FG
10839
10840 for (auto &osd : osds) {
10841 if (prefix == "osd down") {
10842 if (osdmap.is_down(osd)) {
10843 if (verbose)
10844 ss << "osd." << osd << " is already down. ";
10845 } else {
10846 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
10847 ss << "marked down osd." << osd << ". ";
10848 any = true;
10849 }
10850 } else if (prefix == "osd out") {
10851 if (osdmap.is_out(osd)) {
10852 if (verbose)
10853 ss << "osd." << osd << " is already out. ";
10854 } else {
10855 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
10856 if (osdmap.osd_weight[osd]) {
10857 if (pending_inc.new_xinfo.count(osd) == 0) {
10858 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
10859 }
10860 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
7c673cae 10861 }
31f18b77 10862 ss << "marked out osd." << osd << ". ";
224ce89b
WB
10863 std::ostringstream msg;
10864 msg << "Client " << op->get_session()->entity_name
10865 << " marked osd." << osd << " out";
10866 if (osdmap.is_up(osd)) {
10867 msg << ", while it was still marked up";
10868 } else {
3efd9988
FG
10869 auto period = ceph_clock_now() - down_pending_out[osd];
10870 msg << ", after it was down for " << int(period.sec())
224ce89b
WB
10871 << " seconds";
10872 }
10873
10874 mon->clog->info() << msg.str();
31f18b77 10875 any = true;
7c673cae 10876 }
31f18b77
FG
10877 } else if (prefix == "osd in") {
10878 if (osdmap.is_in(osd)) {
10879 if (verbose)
10880 ss << "osd." << osd << " is already in. ";
10881 } else {
10882 if (osdmap.osd_xinfo[osd].old_weight > 0) {
10883 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
10884 if (pending_inc.new_xinfo.count(osd) == 0) {
10885 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
10886 }
10887 pending_inc.new_xinfo[osd].old_weight = 0;
10888 } else {
10889 pending_inc.new_weight[osd] = CEPH_OSD_IN;
7c673cae 10890 }
31f18b77
FG
10891 ss << "marked in osd." << osd << ". ";
10892 any = true;
10893 }
10894 } else if (prefix == "osd rm") {
10895 err = prepare_command_osd_remove(osd);
10896
10897 if (err == -EBUSY) {
10898 if (any)
10899 ss << ", ";
10900 ss << "osd." << osd << " is still up; must be down before removal. ";
7c673cae 10901 } else {
11fdf7f2 10902 ceph_assert(err == 0);
31f18b77
FG
10903 if (any) {
10904 ss << ", osd." << osd;
10905 } else {
10906 ss << "removed osd." << osd;
10907 }
10908 any = true;
7c673cae 10909 }
31f18b77
FG
10910 }
10911 }
10912 }
10913 if (any) {
10914 getline(ss, rs);
10915 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
10916 get_last_committed() + 1));
10917 return true;
10918 }
81eedcae
TL
10919 } else if (prefix == "osd set-group" ||
10920 prefix == "osd unset-group" ||
10921 prefix == "osd add-noup" ||
31f18b77
FG
10922 prefix == "osd add-nodown" ||
10923 prefix == "osd add-noin" ||
81eedcae
TL
10924 prefix == "osd add-noout" ||
10925 prefix == "osd rm-noup" ||
10926 prefix == "osd rm-nodown" ||
10927 prefix == "osd rm-noin" ||
10928 prefix == "osd rm-noout") {
10929 bool do_set = prefix == "osd set-group" ||
10930 prefix.find("add") != string::npos;
10931 string flag_str;
10932 unsigned flags = 0;
10933 vector<string> who;
10934 if (prefix == "osd set-group" || prefix == "osd unset-group") {
10935 cmd_getval(cct, cmdmap, "flags", flag_str);
10936 cmd_getval(cct, cmdmap, "who", who);
10937 vector<string> raw_flags;
10938 boost::split(raw_flags, flag_str, boost::is_any_of(","));
10939 for (auto& f : raw_flags) {
10940 if (f == "noup")
10941 flags |= CEPH_OSD_NOUP;
10942 else if (f == "nodown")
10943 flags |= CEPH_OSD_NODOWN;
10944 else if (f == "noin")
10945 flags |= CEPH_OSD_NOIN;
10946 else if (f == "noout")
10947 flags |= CEPH_OSD_NOOUT;
10948 else {
10949 ss << "unrecognized flag '" << f << "', must be one of "
10950 << "{noup,nodown,noin,noout}";
10951 err = -EINVAL;
10952 goto reply;
10953 }
10954 }
31f18b77 10955 } else {
81eedcae
TL
10956 cmd_getval(cct, cmdmap, "ids", who);
10957 if (prefix.find("noup") != string::npos)
10958 flags = CEPH_OSD_NOUP;
10959 else if (prefix.find("nodown") != string::npos)
10960 flags = CEPH_OSD_NODOWN;
10961 else if (prefix.find("noin") != string::npos)
10962 flags = CEPH_OSD_NOIN;
10963 else if (prefix.find("noout") != string::npos)
10964 flags = CEPH_OSD_NOOUT;
10965 else
10966 ceph_assert(0 == "Unreachable!");
31f18b77 10967 }
81eedcae
TL
10968 if (flags == 0) {
10969 ss << "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
10970 err = -EINVAL;
10971 goto reply;
10972 }
10973 if (who.empty()) {
10974 ss << "must specify at least one or more targets to set/unset";
10975 err = -EINVAL;
10976 goto reply;
10977 }
10978 set<int> osds;
10979 set<int> crush_nodes;
10980 set<int> device_classes;
10981 for (auto& w : who) {
10982 if (w == "any" || w == "all" || w == "*") {
31f18b77 10983 osdmap.get_all_osds(osds);
81eedcae 10984 break;
31f18b77 10985 }
81eedcae
TL
10986 std::stringstream ts;
10987 if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) {
10988 osds.insert(osd);
10989 } else if (osdmap.crush->name_exists(w)) {
10990 crush_nodes.insert(osdmap.crush->get_item_id(w));
10991 } else if (osdmap.crush->class_exists(w)) {
10992 device_classes.insert(osdmap.crush->get_class_id(w));
10993 } else {
10994 ss << "unable to parse osd id or crush node or device class: "
10995 << "\"" << w << "\". ";
7c673cae
FG
10996 }
10997 }
81eedcae
TL
10998 if (osds.empty() && crush_nodes.empty() && device_classes.empty()) {
10999 // ss has reason for failure
11000 err = -EINVAL;
11001 goto reply;
31f18b77 11002 }
31f18b77 11003 bool any = false;
81eedcae
TL
11004 for (auto osd : osds) {
11005 if (!osdmap.exists(osd)) {
11006 ss << "osd." << osd << " does not exist. ";
11007 continue;
11008 }
11009 if (do_set) {
11010 if (flags & CEPH_OSD_NOUP) {
11011 any |= osdmap.is_noup_by_osd(osd) ?
11012 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) :
11013 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
31f18b77 11014 }
81eedcae
TL
11015 if (flags & CEPH_OSD_NODOWN) {
11016 any |= osdmap.is_nodown_by_osd(osd) ?
11017 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) :
11018 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
11019 }
11020 if (flags & CEPH_OSD_NOIN) {
11021 any |= osdmap.is_noin_by_osd(osd) ?
11022 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) :
11023 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
11024 }
11025 if (flags & CEPH_OSD_NOOUT) {
11026 any |= osdmap.is_noout_by_osd(osd) ?
11027 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) :
11028 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
31f18b77 11029 }
31f18b77 11030 } else {
81eedcae
TL
11031 if (flags & CEPH_OSD_NOUP) {
11032 any |= osdmap.is_noup_by_osd(osd) ?
11033 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) :
11034 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP);
31f18b77 11035 }
81eedcae
TL
11036 if (flags & CEPH_OSD_NODOWN) {
11037 any |= osdmap.is_nodown_by_osd(osd) ?
11038 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) :
11039 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
31f18b77 11040 }
81eedcae
TL
11041 if (flags & CEPH_OSD_NOIN) {
11042 any |= osdmap.is_noin_by_osd(osd) ?
11043 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) :
11044 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN);
11045 }
11046 if (flags & CEPH_OSD_NOOUT) {
11047 any |= osdmap.is_noout_by_osd(osd) ?
11048 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) :
11049 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
31f18b77
FG
11050 }
11051 }
11052 }
81eedcae
TL
11053 for (auto& id : crush_nodes) {
11054 auto old_flags = osdmap.get_crush_node_flags(id);
11055 auto& pending_flags = pending_inc.new_crush_node_flags[id];
11056 pending_flags |= old_flags; // adopt existing flags first!
11057 if (do_set) {
11058 pending_flags |= flags;
11059 } else {
11060 pending_flags &= ~flags;
11061 }
11062 any = true;
11063 }
11064 for (auto& id : device_classes) {
11065 auto old_flags = osdmap.get_device_class_flags(id);
11066 auto& pending_flags = pending_inc.new_device_class_flags[id];
11067 pending_flags |= old_flags;
11068 if (do_set) {
11069 pending_flags |= flags;
11070 } else {
11071 pending_flags &= ~flags;
11072 }
11073 any = true;
11074 }
31f18b77
FG
11075 if (any) {
11076 getline(ss, rs);
11077 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11078 get_last_committed() + 1));
7c673cae
FG
11079 return true;
11080 }
11081 } else if (prefix == "osd pg-temp") {
11082 string pgidstr;
11fdf7f2 11083 if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
7c673cae 11084 ss << "unable to parse 'pgid' value '"
11fdf7f2 11085 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
7c673cae
FG
11086 err = -EINVAL;
11087 goto reply;
11088 }
11089 pg_t pgid;
11090 if (!pgid.parse(pgidstr.c_str())) {
11091 ss << "invalid pgid '" << pgidstr << "'";
11092 err = -EINVAL;
11093 goto reply;
11094 }
11095 if (!osdmap.pg_exists(pgid)) {
11096 ss << "pg " << pgid << " does not exist";
11097 err = -ENOENT;
11098 goto reply;
11099 }
11100 if (pending_inc.new_pg_temp.count(pgid)) {
11101 dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
11102 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11103 return true;
11104 }
11105
11106 vector<int64_t> id_vec;
11107 vector<int32_t> new_pg_temp;
11fdf7f2
TL
11108 cmd_getval(cct, cmdmap, "id", id_vec);
11109 if (id_vec.empty()) {
11110 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>();
11111 ss << "done cleaning up pg_temp of " << pgid;
11112 goto update;
7c673cae
FG
11113 }
11114 for (auto osd : id_vec) {
11115 if (!osdmap.exists(osd)) {
11116 ss << "osd." << osd << " does not exist";
11117 err = -ENOENT;
11118 goto reply;
11119 }
11120 new_pg_temp.push_back(osd);
11121 }
11122
224ce89b
WB
11123 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
11124 if ((int)new_pg_temp.size() < pool_min_size) {
11125 ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
11126 << pool_min_size << ")";
11127 err = -EINVAL;
11128 goto reply;
11129 }
11130
11131 int pool_size = osdmap.get_pg_pool_size(pgid);
11132 if ((int)new_pg_temp.size() > pool_size) {
11133 ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
11134 << pool_size << ")";
11135 err = -EINVAL;
11136 goto reply;
11137 }
11138
7c673cae
FG
11139 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
11140 new_pg_temp.begin(), new_pg_temp.end());
11141 ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
11142 goto update;
11143 } else if (prefix == "osd primary-temp") {
11144 string pgidstr;
11fdf7f2 11145 if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
7c673cae 11146 ss << "unable to parse 'pgid' value '"
11fdf7f2 11147 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
7c673cae
FG
11148 err = -EINVAL;
11149 goto reply;
11150 }
11151 pg_t pgid;
11152 if (!pgid.parse(pgidstr.c_str())) {
11153 ss << "invalid pgid '" << pgidstr << "'";
11154 err = -EINVAL;
11155 goto reply;
11156 }
11157 if (!osdmap.pg_exists(pgid)) {
11158 ss << "pg " << pgid << " does not exist";
11159 err = -ENOENT;
11160 goto reply;
11161 }
11162
11163 int64_t osd;
11fdf7f2 11164 if (!cmd_getval(cct, cmdmap, "id", osd)) {
7c673cae 11165 ss << "unable to parse 'id' value '"
11fdf7f2 11166 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
7c673cae
FG
11167 err = -EINVAL;
11168 goto reply;
11169 }
11170 if (osd != -1 && !osdmap.exists(osd)) {
11171 ss << "osd." << osd << " does not exist";
11172 err = -ENOENT;
11173 goto reply;
11174 }
11175
31f18b77
FG
11176 if (osdmap.require_min_compat_client > 0 &&
11177 osdmap.require_min_compat_client < CEPH_RELEASE_FIREFLY) {
11178 ss << "require_min_compat_client "
11179 << ceph_release_name(osdmap.require_min_compat_client)
7c673cae
FG
11180 << " < firefly, which is required for primary-temp";
11181 err = -EPERM;
11182 goto reply;
7c673cae
FG
11183 }
11184
11185 pending_inc.new_primary_temp[pgid] = osd;
11186 ss << "set " << pgid << " primary_temp mapping to " << osd;
11187 goto update;
11fdf7f2
TL
11188 } else if (prefix == "pg repeer") {
11189 pg_t pgid;
11190 string pgidstr;
11191 cmd_getval(cct, cmdmap, "pgid", pgidstr);
11192 if (!pgid.parse(pgidstr.c_str())) {
11193 ss << "invalid pgid '" << pgidstr << "'";
11194 err = -EINVAL;
11195 goto reply;
11196 }
11197 if (!osdmap.pg_exists(pgid)) {
11198 ss << "pg '" << pgidstr << "' does not exist";
11199 err = -ENOENT;
11200 goto reply;
11201 }
11202 vector<int> acting;
11203 int primary;
11204 osdmap.pg_to_acting_osds(pgid, &acting, &primary);
11205 if (primary < 0) {
11206 err = -EAGAIN;
11207 ss << "pg currently has no primary";
11208 goto reply;
11209 }
11210 if (acting.size() > 1) {
11211 // map to just primary; it will map back to what it wants
11212 pending_inc.new_pg_temp[pgid] = { primary };
11213 } else {
11214 // hmm, pick another arbitrary osd to induce a change. Note
11215 // that this won't work if there is only one suitable OSD in the cluster.
11216 int i;
11217 bool done = false;
11218 for (i = 0; i < osdmap.get_max_osd(); ++i) {
11219 if (i == primary || !osdmap.is_up(i) || !osdmap.exists(i)) {
11220 continue;
11221 }
11222 pending_inc.new_pg_temp[pgid] = { primary, i };
11223 done = true;
11224 break;
11225 }
11226 if (!done) {
11227 err = -EAGAIN;
11228 ss << "not enough up OSDs in the cluster to force repeer";
11229 goto reply;
11230 }
11231 }
11232 goto update;
224ce89b
WB
11233 } else if (prefix == "osd pg-upmap" ||
11234 prefix == "osd rm-pg-upmap" ||
11235 prefix == "osd pg-upmap-items" ||
11236 prefix == "osd rm-pg-upmap-items") {
31f18b77
FG
11237 if (osdmap.require_min_compat_client < CEPH_RELEASE_LUMINOUS) {
11238 ss << "min_compat_client "
11239 << ceph_release_name(osdmap.require_min_compat_client)
224ce89b
WB
11240 << " < luminous, which is required for pg-upmap. "
11241 << "Try 'ceph osd set-require-min-compat-client luminous' "
11242 << "before using the new interface";
7c673cae
FG
11243 err = -EPERM;
11244 goto reply;
11245 }
11246 err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
11247 if (err == -EAGAIN)
11248 goto wait;
11249 if (err < 0)
11250 goto reply;
11251 string pgidstr;
11fdf7f2 11252 if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
7c673cae 11253 ss << "unable to parse 'pgid' value '"
11fdf7f2 11254 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
7c673cae
FG
11255 err = -EINVAL;
11256 goto reply;
11257 }
11258 pg_t pgid;
11259 if (!pgid.parse(pgidstr.c_str())) {
11260 ss << "invalid pgid '" << pgidstr << "'";
11261 err = -EINVAL;
11262 goto reply;
11263 }
11264 if (!osdmap.pg_exists(pgid)) {
11265 ss << "pg " << pgid << " does not exist";
11266 err = -ENOENT;
11267 goto reply;
11268 }
94b18763
FG
11269 if (pending_inc.old_pools.count(pgid.pool())) {
11270 ss << "pool of " << pgid << " is pending removal";
11271 err = -ENOENT;
11272 getline(ss, rs);
11273 wait_for_finished_proposal(op,
11274 new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
11275 return true;
11276 }
224ce89b
WB
11277
11278 enum {
11279 OP_PG_UPMAP,
11280 OP_RM_PG_UPMAP,
11281 OP_PG_UPMAP_ITEMS,
11282 OP_RM_PG_UPMAP_ITEMS,
11283 } option;
11284
11285 if (prefix == "osd pg-upmap") {
11286 option = OP_PG_UPMAP;
11287 } else if (prefix == "osd rm-pg-upmap") {
11288 option = OP_RM_PG_UPMAP;
11289 } else if (prefix == "osd pg-upmap-items") {
11290 option = OP_PG_UPMAP_ITEMS;
11291 } else {
11292 option = OP_RM_PG_UPMAP_ITEMS;
7c673cae 11293 }
224ce89b
WB
11294
11295 // check pending upmap changes
11296 switch (option) {
11297 case OP_PG_UPMAP: // fall through
11298 case OP_RM_PG_UPMAP:
11299 if (pending_inc.new_pg_upmap.count(pgid) ||
11300 pending_inc.old_pg_upmap.count(pgid)) {
11301 dout(10) << __func__ << " waiting for pending update on "
11302 << pgid << dendl;
11303 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11304 return true;
7c673cae 11305 }
224ce89b 11306 break;
7c673cae 11307
224ce89b
WB
11308 case OP_PG_UPMAP_ITEMS: // fall through
11309 case OP_RM_PG_UPMAP_ITEMS:
11310 if (pending_inc.new_pg_upmap_items.count(pgid) ||
11311 pending_inc.old_pg_upmap_items.count(pgid)) {
11312 dout(10) << __func__ << " waiting for pending update on "
11313 << pgid << dendl;
11314 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11315 return true;
11316 }
11317 break;
7c673cae 11318
224ce89b 11319 default:
11fdf7f2 11320 ceph_abort_msg("invalid option");
7c673cae 11321 }
224ce89b
WB
11322
11323 switch (option) {
11324 case OP_PG_UPMAP:
11325 {
11326 vector<int64_t> id_vec;
11fdf7f2 11327 if (!cmd_getval(cct, cmdmap, "id", id_vec)) {
224ce89b 11328 ss << "unable to parse 'id' value(s) '"
11fdf7f2 11329 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
224ce89b
WB
11330 err = -EINVAL;
11331 goto reply;
11332 }
11333
11334 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
11335 if ((int)id_vec.size() < pool_min_size) {
11336 ss << "num of osds (" << id_vec.size() <<") < pool min size ("
11337 << pool_min_size << ")";
11338 err = -EINVAL;
11339 goto reply;
11340 }
11341
11342 int pool_size = osdmap.get_pg_pool_size(pgid);
11343 if ((int)id_vec.size() > pool_size) {
11344 ss << "num of osds (" << id_vec.size() <<") > pool size ("
11345 << pool_size << ")";
11346 err = -EINVAL;
11347 goto reply;
11348 }
11349
11350 vector<int32_t> new_pg_upmap;
11351 for (auto osd : id_vec) {
11352 if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
11353 ss << "osd." << osd << " does not exist";
11354 err = -ENOENT;
11355 goto reply;
11356 }
11357 auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
11358 if (it != new_pg_upmap.end()) {
11359 ss << "osd." << osd << " already exists, ";
11360 continue;
11361 }
11362 new_pg_upmap.push_back(osd);
11363 }
11364
11365 if (new_pg_upmap.empty()) {
11366 ss << "no valid upmap items(pairs) is specified";
11367 err = -EINVAL;
11368 goto reply;
11369 }
11370
11371 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
11372 new_pg_upmap.begin(), new_pg_upmap.end());
11373 ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
7c673cae 11374 }
224ce89b
WB
11375 break;
11376
11377 case OP_RM_PG_UPMAP:
11378 {
11379 pending_inc.old_pg_upmap.insert(pgid);
11380 ss << "clear " << pgid << " pg_upmap mapping";
7c673cae 11381 }
224ce89b 11382 break;
7c673cae 11383
224ce89b
WB
11384 case OP_PG_UPMAP_ITEMS:
11385 {
11386 vector<int64_t> id_vec;
11fdf7f2 11387 if (!cmd_getval(cct, cmdmap, "id", id_vec)) {
224ce89b 11388 ss << "unable to parse 'id' value(s) '"
11fdf7f2 11389 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
224ce89b
WB
11390 err = -EINVAL;
11391 goto reply;
11392 }
11393
11394 if (id_vec.size() % 2) {
11395 ss << "you must specify pairs of osd ids to be remapped";
11396 err = -EINVAL;
11397 goto reply;
11398 }
11399
11400 int pool_size = osdmap.get_pg_pool_size(pgid);
11401 if ((int)(id_vec.size() / 2) > pool_size) {
11402 ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
11403 << pool_size << ")";
11404 err = -EINVAL;
11405 goto reply;
11406 }
11407
11408 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
11409 ostringstream items;
11410 items << "[";
11411 for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
11412 int from = *p++;
11413 int to = *p;
11414 if (from == to) {
11415 ss << "from osd." << from << " == to osd." << to << ", ";
11416 continue;
11417 }
11418 if (!osdmap.exists(from)) {
11419 ss << "osd." << from << " does not exist";
11420 err = -ENOENT;
11421 goto reply;
11422 }
11423 if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
11424 ss << "osd." << to << " does not exist";
11425 err = -ENOENT;
11426 goto reply;
11427 }
c07f9fc5
FG
11428 pair<int32_t,int32_t> entry = make_pair(from, to);
11429 auto it = std::find(new_pg_upmap_items.begin(),
11430 new_pg_upmap_items.end(), entry);
11431 if (it != new_pg_upmap_items.end()) {
11432 ss << "osd." << from << " -> osd." << to << " already exists, ";
11433 continue;
11434 }
11435 new_pg_upmap_items.push_back(entry);
224ce89b
WB
11436 items << from << "->" << to << ",";
11437 }
11438 string out(items.str());
11439 out.resize(out.size() - 1); // drop last ','
11440 out += "]";
11441
11442 if (new_pg_upmap_items.empty()) {
11443 ss << "no valid upmap items(pairs) is specified";
11444 err = -EINVAL;
11445 goto reply;
11446 }
11447
11448 pending_inc.new_pg_upmap_items[pgid] =
11449 mempool::osdmap::vector<pair<int32_t,int32_t>>(
11450 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
11451 ss << "set " << pgid << " pg_upmap_items mapping to " << out;
11452 }
11453 break;
11454
11455 case OP_RM_PG_UPMAP_ITEMS:
11456 {
11457 pending_inc.old_pg_upmap_items.insert(pgid);
11458 ss << "clear " << pgid << " pg_upmap_items mapping";
11459 }
11460 break;
11461
11462 default:
11fdf7f2 11463 ceph_abort_msg("invalid option");
7c673cae
FG
11464 }
11465
7c673cae
FG
11466 goto update;
11467 } else if (prefix == "osd primary-affinity") {
11468 int64_t id;
11fdf7f2 11469 if (!cmd_getval(cct, cmdmap, "id", id)) {
7c673cae 11470 ss << "invalid osd id value '"
11fdf7f2 11471 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
7c673cae
FG
11472 err = -EINVAL;
11473 goto reply;
11474 }
11475 double w;
11fdf7f2 11476 if (!cmd_getval(cct, cmdmap, "weight", w)) {
7c673cae 11477 ss << "unable to parse 'weight' value '"
11fdf7f2 11478 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
11479 err = -EINVAL;
11480 goto reply;
11481 }
11482 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
11483 if (ww < 0L) {
11484 ss << "weight must be >= 0";
11485 err = -EINVAL;
11486 goto reply;
11487 }
31f18b77
FG
11488 if (osdmap.require_min_compat_client > 0 &&
11489 osdmap.require_min_compat_client < CEPH_RELEASE_FIREFLY) {
11490 ss << "require_min_compat_client "
11491 << ceph_release_name(osdmap.require_min_compat_client)
7c673cae
FG
11492 << " < firefly, which is required for primary-affinity";
11493 err = -EPERM;
11494 goto reply;
7c673cae 11495 }
7c673cae
FG
11496 if (osdmap.exists(id)) {
11497 pending_inc.new_primary_affinity[id] = ww;
11498 ss << "set osd." << id << " primary-affinity to " << w << " (" << ios::hex << ww << ios::dec << ")";
11499 getline(ss, rs);
11500 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11501 get_last_committed() + 1));
11502 return true;
11503 } else {
11504 ss << "osd." << id << " does not exist";
11505 err = -ENOENT;
11506 goto reply;
11507 }
11508 } else if (prefix == "osd reweight") {
11509 int64_t id;
11fdf7f2 11510 if (!cmd_getval(cct, cmdmap, "id", id)) {
7c673cae 11511 ss << "unable to parse osd id value '"
11fdf7f2 11512 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
7c673cae
FG
11513 err = -EINVAL;
11514 goto reply;
11515 }
11516 double w;
11fdf7f2 11517 if (!cmd_getval(cct, cmdmap, "weight", w)) {
7c673cae 11518 ss << "unable to parse weight value '"
11fdf7f2 11519 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
7c673cae
FG
11520 err = -EINVAL;
11521 goto reply;
11522 }
11523 long ww = (int)((double)CEPH_OSD_IN*w);
11524 if (ww < 0L) {
11525 ss << "weight must be >= 0";
11526 err = -EINVAL;
11527 goto reply;
11528 }
11529 if (osdmap.exists(id)) {
11530 pending_inc.new_weight[id] = ww;
11531 ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
11532 getline(ss, rs);
11533 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11534 get_last_committed() + 1));
11535 return true;
11536 } else {
11537 ss << "osd." << id << " does not exist";
11538 err = -ENOENT;
11539 goto reply;
11540 }
11541 } else if (prefix == "osd reweightn") {
11542 map<int32_t, uint32_t> weights;
11fdf7f2 11543 err = parse_reweights(cct, cmdmap, osdmap, &weights);
7c673cae
FG
11544 if (err) {
11545 ss << "unable to parse 'weights' value '"
11fdf7f2 11546 << cmd_vartype_stringify(cmdmap.at("weights")) << "'";
7c673cae
FG
11547 goto reply;
11548 }
11549 pending_inc.new_weight.insert(weights.begin(), weights.end());
11550 wait_for_finished_proposal(
11551 op,
11552 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
224ce89b 11553 return true;
7c673cae
FG
11554 } else if (prefix == "osd lost") {
11555 int64_t id;
11fdf7f2 11556 if (!cmd_getval(cct, cmdmap, "id", id)) {
7c673cae 11557 ss << "unable to parse osd id value '"
11fdf7f2 11558 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
7c673cae
FG
11559 err = -EINVAL;
11560 goto reply;
11561 }
11fdf7f2
TL
11562 bool sure = false;
11563 cmd_getval(g_ceph_context, cmdmap, "yes_i_really_mean_it", sure);
11564 if (!sure) {
7c673cae
FG
11565 ss << "are you SURE? this might mean real, permanent data loss. pass "
11566 "--yes-i-really-mean-it if you really do.";
11567 err = -EPERM;
11568 goto reply;
11569 } else if (!osdmap.exists(id)) {
11570 ss << "osd." << id << " does not exist";
11571 err = -ENOENT;
11572 goto reply;
11573 } else if (!osdmap.is_down(id)) {
11574 ss << "osd." << id << " is not down";
11575 err = -EBUSY;
11576 goto reply;
11577 } else {
11578 epoch_t e = osdmap.get_info(id).down_at;
11579 pending_inc.new_lost[id] = e;
11580 ss << "marked osd lost in epoch " << e;
11581 getline(ss, rs);
11582 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11583 get_last_committed() + 1));
11584 return true;
11585 }
11586
11fdf7f2
TL
11587 } else if (prefix == "osd destroy-actual" ||
11588 prefix == "osd purge-actual" ||
11589 prefix == "osd purge-new") {
31f18b77
FG
11590 /* Destroying an OSD means that we don't expect to further make use of
11591 * the OSDs data (which may even become unreadable after this operation),
11592 * and that we are okay with scrubbing all its cephx keys and config-key
11593 * data (which may include lockbox keys, thus rendering the osd's data
11594 * unreadable).
11595 *
11596 * The OSD will not be removed. Instead, we will mark it as destroyed,
11597 * such that a subsequent call to `create` will not reuse the osd id.
11598 * This will play into being able to recreate the OSD, at the same
11599 * crush location, with minimal data movement.
11600 */
11601
11602 // make sure authmon is writeable.
11603 if (!mon->authmon()->is_writeable()) {
11604 dout(10) << __func__ << " waiting for auth mon to be writeable for "
11605 << "osd destroy" << dendl;
11606 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
11607 return false;
11608 }
11609
11610 int64_t id;
11fdf7f2
TL
11611 if (!cmd_getval(cct, cmdmap, "id", id)) {
11612 auto p = cmdmap.find("id");
11613 if (p == cmdmap.end()) {
11614 ss << "no osd id specified";
11615 } else {
11616 ss << "unable to parse osd id value '"
11617 << cmd_vartype_stringify(cmdmap.at("id")) << "";
11618 }
31f18b77
FG
11619 err = -EINVAL;
11620 goto reply;
11621 }
11622
11fdf7f2 11623 bool is_destroy = (prefix == "osd destroy-actual");
31f18b77 11624 if (!is_destroy) {
11fdf7f2
TL
11625 ceph_assert("osd purge-actual" == prefix ||
11626 "osd purge-new" == prefix);
31f18b77
FG
11627 }
11628
11fdf7f2
TL
11629 bool sure = false;
11630 cmd_getval(g_ceph_context, cmdmap, "yes_i_really_mean_it", sure);
11631 if (!sure) {
11632 ss << "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
11633 << "This will mean real, permanent data loss, as well "
11634 << "as deletion of cephx and lockbox keys. "
11635 << "Pass --yes-i-really-mean-it if you really do.";
31f18b77
FG
11636 err = -EPERM;
11637 goto reply;
d2e6a577 11638 } else if (!osdmap.exists(id)) {
31f18b77 11639 ss << "osd." << id << " does not exist";
d2e6a577 11640 err = 0; // idempotent
31f18b77
FG
11641 goto reply;
11642 } else if (osdmap.is_up(id)) {
11643 ss << "osd." << id << " is not `down`.";
11644 err = -EBUSY;
11645 goto reply;
11646 } else if (is_destroy && osdmap.is_destroyed(id)) {
11647 ss << "destroyed osd." << id;
11648 err = 0;
11649 goto reply;
11650 }
11651
11fdf7f2
TL
11652 if (prefix == "osd purge-new" &&
11653 (osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
11654 ss << "osd." << id << " is not new";
11655 err = -EPERM;
11656 goto reply;
11657 }
11658
31f18b77
FG
11659 bool goto_reply = false;
11660
11661 paxos->plug();
11662 if (is_destroy) {
11663 err = prepare_command_osd_destroy(id, ss);
11664 // we checked above that it should exist.
11fdf7f2 11665 ceph_assert(err != -ENOENT);
31f18b77
FG
11666 } else {
11667 err = prepare_command_osd_purge(id, ss);
11668 if (err == -ENOENT) {
11669 err = 0;
11670 ss << "osd." << id << " does not exist.";
11671 goto_reply = true;
11672 }
11673 }
11674 paxos->unplug();
11675
11676 if (err < 0 || goto_reply) {
11677 goto reply;
11678 }
11679
11680 if (is_destroy) {
11681 ss << "destroyed osd." << id;
11682 } else {
11683 ss << "purged osd." << id;
11684 }
11685
11686 getline(ss, rs);
11687 wait_for_finished_proposal(op,
11688 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
11689 force_immediate_propose();
11690 return true;
11691
11692 } else if (prefix == "osd new") {
11693
11694 // make sure authmon is writeable.
11695 if (!mon->authmon()->is_writeable()) {
11696 dout(10) << __func__ << " waiting for auth mon to be writeable for "
224ce89b 11697 << "osd new" << dendl;
31f18b77
FG
11698 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
11699 return false;
11700 }
11701
3a9019d9 11702 map<string,string> param_map;
31f18b77
FG
11703
11704 bufferlist bl = m->get_data();
3a9019d9
FG
11705 string param_json = bl.to_str();
11706 dout(20) << __func__ << " osd new json = " << param_json << dendl;
31f18b77 11707
3a9019d9 11708 err = get_json_str_map(param_json, ss, &param_map);
31f18b77
FG
11709 if (err < 0)
11710 goto reply;
11711
3a9019d9 11712 dout(20) << __func__ << " osd new params " << param_map << dendl;
31f18b77
FG
11713
11714 paxos->plug();
3a9019d9 11715 err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
31f18b77
FG
11716 paxos->unplug();
11717
11718 if (err < 0) {
11719 goto reply;
11720 }
11721
11722 if (f) {
11723 f->flush(rdata);
11724 } else {
11725 rdata.append(ss);
11726 }
11727
11728 if (err == EEXIST) {
11729 // idempotent operation
11730 err = 0;
11731 goto reply;
11732 }
11733
11734 wait_for_finished_proposal(op,
11735 new Monitor::C_Command(mon, op, 0, rs, rdata,
11736 get_last_committed() + 1));
11737 force_immediate_propose();
11738 return true;
11739
7c673cae 11740 } else if (prefix == "osd create") {
7c673cae
FG
11741
11742 // optional id provided?
31f18b77 11743 int64_t id = -1, cmd_id = -1;
11fdf7f2 11744 if (cmd_getval(cct, cmdmap, "id", cmd_id)) {
31f18b77
FG
11745 if (cmd_id < 0) {
11746 ss << "invalid osd id value '" << cmd_id << "'";
7c673cae
FG
11747 err = -EINVAL;
11748 goto reply;
11749 }
31f18b77 11750 dout(10) << " osd create got id " << cmd_id << dendl;
7c673cae
FG
11751 }
11752
7c673cae
FG
11753 uuid_d uuid;
11754 string uuidstr;
11fdf7f2 11755 if (cmd_getval(cct, cmdmap, "uuid", uuidstr)) {
7c673cae 11756 if (!uuid.parse(uuidstr.c_str())) {
31f18b77
FG
11757 ss << "invalid uuid value '" << uuidstr << "'";
11758 err = -EINVAL;
11759 goto reply;
7c673cae 11760 }
31f18b77
FG
11761 // we only care about the id if we also have the uuid, to
11762 // ensure the operation's idempotency.
11763 id = cmd_id;
7c673cae
FG
11764 }
11765
31f18b77
FG
11766 int32_t new_id = -1;
11767 err = prepare_command_osd_create(id, uuid, &new_id, ss);
11768 if (err < 0) {
11769 if (err == -EAGAIN) {
11770 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11771 return true;
11772 }
11773 // a check has failed; reply to the user.
11774 goto reply;
11775
11776 } else if (err == EEXIST) {
11777 // this is an idempotent operation; we can go ahead and reply.
11778 if (f) {
11779 f->open_object_section("created_osd");
11780 f->dump_int("osdid", new_id);
11781 f->close_section();
11782 f->flush(rdata);
11783 } else {
11784 ss << new_id;
11785 rdata.append(ss);
7c673cae 11786 }
31f18b77
FG
11787 err = 0;
11788 goto reply;
7c673cae
FG
11789 }
11790
3a9019d9
FG
11791 string empty_device_class;
11792 do_osd_create(id, uuid, empty_device_class, &new_id);
31f18b77 11793
7c673cae
FG
11794 if (f) {
11795 f->open_object_section("created_osd");
31f18b77 11796 f->dump_int("osdid", new_id);
7c673cae
FG
11797 f->close_section();
11798 f->flush(rdata);
11799 } else {
31f18b77 11800 ss << new_id;
7c673cae
FG
11801 rdata.append(ss);
11802 }
31f18b77
FG
11803 wait_for_finished_proposal(op,
11804 new Monitor::C_Command(mon, op, 0, rs, rdata,
11805 get_last_committed() + 1));
7c673cae
FG
11806 return true;
11807
11808 } else if (prefix == "osd blacklist clear") {
11809 pending_inc.new_blacklist.clear();
11810 std::list<std::pair<entity_addr_t,utime_t > > blacklist;
11811 osdmap.get_blacklist(&blacklist);
11812 for (const auto &entry : blacklist) {
11813 pending_inc.old_blacklist.push_back(entry.first);
11814 }
11815 ss << " removed all blacklist entries";
11816 getline(ss, rs);
11817 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11818 get_last_committed() + 1));
11819 return true;
11820 } else if (prefix == "osd blacklist") {
11821 string addrstr;
11fdf7f2 11822 cmd_getval(cct, cmdmap, "addr", addrstr);
7c673cae
FG
11823 entity_addr_t addr;
11824 if (!addr.parse(addrstr.c_str(), 0)) {
11825 ss << "unable to parse address " << addrstr;
11826 err = -EINVAL;
11827 goto reply;
11828 }
11829 else {
11fdf7f2
TL
11830 if (osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS) {
11831 // always blacklist type ANY
11832 addr.set_type(entity_addr_t::TYPE_ANY);
11833 } else {
11834 addr.set_type(entity_addr_t::TYPE_LEGACY);
11835 }
11836
7c673cae 11837 string blacklistop;
11fdf7f2 11838 cmd_getval(cct, cmdmap, "blacklistop", blacklistop);
7c673cae
FG
11839 if (blacklistop == "add") {
11840 utime_t expires = ceph_clock_now();
11841 double d;
11842 // default one hour
11fdf7f2
TL
11843 cmd_getval(cct, cmdmap, "expire", d,
11844 g_conf()->mon_osd_blacklist_default_expire);
7c673cae
FG
11845 expires += d;
11846
11847 pending_inc.new_blacklist[addr] = expires;
224ce89b
WB
11848
11849 {
11850 // cancel any pending un-blacklisting request too
11851 auto it = std::find(pending_inc.old_blacklist.begin(),
11852 pending_inc.old_blacklist.end(), addr);
11853 if (it != pending_inc.old_blacklist.end()) {
11854 pending_inc.old_blacklist.erase(it);
11855 }
11856 }
11857
7c673cae
FG
11858 ss << "blacklisting " << addr << " until " << expires << " (" << d << " sec)";
11859 getline(ss, rs);
11860 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11861 get_last_committed() + 1));
11862 return true;
11863 } else if (blacklistop == "rm") {
11864 if (osdmap.is_blacklisted(addr) ||
11865 pending_inc.new_blacklist.count(addr)) {
11866 if (osdmap.is_blacklisted(addr))
11867 pending_inc.old_blacklist.push_back(addr);
11868 else
11869 pending_inc.new_blacklist.erase(addr);
11870 ss << "un-blacklisting " << addr;
11871 getline(ss, rs);
11872 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11873 get_last_committed() + 1));
11874 return true;
11875 }
11876 ss << addr << " isn't blacklisted";
11877 err = 0;
11878 goto reply;
11879 }
11880 }
11881 } else if (prefix == "osd pool mksnap") {
11882 string poolstr;
11fdf7f2 11883 cmd_getval(cct, cmdmap, "pool", poolstr);
7c673cae
FG
11884 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
11885 if (pool < 0) {
11886 ss << "unrecognized pool '" << poolstr << "'";
11887 err = -ENOENT;
11888 goto reply;
11889 }
11890 string snapname;
11fdf7f2 11891 cmd_getval(cct, cmdmap, "snap", snapname);
7c673cae
FG
11892 const pg_pool_t *p = osdmap.get_pg_pool(pool);
11893 if (p->is_unmanaged_snaps_mode()) {
11894 ss << "pool " << poolstr << " is in unmanaged snaps mode";
11895 err = -EINVAL;
11896 goto reply;
11897 } else if (p->snap_exists(snapname.c_str())) {
11898 ss << "pool " << poolstr << " snap " << snapname << " already exists";
11899 err = 0;
11900 goto reply;
11901 } else if (p->is_tier()) {
11902 ss << "pool " << poolstr << " is a cache tier";
11903 err = -EINVAL;
11904 goto reply;
11905 }
11906 pg_pool_t *pp = 0;
11907 if (pending_inc.new_pools.count(pool))
11908 pp = &pending_inc.new_pools[pool];
11909 if (!pp) {
11910 pp = &pending_inc.new_pools[pool];
11911 *pp = *p;
11912 }
11913 if (pp->snap_exists(snapname.c_str())) {
11914 ss << "pool " << poolstr << " snap " << snapname << " already exists";
11915 } else {
11916 pp->add_snap(snapname.c_str(), ceph_clock_now());
11917 pp->set_snap_epoch(pending_inc.epoch);
11918 ss << "created pool " << poolstr << " snap " << snapname;
11919 }
11920 getline(ss, rs);
11921 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11922 get_last_committed() + 1));
11923 return true;
11924 } else if (prefix == "osd pool rmsnap") {
11925 string poolstr;
11fdf7f2 11926 cmd_getval(cct, cmdmap, "pool", poolstr);
7c673cae
FG
11927 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
11928 if (pool < 0) {
11929 ss << "unrecognized pool '" << poolstr << "'";
11930 err = -ENOENT;
11931 goto reply;
11932 }
11933 string snapname;
11fdf7f2 11934 cmd_getval(cct, cmdmap, "snap", snapname);
7c673cae
FG
11935 const pg_pool_t *p = osdmap.get_pg_pool(pool);
11936 if (p->is_unmanaged_snaps_mode()) {
11937 ss << "pool " << poolstr << " is in unmanaged snaps mode";
11938 err = -EINVAL;
11939 goto reply;
11940 } else if (!p->snap_exists(snapname.c_str())) {
11941 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
11942 err = 0;
11943 goto reply;
11944 }
11945 pg_pool_t *pp = 0;
11946 if (pending_inc.new_pools.count(pool))
11947 pp = &pending_inc.new_pools[pool];
11948 if (!pp) {
11949 pp = &pending_inc.new_pools[pool];
11950 *pp = *p;
11951 }
11952 snapid_t sn = pp->snap_exists(snapname.c_str());
11953 if (sn) {
11954 pp->remove_snap(sn);
11955 pp->set_snap_epoch(pending_inc.epoch);
11956 ss << "removed pool " << poolstr << " snap " << snapname;
11957 } else {
11958 ss << "already removed pool " << poolstr << " snap " << snapname;
11959 }
11960 getline(ss, rs);
11961 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11962 get_last_committed() + 1));
11963 return true;
11964 } else if (prefix == "osd pool create") {
11fdf7f2 11965 int64_t pg_num, pg_num_min;
7c673cae 11966 int64_t pgp_num;
11fdf7f2
TL
11967 cmd_getval(cct, cmdmap, "pg_num", pg_num, int64_t(0));
11968 cmd_getval(cct, cmdmap, "pgp_num", pgp_num, pg_num);
11969 cmd_getval(cct, cmdmap, "pg_num_min", pg_num_min, int64_t(0));
7c673cae
FG
11970
11971 string pool_type_str;
11fdf7f2 11972 cmd_getval(cct, cmdmap, "pool_type", pool_type_str);
7c673cae 11973 if (pool_type_str.empty())
11fdf7f2 11974 pool_type_str = g_conf().get_val<string>("osd_pool_default_type");
7c673cae
FG
11975
11976 string poolstr;
11fdf7f2 11977 cmd_getval(cct, cmdmap, "pool", poolstr);
7c673cae
FG
11978 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11979 if (pool_id >= 0) {
11980 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11981 if (pool_type_str != p->get_type_name()) {
11982 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
11983 err = -EINVAL;
11984 } else {
11985 ss << "pool '" << poolstr << "' already exists";
11986 err = 0;
11987 }
11988 goto reply;
11989 }
11990
11991 int pool_type;
11992 if (pool_type_str == "replicated") {
11993 pool_type = pg_pool_t::TYPE_REPLICATED;
11994 } else if (pool_type_str == "erasure") {
7c673cae
FG
11995 pool_type = pg_pool_t::TYPE_ERASURE;
11996 } else {
11997 ss << "unknown pool type '" << pool_type_str << "'";
11998 err = -EINVAL;
11999 goto reply;
12000 }
12001
31f18b77 12002 bool implicit_rule_creation = false;
94b18763 12003 int64_t expected_num_objects = 0;
31f18b77 12004 string rule_name;
11fdf7f2 12005 cmd_getval(cct, cmdmap, "rule", rule_name);
7c673cae 12006 string erasure_code_profile;
11fdf7f2 12007 cmd_getval(cct, cmdmap, "erasure_code_profile", erasure_code_profile);
7c673cae
FG
12008
12009 if (pool_type == pg_pool_t::TYPE_ERASURE) {
12010 if (erasure_code_profile == "")
12011 erasure_code_profile = "default";
12012 //handle the erasure code profile
12013 if (erasure_code_profile == "default") {
12014 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
12015 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
12016 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
12017 goto wait;
12018 }
12019
12020 map<string,string> profile_map;
11fdf7f2 12021 err = osdmap.get_erasure_code_profile_default(cct,
7c673cae
FG
12022 profile_map,
12023 &ss);
12024 if (err)
12025 goto reply;
12026 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
12027 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
12028 goto wait;
12029 }
12030 }
31f18b77
FG
12031 if (rule_name == "") {
12032 implicit_rule_creation = true;
7c673cae 12033 if (erasure_code_profile == "default") {
31f18b77 12034 rule_name = "erasure-code";
7c673cae 12035 } else {
31f18b77 12036 dout(1) << "implicitly use rule named after the pool: "
7c673cae 12037 << poolstr << dendl;
31f18b77 12038 rule_name = poolstr;
7c673cae
FG
12039 }
12040 }
11fdf7f2 12041 cmd_getval(g_ceph_context, cmdmap, "expected_num_objects",
94b18763 12042 expected_num_objects, int64_t(0));
7c673cae 12043 } else {
31f18b77 12044 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
94b18763
FG
12045 // and put expected_num_objects to rule field
12046 if (erasure_code_profile != "") { // cmd is from CLI
12047 if (rule_name != "") {
12048 string interr;
12049 expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
12050 if (interr.length()) {
12051 ss << "error parsing integer value '" << rule_name << "': " << interr;
12052 err = -EINVAL;
12053 goto reply;
12054 }
12055 }
12056 rule_name = erasure_code_profile;
12057 } else { // cmd is well-formed
11fdf7f2 12058 cmd_getval(g_ceph_context, cmdmap, "expected_num_objects",
94b18763
FG
12059 expected_num_objects, int64_t(0));
12060 }
7c673cae
FG
12061 }
12062
31f18b77
FG
12063 if (!implicit_rule_creation && rule_name != "") {
12064 int rule;
12065 err = get_crush_rule(rule_name, &rule, &ss);
7c673cae
FG
12066 if (err == -EAGAIN) {
12067 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12068 return true;
12069 }
12070 if (err)
12071 goto reply;
12072 }
12073
7c673cae
FG
12074 if (expected_num_objects < 0) {
12075 ss << "'expected_num_objects' must be non-negative";
12076 err = -EINVAL;
12077 goto reply;
12078 }
12079
91327a77
AA
12080 if (expected_num_objects > 0 &&
12081 cct->_conf->osd_objectstore == "filestore" &&
12082 cct->_conf->filestore_merge_threshold > 0) {
12083 ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
12084 err = -EINVAL;
12085 goto reply;
12086 }
12087
12088 if (expected_num_objects == 0 &&
12089 cct->_conf->osd_objectstore == "filestore" &&
12090 cct->_conf->filestore_merge_threshold < 0) {
12091 int osds = osdmap.get_num_osds();
12092 if (osds && (pg_num >= 1024 || pg_num / osds >= 100)) {
12093 ss << "For better initial performance on pools expected to store a "
12094 << "large number of objects, consider supplying the "
12095 << "expected_num_objects parameter when creating the pool.\n";
12096 }
12097 }
12098
7c673cae 12099 int64_t fast_read_param;
11fdf7f2 12100 cmd_getval(cct, cmdmap, "fast_read", fast_read_param, int64_t(-1));
7c673cae
FG
12101 FastReadType fast_read = FAST_READ_DEFAULT;
12102 if (fast_read_param == 0)
12103 fast_read = FAST_READ_OFF;
12104 else if (fast_read_param > 0)
12105 fast_read = FAST_READ_ON;
11fdf7f2
TL
12106
12107 int64_t repl_size = 0;
12108 cmd_getval(cct, cmdmap, "size", repl_size);
12109 int64_t target_size_bytes = 0;
12110 double target_size_ratio = 0.0;
12111 cmd_getval(cct, cmdmap, "target_size_bytes", target_size_bytes);
12112 cmd_getval(cct, cmdmap, "target_size_ratio", target_size_ratio);
12113
12114 err = prepare_new_pool(poolstr,
7c673cae 12115 -1, // default crush rule
31f18b77 12116 rule_name,
11fdf7f2
TL
12117 pg_num, pgp_num, pg_num_min,
12118 repl_size, target_size_bytes, target_size_ratio,
7c673cae
FG
12119 erasure_code_profile, pool_type,
12120 (uint64_t)expected_num_objects,
12121 fast_read,
12122 &ss);
12123 if (err < 0) {
12124 switch(err) {
12125 case -EEXIST:
12126 ss << "pool '" << poolstr << "' already exists";
12127 break;
12128 case -EAGAIN:
12129 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12130 return true;
12131 case -ERANGE:
12132 goto reply;
12133 default:
12134 goto reply;
12135 break;
12136 }
12137 } else {
12138 ss << "pool '" << poolstr << "' created";
12139 }
12140 getline(ss, rs);
12141 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12142 get_last_committed() + 1));
12143 return true;
12144
12145 } else if (prefix == "osd pool delete" ||
12146 prefix == "osd pool rm") {
12147 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
12148 string poolstr, poolstr2, sure;
11fdf7f2
TL
12149 cmd_getval(cct, cmdmap, "pool", poolstr);
12150 cmd_getval(cct, cmdmap, "pool2", poolstr2);
7c673cae
FG
12151 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12152 if (pool < 0) {
12153 ss << "pool '" << poolstr << "' does not exist";
12154 err = 0;
12155 goto reply;
12156 }
12157
11fdf7f2
TL
12158 bool force_no_fake = false;
12159 cmd_getval(cct, cmdmap, "yes_i_really_really_mean_it", force_no_fake);
12160 bool force = false;
12161 cmd_getval(cct, cmdmap, "yes_i_really_really_mean_it_not_faking", force);
7c673cae 12162 if (poolstr2 != poolstr ||
11fdf7f2 12163 (!force && !force_no_fake)) {
7c673cae
FG
12164 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
12165 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
12166 << "followed by --yes-i-really-really-mean-it.";
12167 err = -EPERM;
12168 goto reply;
12169 }
12170 err = _prepare_remove_pool(pool, &ss, force_no_fake);
12171 if (err == -EAGAIN) {
12172 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12173 return true;
12174 }
12175 if (err < 0)
12176 goto reply;
12177 goto update;
12178 } else if (prefix == "osd pool rename") {
12179 string srcpoolstr, destpoolstr;
11fdf7f2
TL
12180 cmd_getval(cct, cmdmap, "srcpool", srcpoolstr);
12181 cmd_getval(cct, cmdmap, "destpool", destpoolstr);
7c673cae
FG
12182 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
12183 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
12184
12185 if (pool_src < 0) {
12186 if (pool_dst >= 0) {
12187 // src pool doesn't exist, dst pool does exist: to ensure idempotency
12188 // of operations, assume this rename succeeded, as it is not changing
12189 // the current state. Make sure we output something understandable
12190 // for whoever is issuing the command, if they are paying attention,
12191 // in case it was not intentional; or to avoid a "wtf?" and a bug
12192 // report in case it was intentional, while expecting a failure.
12193 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
12194 << destpoolstr << "' does -- assuming successful rename";
12195 err = 0;
12196 } else {
12197 ss << "unrecognized pool '" << srcpoolstr << "'";
12198 err = -ENOENT;
12199 }
12200 goto reply;
12201 } else if (pool_dst >= 0) {
12202 // source pool exists and so does the destination pool
12203 ss << "pool '" << destpoolstr << "' already exists";
12204 err = -EEXIST;
12205 goto reply;
12206 }
12207
12208 int ret = _prepare_rename_pool(pool_src, destpoolstr);
12209 if (ret == 0) {
12210 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
12211 } else {
12212 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
12213 << cpp_strerror(ret);
12214 }
12215 getline(ss, rs);
12216 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
12217 get_last_committed() + 1));
12218 return true;
12219
12220 } else if (prefix == "osd pool set") {
12221 err = prepare_command_pool_set(cmdmap, ss);
12222 if (err == -EAGAIN)
12223 goto wait;
12224 if (err < 0)
12225 goto reply;
12226
12227 getline(ss, rs);
12228 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12229 get_last_committed() + 1));
12230 return true;
12231 } else if (prefix == "osd tier add") {
12232 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12233 if (err == -EAGAIN)
12234 goto wait;
12235 if (err)
12236 goto reply;
12237 string poolstr;
11fdf7f2 12238 cmd_getval(cct, cmdmap, "pool", poolstr);
7c673cae
FG
12239 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12240 if (pool_id < 0) {
12241 ss << "unrecognized pool '" << poolstr << "'";
12242 err = -ENOENT;
12243 goto reply;
12244 }
12245 string tierpoolstr;
11fdf7f2 12246 cmd_getval(cct, cmdmap, "tierpool", tierpoolstr);
7c673cae
FG
12247 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
12248 if (tierpool_id < 0) {
12249 ss << "unrecognized pool '" << tierpoolstr << "'";
12250 err = -ENOENT;
12251 goto reply;
12252 }
12253 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 12254 ceph_assert(p);
7c673cae 12255 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11fdf7f2 12256 ceph_assert(tp);
7c673cae
FG
12257
12258 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
12259 goto reply;
12260 }
12261
12262 // make sure new tier is empty
12263 string force_nonempty;
11fdf7f2
TL
12264 cmd_getval(cct, cmdmap, "force_nonempty", force_nonempty);
12265 const pool_stat_t *pstats = mon->mgrstatmon()->get_pool_stat(tierpool_id);
31f18b77 12266 if (pstats && pstats->stats.sum.num_objects != 0 &&
7c673cae
FG
12267 force_nonempty != "--force-nonempty") {
12268 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
12269 err = -ENOTEMPTY;
12270 goto reply;
12271 }
11fdf7f2 12272 if (tp->is_erasure()) {
7c673cae
FG
12273 ss << "tier pool '" << tierpoolstr
12274 << "' is an ec pool, which cannot be a tier";
12275 err = -ENOTSUP;
12276 goto reply;
12277 }
12278 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
12279 ((force_nonempty != "--force-nonempty") ||
11fdf7f2 12280 (!g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps))) {
7c673cae
FG
12281 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
12282 err = -ENOTEMPTY;
12283 goto reply;
12284 }
12285 // go
12286 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12287 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
12288 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
12289 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12290 return true;
12291 }
12292 np->tiers.insert(tierpool_id);
12293 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
12294 ntp->tier_of = pool_id;
12295 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
12296 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12297 get_last_committed() + 1));
12298 return true;
12299 } else if (prefix == "osd tier remove" ||
12300 prefix == "osd tier rm") {
12301 string poolstr;
11fdf7f2 12302 cmd_getval(cct, cmdmap, "pool", poolstr);
7c673cae
FG
12303 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12304 if (pool_id < 0) {
12305 ss << "unrecognized pool '" << poolstr << "'";
12306 err = -ENOENT;
12307 goto reply;
12308 }
12309 string tierpoolstr;
11fdf7f2 12310 cmd_getval(cct, cmdmap, "tierpool", tierpoolstr);
7c673cae
FG
12311 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
12312 if (tierpool_id < 0) {
12313 ss << "unrecognized pool '" << tierpoolstr << "'";
12314 err = -ENOENT;
12315 goto reply;
12316 }
12317 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 12318 ceph_assert(p);
7c673cae 12319 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11fdf7f2 12320 ceph_assert(tp);
7c673cae
FG
12321
12322 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
12323 goto reply;
12324 }
12325
12326 if (p->tiers.count(tierpool_id) == 0) {
12327 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
12328 err = 0;
12329 goto reply;
12330 }
12331 if (tp->tier_of != pool_id) {
12332 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
12333 << osdmap.get_pool_name(tp->tier_of) << "': "
12334 // be scary about it; this is an inconsistency and bells must go off
12335 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
12336 err = -EINVAL;
12337 goto reply;
12338 }
12339 if (p->read_tier == tierpool_id) {
12340 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
12341 err = -EBUSY;
12342 goto reply;
12343 }
12344 // go
12345 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12346 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
12347 if (np->tiers.count(tierpool_id) == 0 ||
12348 ntp->tier_of != pool_id ||
12349 np->read_tier == tierpool_id) {
12350 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12351 return true;
12352 }
12353 np->tiers.erase(tierpool_id);
12354 ntp->clear_tier();
12355 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
12356 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12357 get_last_committed() + 1));
12358 return true;
12359 } else if (prefix == "osd tier set-overlay") {
12360 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12361 if (err == -EAGAIN)
12362 goto wait;
12363 if (err)
12364 goto reply;
12365 string poolstr;
11fdf7f2 12366 cmd_getval(cct, cmdmap, "pool", poolstr);
7c673cae
FG
12367 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12368 if (pool_id < 0) {
12369 ss << "unrecognized pool '" << poolstr << "'";
12370 err = -ENOENT;
12371 goto reply;
12372 }
12373 string overlaypoolstr;
11fdf7f2 12374 cmd_getval(cct, cmdmap, "overlaypool", overlaypoolstr);
7c673cae
FG
12375 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
12376 if (overlaypool_id < 0) {
12377 ss << "unrecognized pool '" << overlaypoolstr << "'";
12378 err = -ENOENT;
12379 goto reply;
12380 }
12381 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 12382 ceph_assert(p);
7c673cae 12383 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
11fdf7f2 12384 ceph_assert(overlay_p);
7c673cae
FG
12385 if (p->tiers.count(overlaypool_id) == 0) {
12386 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
12387 err = -EINVAL;
12388 goto reply;
12389 }
12390 if (p->read_tier == overlaypool_id) {
12391 err = 0;
12392 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
12393 goto reply;
12394 }
12395 if (p->has_read_tier()) {
12396 ss << "pool '" << poolstr << "' has overlay '"
12397 << osdmap.get_pool_name(p->read_tier)
12398 << "'; please remove-overlay first";
12399 err = -EINVAL;
12400 goto reply;
12401 }
12402
12403 // go
12404 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12405 np->read_tier = overlaypool_id;
12406 np->write_tier = overlaypool_id;
12407 np->set_last_force_op_resend(pending_inc.epoch);
12408 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
12409 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
12410 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
12411 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
12412 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
12413 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12414 get_last_committed() + 1));
12415 return true;
12416 } else if (prefix == "osd tier remove-overlay" ||
12417 prefix == "osd tier rm-overlay") {
12418 string poolstr;
11fdf7f2 12419 cmd_getval(cct, cmdmap, "pool", poolstr);
7c673cae
FG
12420 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12421 if (pool_id < 0) {
12422 ss << "unrecognized pool '" << poolstr << "'";
12423 err = -ENOENT;
12424 goto reply;
12425 }
12426 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 12427 ceph_assert(p);
7c673cae
FG
12428 if (!p->has_read_tier()) {
12429 err = 0;
12430 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
12431 goto reply;
12432 }
12433
12434 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
12435 goto reply;
12436 }
12437
12438 // go
12439 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12440 if (np->has_read_tier()) {
12441 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
12442 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
12443 nop->set_last_force_op_resend(pending_inc.epoch);
12444 }
12445 if (np->has_write_tier()) {
12446 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
12447 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
12448 nop->set_last_force_op_resend(pending_inc.epoch);
12449 }
12450 np->clear_read_tier();
12451 np->clear_write_tier();
12452 np->set_last_force_op_resend(pending_inc.epoch);
12453 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
12454 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12455 get_last_committed() + 1));
12456 return true;
12457 } else if (prefix == "osd tier cache-mode") {
12458 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12459 if (err == -EAGAIN)
12460 goto wait;
12461 if (err)
12462 goto reply;
12463 string poolstr;
11fdf7f2 12464 cmd_getval(cct, cmdmap, "pool", poolstr);
7c673cae
FG
12465 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12466 if (pool_id < 0) {
12467 ss << "unrecognized pool '" << poolstr << "'";
12468 err = -ENOENT;
12469 goto reply;
12470 }
12471 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 12472 ceph_assert(p);
7c673cae
FG
12473 if (!p->is_tier()) {
12474 ss << "pool '" << poolstr << "' is not a tier";
12475 err = -EINVAL;
12476 goto reply;
12477 }
12478 string modestr;
11fdf7f2 12479 cmd_getval(cct, cmdmap, "mode", modestr);
7c673cae
FG
12480 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
12481 if (mode < 0) {
12482 ss << "'" << modestr << "' is not a valid cache mode";
12483 err = -EINVAL;
12484 goto reply;
12485 }
12486
11fdf7f2
TL
12487 bool sure = false;
12488 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", sure);
12489
7c673cae
FG
12490 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12491 mode != pg_pool_t::CACHEMODE_NONE &&
12492 mode != pg_pool_t::CACHEMODE_PROXY &&
12493 mode != pg_pool_t::CACHEMODE_READPROXY) &&
11fdf7f2 12494 !sure) {
7c673cae
FG
12495 ss << "'" << modestr << "' is not a well-supported cache mode and may "
12496 << "corrupt your data. pass --yes-i-really-mean-it to force.";
12497 err = -EPERM;
12498 goto reply;
12499 }
12500
12501 // pool already has this cache-mode set and there are no pending changes
12502 if (p->cache_mode == mode &&
12503 (pending_inc.new_pools.count(pool_id) == 0 ||
12504 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
12505 ss << "set cache-mode for pool '" << poolstr << "'"
12506 << " to " << pg_pool_t::get_cache_mode_name(mode);
12507 err = 0;
12508 goto reply;
12509 }
12510
12511 /* Mode description:
12512 *
12513 * none: No cache-mode defined
12514 * forward: Forward all reads and writes to base pool
12515 * writeback: Cache writes, promote reads from base pool
12516 * readonly: Forward writes to base pool
12517 * readforward: Writes are in writeback mode, Reads are in forward mode
12518 * proxy: Proxy all reads and writes to base pool
12519 * readproxy: Writes are in writeback mode, Reads are in proxy mode
12520 *
12521 * Hence, these are the allowed transitions:
12522 *
12523 * none -> any
12524 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
12525 * proxy -> forward || readforward || readproxy || writeback || any IF num_objects_dirty == 0
12526 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
12527 * readproxy -> forward || proxy || readforward || writeback || any IF num_objects_dirty == 0
12528 * writeback -> readforward || readproxy || forward || proxy
12529 * readonly -> any
12530 */
12531
12532 // We check if the transition is valid against the current pool mode, as
12533 // it is the only committed state thus far. We will blantly squash
12534 // whatever mode is on the pending state.
12535
12536 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
12537 (mode != pg_pool_t::CACHEMODE_FORWARD &&
12538 mode != pg_pool_t::CACHEMODE_PROXY &&
12539 mode != pg_pool_t::CACHEMODE_READFORWARD &&
12540 mode != pg_pool_t::CACHEMODE_READPROXY)) {
12541 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
12542 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
12543 << "' pool; only '"
12544 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_FORWARD)
12545 << "','"
12546 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY)
12547 << "','"
12548 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READFORWARD)
12549 << "','"
12550 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
12551 << "' allowed.";
12552 err = -EINVAL;
12553 goto reply;
12554 }
12555 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
12556 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12557 mode != pg_pool_t::CACHEMODE_FORWARD &&
12558 mode != pg_pool_t::CACHEMODE_PROXY &&
12559 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
12560
12561 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
12562 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12563 mode != pg_pool_t::CACHEMODE_FORWARD &&
12564 mode != pg_pool_t::CACHEMODE_READFORWARD &&
12565 mode != pg_pool_t::CACHEMODE_PROXY)) ||
12566
12567 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
12568 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12569 mode != pg_pool_t::CACHEMODE_FORWARD &&
12570 mode != pg_pool_t::CACHEMODE_READFORWARD &&
12571 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
12572
12573 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
12574 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12575 mode != pg_pool_t::CACHEMODE_READFORWARD &&
12576 mode != pg_pool_t::CACHEMODE_PROXY &&
12577 mode != pg_pool_t::CACHEMODE_READPROXY))) {
12578
31f18b77 12579 const pool_stat_t* pstats =
11fdf7f2 12580 mon->mgrstatmon()->get_pool_stat(pool_id);
7c673cae 12581
31f18b77 12582 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
7c673cae
FG
12583 ss << "unable to set cache-mode '"
12584 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
12585 << "': dirty objects found";
12586 err = -EBUSY;
12587 goto reply;
12588 }
12589 }
12590 // go
12591 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12592 np->cache_mode = mode;
12593 // set this both when moving to and from cache_mode NONE. this is to
12594 // capture legacy pools that were set up before this flag existed.
12595 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
12596 ss << "set cache-mode for pool '" << poolstr
12597 << "' to " << pg_pool_t::get_cache_mode_name(mode);
12598 if (mode == pg_pool_t::CACHEMODE_NONE) {
12599 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
11fdf7f2 12600 ceph_assert(base_pool);
7c673cae
FG
12601 if (base_pool->read_tier == pool_id ||
12602 base_pool->write_tier == pool_id)
12603 ss <<" (WARNING: pool is still configured as read or write tier)";
12604 }
12605 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12606 get_last_committed() + 1));
12607 return true;
12608 } else if (prefix == "osd tier add-cache") {
12609 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12610 if (err == -EAGAIN)
12611 goto wait;
12612 if (err)
12613 goto reply;
12614 string poolstr;
11fdf7f2 12615 cmd_getval(cct, cmdmap, "pool", poolstr);
7c673cae
FG
12616 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12617 if (pool_id < 0) {
12618 ss << "unrecognized pool '" << poolstr << "'";
12619 err = -ENOENT;
12620 goto reply;
12621 }
12622 string tierpoolstr;
11fdf7f2 12623 cmd_getval(cct, cmdmap, "tierpool", tierpoolstr);
7c673cae
FG
12624 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
12625 if (tierpool_id < 0) {
12626 ss << "unrecognized pool '" << tierpoolstr << "'";
12627 err = -ENOENT;
12628 goto reply;
12629 }
12630 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11fdf7f2 12631 ceph_assert(p);
7c673cae 12632 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11fdf7f2 12633 ceph_assert(tp);
7c673cae
FG
12634
12635 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
12636 goto reply;
12637 }
12638
12639 int64_t size = 0;
11fdf7f2 12640 if (!cmd_getval(cct, cmdmap, "size", size)) {
7c673cae 12641 ss << "unable to parse 'size' value '"
11fdf7f2 12642 << cmd_vartype_stringify(cmdmap.at("size")) << "'";
7c673cae
FG
12643 err = -EINVAL;
12644 goto reply;
12645 }
12646 // make sure new tier is empty
31f18b77 12647 const pool_stat_t *pstats =
11fdf7f2 12648 mon->mgrstatmon()->get_pool_stat(tierpool_id);
31f18b77 12649 if (pstats && pstats->stats.sum.num_objects != 0) {
7c673cae
FG
12650 ss << "tier pool '" << tierpoolstr << "' is not empty";
12651 err = -ENOTEMPTY;
12652 goto reply;
12653 }
11fdf7f2 12654 auto& modestr = g_conf().get_val<string>("osd_tier_default_cache_mode");
7c673cae
FG
12655 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
12656 if (mode < 0) {
12657 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
12658 err = -EINVAL;
12659 goto reply;
12660 }
12661 HitSet::Params hsp;
11fdf7f2
TL
12662 auto& cache_hit_set_type =
12663 g_conf().get_val<string>("osd_tier_default_cache_hit_set_type");
12664 if (cache_hit_set_type == "bloom") {
7c673cae 12665 BloomHitSet::Params *bsp = new BloomHitSet::Params;
11fdf7f2 12666 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
7c673cae 12667 hsp = HitSet::Params(bsp);
11fdf7f2 12668 } else if (cache_hit_set_type == "explicit_hash") {
7c673cae 12669 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
11fdf7f2 12670 } else if (cache_hit_set_type == "explicit_object") {
7c673cae
FG
12671 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
12672 } else {
11fdf7f2
TL
12673 ss << "osd tier cache default hit set type '"
12674 << cache_hit_set_type << "' is not a known type";
7c673cae
FG
12675 err = -EINVAL;
12676 goto reply;
12677 }
12678 // go
12679 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12680 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
12681 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
12682 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12683 return true;
12684 }
12685 np->tiers.insert(tierpool_id);
12686 np->read_tier = np->write_tier = tierpool_id;
12687 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
12688 np->set_last_force_op_resend(pending_inc.epoch);
12689 ntp->set_last_force_op_resend(pending_inc.epoch);
12690 ntp->tier_of = pool_id;
12691 ntp->cache_mode = mode;
11fdf7f2
TL
12692 ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count");
12693 ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period");
12694 ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
12695 ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
12696 ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
12697 ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
7c673cae
FG
12698 ntp->hit_set_params = hsp;
12699 ntp->target_max_bytes = size;
12700 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
12701 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12702 get_last_committed() + 1));
12703 return true;
12704 } else if (prefix == "osd pool set-quota") {
12705 string poolstr;
11fdf7f2 12706 cmd_getval(cct, cmdmap, "pool", poolstr);
7c673cae
FG
12707 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12708 if (pool_id < 0) {
12709 ss << "unrecognized pool '" << poolstr << "'";
12710 err = -ENOENT;
12711 goto reply;
12712 }
12713
12714 string field;
11fdf7f2 12715 cmd_getval(cct, cmdmap, "field", field);
7c673cae
FG
12716 if (field != "max_objects" && field != "max_bytes") {
12717 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
12718 err = -EINVAL;
12719 goto reply;
12720 }
12721
12722 // val could contain unit designations, so we treat as a string
12723 string val;
11fdf7f2 12724 cmd_getval(cct, cmdmap, "val", val);
1adf2230
AA
12725 string tss;
12726 int64_t value;
12727 if (field == "max_objects") {
12728 value = strict_sistrtoll(val.c_str(), &tss);
12729 } else if (field == "max_bytes") {
12730 value = strict_iecstrtoll(val.c_str(), &tss);
12731 } else {
11fdf7f2 12732 ceph_abort_msg("unrecognized option");
1adf2230
AA
12733 }
12734 if (!tss.empty()) {
12735 ss << "error parsing value '" << val << "': " << tss;
12736 err = -EINVAL;
7c673cae
FG
12737 goto reply;
12738 }
12739
12740 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
12741 if (field == "max_objects") {
12742 pi->quota_max_objects = value;
12743 } else if (field == "max_bytes") {
12744 pi->quota_max_bytes = value;
12745 } else {
11fdf7f2 12746 ceph_abort_msg("unrecognized option");
7c673cae
FG
12747 }
12748 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
12749 rs = ss.str();
12750 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12751 get_last_committed() + 1));
12752 return true;
c07f9fc5
FG
12753 } else if (prefix == "osd pool application enable" ||
12754 prefix == "osd pool application disable" ||
12755 prefix == "osd pool application set" ||
12756 prefix == "osd pool application rm") {
12757 err = prepare_command_pool_application(prefix, cmdmap, ss);
11fdf7f2 12758 if (err == -EAGAIN) {
c07f9fc5 12759 goto wait;
11fdf7f2 12760 } else if (err < 0) {
7c673cae 12761 goto reply;
7c673cae 12762 } else {
11fdf7f2 12763 goto update;
7c673cae 12764 }
c07f9fc5
FG
12765 } else if (prefix == "osd force-create-pg") {
12766 pg_t pgid;
12767 string pgidstr;
11fdf7f2 12768 cmd_getval(cct, cmdmap, "pgid", pgidstr);
c07f9fc5
FG
12769 if (!pgid.parse(pgidstr.c_str())) {
12770 ss << "invalid pgid '" << pgidstr << "'";
12771 err = -EINVAL;
12772 goto reply;
12773 }
94b18763
FG
12774 if (!osdmap.pg_exists(pgid)) {
12775 ss << "pg " << pgid << " should not exist";
12776 err = -ENOENT;
12777 goto reply;
12778 }
11fdf7f2
TL
12779 bool sure = false;
12780 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", sure);
12781 if (!sure) {
12782 ss << "This command will recreate a lost (as in data lost) PG with data in it, such "
12783 << "that the cluster will give up ever trying to recover the lost data. Do this "
12784 << "only if you are certain that all copies of the PG are in fact lost and you are "
12785 << "willing to accept that the data is permanently destroyed. Pass "
12786 << "--yes-i-really-mean-it to proceed.";
12787 err = -EPERM;
12788 goto reply;
12789 }
c07f9fc5
FG
12790 bool creating_now;
12791 {
12792 std::lock_guard<std::mutex> l(creating_pgs_lock);
12793 auto emplaced = creating_pgs.pgs.emplace(pgid,
12794 make_pair(osdmap.get_epoch(),
12795 ceph_clock_now()));
12796 creating_now = emplaced.second;
12797 }
12798 if (creating_now) {
12799 ss << "pg " << pgidstr << " now creating, ok";
11fdf7f2
TL
12800 // set the pool's CREATING flag so that (1) the osd won't ignore our
12801 // create message and (2) we won't propose any future pg_num changes
12802 // until after the PG has been instantiated.
12803 if (pending_inc.new_pools.count(pgid.pool()) == 0) {
12804 pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool());
12805 }
12806 pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING;
c07f9fc5
FG
12807 err = 0;
12808 goto update;
12809 } else {
12810 ss << "pg " << pgid << " already creating";
12811 err = 0;
12812 goto reply;
12813 }
7c673cae
FG
12814 } else {
12815 err = -EINVAL;
12816 }
12817
12818 reply:
12819 getline(ss, rs);
12820 if (err < 0 && rs.length() == 0)
12821 rs = cpp_strerror(err);
12822 mon->reply_command(op, err, rs, rdata, get_last_committed());
12823 return ret;
12824
12825 update:
12826 getline(ss, rs);
12827 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12828 get_last_committed() + 1));
12829 return true;
12830
12831 wait:
12832 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12833 return true;
12834}
12835
// Verify the requesting session holds sufficient caps for this pool op.
// Returns true if the op was rejected (a reply has already been sent and
// the caller must stop processing); returns false if the caller should
// continue handling the op.
bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);

  MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
  MonSession *session = op->get_session();
  if (!session) {
    // no session: cannot evaluate caps, reject outright
    _pool_op_reply(op, -EPERM, osdmap.get_epoch());
    return true;
  }

  switch (m->op) {
  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    {
      // unmanaged (self-managed) snap ops get a finer-grained check that
      // can take the target pool's name into account; the pool may not
      // exist yet, in which case pool_name stays null.
      const std::string* pool_name = nullptr;
      const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
      if (pg_pool != nullptr) {
        pool_name = &osdmap.get_pool_name(m->pool);
      }

      if (!is_unmanaged_snap_op_permitted(cct, mon->key_server,
                                          session->entity_name, session->caps,
                                          session->get_peer_socket_addr(),
                                          pool_name)) {
        dout(0) << "got unmanaged-snap pool op from entity with insufficient "
                << "privileges. message: " << *m << std::endl
                << "caps: " << session->caps << dendl;
        _pool_op_reply(op, -EPERM, osdmap.get_epoch());
        return true;
      }
    }
    break;
  default:
    // all other pool ops simply require write caps on the osd service
    if (!session->is_capable("osd", MON_CAP_W)) {
      dout(0) << "got pool op from entity with insufficient privileges. "
              << "message: " << *m << std::endl
              << "caps: " << session->caps << dendl;
      _pool_op_reply(op, -EPERM, osdmap.get_epoch());
      return true;
    }
    break;
  }

  // caps check passed; caller continues processing the op
  return false;
}
12882
// Read-only fast path for pool ops.  Answers requests that need no map
// change (no-ops, idempotent retries, invalid requests) directly from the
// committed osdmap.  Returns true if a reply was sent (op fully handled);
// returns false to forward the op to the prepare/update path.
bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MPoolOp *m = static_cast<MPoolOp*>(op->get_req());

  // caps check; replies -EPERM itself on failure
  if (enforce_pool_op_caps(op)) {
    return true;
  }

  // drop ops addressed to a different cluster
  if (m->fsid != mon->monmap->fsid) {
    dout(0) << __func__ << " drop message on fsid " << m->fsid
            << " != " << mon->monmap->fsid << " for " << *m << dendl;
    _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
    return true;
  }

  if (m->op == POOL_OP_CREATE)
    return preprocess_pool_op_create(op);

  const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
  if (p == nullptr) {
    dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
    if (m->op == POOL_OP_DELETE) {
      // deleting a pool that is already gone: treat as success (idempotent)
      _pool_op_reply(op, 0, osdmap.get_epoch());
    } else {
      _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    }
    return true;
  }

  // check if the snap and snapname exist
  bool snap_exists = false;
  if (p->snap_exists(m->name.c_str()))
    snap_exists = true;

  // For each op: reply -EINVAL if the pool's snap mode (pool-managed vs
  // self-managed) is incompatible, reply 0 if the request is already
  // satisfied (idempotent retry), otherwise fall through (return false)
  // so prepare_pool_op can propose the map change.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (snap_exists) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_SNAP:
    if (p->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (!snap_exists) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (p->is_removed_snap(m->snapid)) {
      // snap already deleted: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE:
    if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
      // pool (still) exists under this name; must go through prepare path
      // NOTE(review): replying 0 here looks like it short-circuits a
      // pending delete as success when the name still resolves — confirm
      // against prepare_pool_op_delete semantics.
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_AUID_CHANGE:
    return false;
  default:
    ceph_abort();
    break;
  }

  return false;
}
12970
12971bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
12972{
12973 op->mark_osdmon_event(__func__);
12974 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
7c673cae
FG
12975 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
12976 if (pool >= 0) {
12977 _pool_op_reply(op, 0, osdmap.get_epoch());
12978 return true;
12979 }
12980
12981 return false;
12982}
12983
// Apply a pool op (snap create/delete, pool delete, ...) to the pending
// OSDMap increment.  Returns true if a proposal is needed (reply deferred
// until it commits), false if the op was answered immediately.
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
  dout(10) << "prepare_pool_op " << *m << dendl;
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  // Idempotency / mode checks against the *committed* map first; any op
  // rejected or already satisfied here is answered without a proposal.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (pool->is_tier()) {
      ret = -EINVAL;
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;
    }  // else, fall through
  case POOL_OP_DELETE_SNAP:
    if (!pool->is_unmanaged_snaps_mode()) {
      bool snap_exists = pool->snap_exists(m->name.c_str());
      // create of an existing snap / delete of a missing snap: already done
      if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
        || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
        ret = 0;
      } else {
        break;  // real work to do below
      }
    } else {
      ret = -EINVAL;  // pool-style snap ops forbidden in unmanaged mode
    }
    _pool_op_reply(op, ret, osdmap.get_epoch());
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // we won't allow removal of an unmanaged snapshot from a pool
    // not in unmanaged snaps mode.
    if (!pool->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
      return false;
    }
    /* fall-thru */
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // but we will allow creating an unmanaged snapshot on any pool
    // as long as it is not in 'pool' snaps mode.
    if (pool->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return false;
    }
  }

  // projected pool info: start from the pending copy if one exists so we
  // layer on top of earlier, not-yet-committed edits
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // pool snaps vs unmanaged snaps are mutually exclusive; re-check against
  // the projected pool since a pending edit may have switched modes
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // Apply the mutation to the projected pool.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name
               << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
        pp.remove_snap(s);
        // record removal so OSDs can trim the snap's objects
        pending_inc.new_removed_snaps[m->pool].insert(s);
        changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      uint64_t snapid;
      pp.add_unmanaged_snap(snapid);  // fills in the newly allocated snapid
      encode(snapid, reply_data);     // returned to the client in the reply
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (!pp.is_removed_snap(m->snapid)) {
      // a snapid beyond the pool's snap_seq was never allocated
      if (m->snapid > pp.get_snap_seq()) {
        _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
        return false;
      }
      pp.remove_unmanaged_snap(m->snapid);
      pending_inc.new_removed_snaps[m->pool].insert(m->snapid);
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    // auid support was removed; reject explicitly
    _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
    return false;

  default:
    ceph_abort();
    break;
  }

  if (changed) {
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
13131
13132bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
13133{
13134 op->mark_osdmon_event(__func__);
13135 int err = prepare_new_pool(op);
13136 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
13137 return true;
13138}
13139
13140int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
13141 ostream *ss)
13142{
13143 const string& poolstr = osdmap.get_pool_name(pool_id);
13144
13145 // If the Pool is in use by CephFS, refuse to delete it
28e407b8 13146 FSMap const &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
7c673cae
FG
13147 if (pending_fsmap.pool_in_use(pool_id)) {
13148 *ss << "pool '" << poolstr << "' is in use by CephFS";
13149 return -EBUSY;
13150 }
13151
13152 if (pool.tier_of >= 0) {
13153 *ss << "pool '" << poolstr << "' is a tier of '"
13154 << osdmap.get_pool_name(pool.tier_of) << "'";
13155 return -EBUSY;
13156 }
13157 if (!pool.tiers.empty()) {
13158 *ss << "pool '" << poolstr << "' has tiers";
13159 for(auto tier : pool.tiers) {
13160 *ss << " " << osdmap.get_pool_name(tier);
13161 }
13162 return -EBUSY;
13163 }
13164
11fdf7f2 13165 if (!g_conf()->mon_allow_pool_delete) {
7c673cae
FG
13166 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
13167 return -EPERM;
13168 }
13169
13170 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
13171 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
13172 return -EPERM;
13173 }
13174
13175 *ss << "pool '" << poolstr << "' removed";
13176 return 0;
13177}
13178
13179/**
13180 * Check if it is safe to add a tier to a base pool
13181 *
13182 * @return
13183 * True if the operation should proceed, false if we should abort here
13184 * (abort doesn't necessarily mean error, could be idempotency)
13185 */
13186bool OSDMonitor::_check_become_tier(
13187 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
13188 const int64_t base_pool_id, const pg_pool_t *base_pool,
13189 int *err,
13190 ostream *ss) const
13191{
13192 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
13193 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
13194
28e407b8 13195 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
7c673cae
FG
13196 if (pending_fsmap.pool_in_use(tier_pool_id)) {
13197 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
13198 *err = -EBUSY;
13199 return false;
13200 }
13201
13202 if (base_pool->tiers.count(tier_pool_id)) {
11fdf7f2 13203 ceph_assert(tier_pool->tier_of == base_pool_id);
7c673cae
FG
13204 *err = 0;
13205 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
13206 << base_pool_name << "'";
13207 return false;
13208 }
13209
13210 if (base_pool->is_tier()) {
13211 *ss << "pool '" << base_pool_name << "' is already a tier of '"
13212 << osdmap.get_pool_name(base_pool->tier_of) << "', "
13213 << "multiple tiers are not yet supported.";
13214 *err = -EINVAL;
13215 return false;
13216 }
13217
13218 if (tier_pool->has_tiers()) {
13219 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
13220 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
13221 it != tier_pool->tiers.end(); ++it)
13222 *ss << "'" << osdmap.get_pool_name(*it) << "',";
13223 *ss << " multiple tiers are not yet supported.";
13224 *err = -EINVAL;
13225 return false;
13226 }
13227
13228 if (tier_pool->is_tier()) {
13229 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
13230 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
13231 *err = -EINVAL;
13232 return false;
13233 }
13234
13235 *err = 0;
13236 return true;
13237}
13238
13239
13240/**
13241 * Check if it is safe to remove a tier from this base pool
13242 *
13243 * @return
13244 * True if the operation should proceed, false if we should abort here
13245 * (abort doesn't necessarily mean error, could be idempotency)
13246 */
13247bool OSDMonitor::_check_remove_tier(
13248 const int64_t base_pool_id, const pg_pool_t *base_pool,
13249 const pg_pool_t *tier_pool,
13250 int *err, ostream *ss) const
13251{
13252 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
13253
13254 // Apply CephFS-specific checks
28e407b8 13255 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
7c673cae 13256 if (pending_fsmap.pool_in_use(base_pool_id)) {
94b18763
FG
13257 if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
13258 // If the underlying pool is erasure coded and does not allow EC
13259 // overwrites, we can't permit the removal of the replicated tier that
13260 // CephFS relies on to access it
13261 *ss << "pool '" << base_pool_name <<
13262 "' does not allow EC overwrites and is in use by CephFS"
13263 " via its tier";
7c673cae
FG
13264 *err = -EBUSY;
13265 return false;
13266 }
13267
13268 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
13269 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
13270 "tier is still in use as a writeback cache. Change the cache "
13271 "mode and flush the cache before removing it";
13272 *err = -EBUSY;
13273 return false;
13274 }
13275 }
13276
13277 *err = 0;
13278 return true;
13279}
13280
// Stage removal of a pool in pending_inc.  Returns 0 on success (or if
// already pending), -EAGAIN if a pending edit makes the check unreliable
// (caller should requeue), or a negative errno from _check_remove_pool.
// If mon_fake_pool_delete is set (and !no_fake), the pool is renamed to
// "<name>.<id>.DELETED" instead of being destroyed.
int OSDMonitor::_prepare_remove_pool(
  int64_t pool, ostream *ss, bool no_fake)
{
  dout(10) << __func__ << " " << pool << dendl;
  const pg_pool_t *p = osdmap.get_pg_pool(pool);
  int r = _check_remove_pool(pool, *p, ss);
  if (r < 0)
    return r;

  auto new_pool = pending_inc.new_pools.find(pool);
  if (new_pool != pending_inc.new_pools.end()) {
    // if there is a problem with the pending info, wait and retry
    // this op.
    const auto& p = new_pool->second;
    int r = _check_remove_pool(pool, p, ss);
    if (r < 0)
      return -EAGAIN;
  }

  if (pending_inc.old_pools.count(pool)) {
    // idempotent: removal already staged in this increment
    dout(10) << __func__ << " " << pool << " already pending removal"
             << dendl;
    return 0;
  }

  if (g_conf()->mon_fake_pool_delete && !no_fake) {
    // "fake" deletion: rename so the data remains recoverable
    string old_name = osdmap.get_pool_name(pool);
    string new_name = old_name + "." + stringify(pool) + ".DELETED";
    dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
            << old_name << " -> " << new_name << dendl;
    pending_inc.new_pool_names[pool] = new_name;
    return 0;
  }

  // remove
  pending_inc.old_pools.insert(pool);

  // remove any pg_temp mappings for this pool
  for (auto p = osdmap.pg_temp->begin();
       p != osdmap.pg_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
               << p->first << dendl;
      pending_inc.new_pg_temp[p->first].clear();
    }
  }
  // remove any primary_temp mappings for this pool
  for (auto p = osdmap.primary_temp->begin();
       p != osdmap.primary_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete primary_temp" << p->first << dendl;
      pending_inc.new_primary_temp[p->first] = -1;
    }
  }
  // remove any pg_upmap mappings for this pool
  for (auto& p : osdmap.pg_upmap) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap "
               << p.first << dendl;
      pending_inc.old_pg_upmap.insert(p.first);
    }
  }
  // remove any pending pg_upmap mappings for this pool
  // (entries staged in this same increment but not yet committed)
  {
    auto it = pending_inc.new_pg_upmap.begin();
    while (it != pending_inc.new_pg_upmap.end()) {
      if (it->first.pool() == pool) {
        dout(10) << __func__ << " " << pool
                 << " removing pending pg_upmap "
                 << it->first << dendl;
        it = pending_inc.new_pg_upmap.erase(it);
      } else {
        it++;
      }
    }
  }
  // remove any pg_upmap_items mappings for this pool
  for (auto& p : osdmap.pg_upmap_items) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap_items " << p.first
               << dendl;
      pending_inc.old_pg_upmap_items.insert(p.first);
    }
  }
  // remove any pending pg_upmap mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap_items.begin();
    while (it != pending_inc.new_pg_upmap_items.end()) {
      if (it->first.pool() == pool) {
        dout(10) << __func__ << " " << pool
                 << " removing pending pg_upmap_items "
                 << it->first << dendl;
        it = pending_inc.new_pg_upmap_items.erase(it);
      } else {
        it++;
      }
    }
  }

  // remove any choose_args for this pool
  CrushWrapper newcrush;
  _get_pending_crush(newcrush);
  if (newcrush.have_choose_args(pool)) {
    dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
    newcrush.rm_choose_args(pool);
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
  }
  return 0;
}
13396
13397int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
13398{
13399 dout(10) << "_prepare_rename_pool " << pool << dendl;
13400 if (pending_inc.old_pools.count(pool)) {
13401 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
13402 return -ENOENT;
13403 }
13404 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
13405 p != pending_inc.new_pool_names.end();
13406 ++p) {
13407 if (p->second == newname && p->first != pool) {
13408 return -EEXIST;
13409 }
13410 }
13411
13412 pending_inc.new_pool_names[pool] = newname;
13413 return 0;
13414}
13415
13416bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
13417{
13418 op->mark_osdmon_event(__func__);
13419 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
13420 ostringstream ss;
13421 int ret = _prepare_remove_pool(m->pool, &ss, false);
13422 if (ret == -EAGAIN) {
13423 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13424 return true;
13425 }
13426 if (ret < 0)
13427 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
13428 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
13429 pending_inc.epoch));
13430 return true;
13431}
13432
13433void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
13434 int ret, epoch_t epoch, bufferlist *blp)
13435{
13436 op->mark_osdmon_event(__func__);
13437 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
13438 dout(20) << "_pool_op_reply " << ret << dendl;
13439 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
13440 ret, epoch, get_last_committed(), blp);
13441 mon->send_reply(op, reply);
13442}
81eedcae
TL
13443
// Rescale per-pool recovery_priority values into the supported
// [OSD_POOL_PRIORITY_MIN, OSD_POOL_PRIORITY_MAX] range, preserving each
// pool's priority relative to the most extreme value found.  No-op if all
// priorities are already in range.
void OSDMonitor::convert_pool_priorities(void)
{
  pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key;
  int64_t max_prio = 0;
  int64_t min_prio = 0;
  // First pass: find the extreme priorities across all pools.
  for (const auto &i : osdmap.get_pools()) {
    const auto &pool = i.second;

    if (pool.opts.is_set(key)) {
      int64_t prio;
      pool.opts.get(key, &prio);
      if (prio > max_prio)
        max_prio = prio;
      if (prio < min_prio)
        min_prio = prio;
    }
  }
  if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) {
    dout(20) << __func__ << " nothing to fix" << dendl;
    return;
  }
  // Current pool priorities exceeds new maximum
  // Second pass: proportionally rescale out-of-range sides.
  for (const auto &i : osdmap.get_pools()) {
    const auto pool_id = i.first;
    pg_pool_t pool = i.second;

    // prio stays 0 (and we skip below) if the option is unset
    int64_t prio = 0;
    pool.opts.get(key, &prio);
    int64_t n;

    if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario
      // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
      n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX;
    } else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) {
      // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
      n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN;
    } else {
      continue;  // in-range (or unset) priority: leave untouched
    }
    if (n == 0) {
      // scaled to nothing: drop the option entirely
      pool.opts.unset(key);
    } else {
      pool.opts.set(key, static_cast<int64_t>(n));
    }
    dout(10) << __func__ << " pool " << pool_id
             << " recovery_priority adjusted "
             << prio << " to " << n << dendl;
    pool.last_change = pending_inc.epoch;
    pending_inc.new_pools[pool_id] = pool;
  }
}