]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/OSDMonitor.cc
bump version to 12.2.12-pve1
[ceph.git] / ceph / src / mon / OSDMonitor.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19#include <algorithm>
224ce89b
WB
20#include <boost/algorithm/string.hpp>
21#include <locale>
7c673cae
FG
22#include <sstream>
23
31f18b77
FG
24#include "mon/OSDMonitor.h"
25#include "mon/Monitor.h"
26#include "mon/MDSMonitor.h"
27#include "mon/PGMonitor.h"
28#include "mon/MgrStatMonitor.h"
29#include "mon/AuthMonitor.h"
30#include "mon/ConfigKeyService.h"
7c673cae 31
31f18b77
FG
32#include "mon/MonitorDBStore.h"
33#include "mon/Session.h"
7c673cae
FG
34
35#include "crush/CrushWrapper.h"
36#include "crush/CrushTester.h"
37#include "crush/CrushTreeDumper.h"
38
39#include "messages/MOSDBeacon.h"
40#include "messages/MOSDFailure.h"
41#include "messages/MOSDMarkMeDown.h"
42#include "messages/MOSDFull.h"
43#include "messages/MOSDMap.h"
44#include "messages/MMonGetOSDMap.h"
45#include "messages/MOSDBoot.h"
46#include "messages/MOSDAlive.h"
47#include "messages/MPoolOp.h"
48#include "messages/MPoolOpReply.h"
49#include "messages/MOSDPGCreate.h"
50#include "messages/MOSDPGCreated.h"
51#include "messages/MOSDPGTemp.h"
52#include "messages/MMonCommand.h"
53#include "messages/MRemoveSnaps.h"
54#include "messages/MOSDScrub.h"
55#include "messages/MRoute.h"
56
57#include "common/TextTable.h"
58#include "common/Timer.h"
59#include "common/ceph_argparse.h"
60#include "common/perf_counters.h"
61#include "common/strtol.h"
62
63#include "common/config.h"
64#include "common/errno.h"
65
66#include "erasure-code/ErasureCodePlugin.h"
67#include "compressor/Compressor.h"
68#include "common/Checksummer.h"
69
70#include "include/compat.h"
71#include "include/assert.h"
72#include "include/stringify.h"
73#include "include/util.h"
74#include "common/cmdparse.h"
75#include "include/str_list.h"
76#include "include/str_map.h"
224ce89b 77#include "include/scope_guard.h"
7c673cae 78
28e407b8
AA
79#include "auth/cephx/CephxKeyServer.h"
80#include "osd/OSDCap.h"
81
7c673cae
FG
82#include "json_spirit/json_spirit_reader.h"
83
c07f9fc5
FG
84#include <boost/algorithm/string/predicate.hpp>
85
#define dout_subsys ceph_subsys_mon

// MonitorDBStore key prefixes owned by OSDMonitor: the persisted
// in-progress pg-creation state and the per-OSD metadata blobs.
static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
static const string OSD_METADATA_PREFIX("osd_metadata");
c07f9fc5
FG
namespace {

// Hard limits for pool "application" metadata: applications per pool,
// keys per application, and maximum length of any name/key/value.
const uint32_t MAX_POOL_APPLICATIONS = 4;
const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;

// Return true if this OSD cap grant confers write access either to all
// pools (match-all grant) or specifically to *pool_name.  A null
// pool_name means "no specific pool", so only a match-all grant counts.
bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
  // Note: this doesn't include support for the application tag match
  if ((grant.spec.allow & OSD_CAP_W) != 0) {
    auto& match = grant.match;
    if (match.is_match_all()) {
      return true;
    } else if (pool_name != nullptr && match.auid < 0 &&
               !match.pool_namespace.pool_name.empty() &&
               match.pool_namespace.pool_name == *pool_name) {
      return true;
    }
  }
  return false;
}

// Decide whether an entity may perform unmanaged-snapshot pool ops.
// Permitted when either (a) its mon caps allow the explicit
// "osd pool op unmanaged-snap" command (pool-restricted when a pool name
// is supplied; a null pool_name requires an unrestricted cap), or
// (b) its OSD caps, looked up in the auth database, grant write access
// to the pool (or to all pools).
bool is_unmanaged_snap_op_permitted(CephContext* cct,
                                    const KeyServer& key_server,
                                    const EntityName& entity_name,
                                    const MonCap& mon_caps,
                                    const std::string* pool_name)
{
  typedef std::map<std::string, std::string> CommandArgs;

  if (mon_caps.is_capable(cct, CEPH_ENTITY_TYPE_MON,
                          entity_name, "osd",
                          "osd pool op unmanaged-snap",
                          (pool_name == nullptr ?
                             CommandArgs{} /* pool DNE, require unrestricted cap */ :
                             CommandArgs{{"poolname", *pool_name}}),
                          false, true, false)) {
    return true;
  }

  // fall back to inspecting the entity's OSD caps from the auth db
  AuthCapsInfo caps_info;
  if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
                                   caps_info)) {
    dout(10) << "unable to locate OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  string caps_str;
  if (caps_info.caps.length() > 0) {
    auto p = caps_info.caps.begin();
    try {
      decode(caps_str, p);
    } catch (const buffer::error &err) {
      derr << "corrupt OSD cap data for " << entity_name << " in auth db"
           << dendl;
      return false;
    }
  }

  OSDCap osd_cap;
  if (!osd_cap.parse(caps_str, nullptr)) {
    dout(10) << "unable to parse OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  // if the entity has write permissions in one or all pools, permit
  // usage of unmanaged-snapshots
  if (osd_cap.allow_all()) {
    return true;
  }

  for (auto& grant : osd_cap.grants) {
    if (grant.profile.is_valid()) {
      // profile grants expand to a list of concrete grants; any writable
      // one suffices
      for (auto& profile_grant : grant.profile_grants) {
        if (is_osd_writable(profile_grant, pool_name)) {
          return true;
        }
      }
    } else if (is_osd_writable(grant, pool_name)) {
      return true;
    }
  }

  return false;
}

} // anonymous namespace
178
7c673cae
FG
// Record that pg <ps> of this pool reported <last_epoch_clean>.  Keeps two
// invariants up to date: 'floor' (the minimum lec across all pgs that have
// reported; 0 entries mean "not yet reported") and 'next_missing' (the
// first pg index that has not reported yet).
void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean)
{
  if (epoch_by_pg.size() <= ps) {
    // grow lazily; new slots are 0 == "no report yet"
    epoch_by_pg.resize(ps + 1, 0);
  }
  const auto old_lec = epoch_by_pg[ps];
  if (old_lec >= last_epoch_clean) {
    // stale lec
    return;
  }
  epoch_by_pg[ps] = last_epoch_clean;
  if (last_epoch_clean < floor) {
    floor = last_epoch_clean;
  } else if (last_epoch_clean > floor) {
    if (old_lec == floor) {
      // probably should increase floor?
      // this pg may have been the one holding the floor down; recompute
      auto new_floor = std::min_element(std::begin(epoch_by_pg),
					std::end(epoch_by_pg));
      floor = *new_floor;
    }
  }
  if (ps != next_missing) {
    return;
  }
  // advance next_missing past every consecutive pg that has now reported
  for (; next_missing < epoch_by_pg.size(); next_missing++) {
    if (epoch_by_pg[next_missing] == 0) {
      break;
    }
  }
}
209
210void LastEpochClean::remove_pool(uint64_t pool)
211{
212 report_by_pool.erase(pool);
213}
214
215void LastEpochClean::report(const pg_t& pg, epoch_t last_epoch_clean)
216{
217 auto& lec = report_by_pool[pg.pool()];
218 return lec.report(pg.ps(), last_epoch_clean);
219}
220
221epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
222{
223 auto floor = latest.get_epoch();
224 for (auto& pool : latest.get_pools()) {
225 auto reported = report_by_pool.find(pool.first);
226 if (reported == report_by_pool.end()) {
227 return 0;
228 }
229 if (reported->second.next_missing < pool.second.get_pg_num()) {
230 return 0;
231 }
232 if (reported->second.floor < floor) {
233 floor = reported->second.floor;
234 }
235 }
236 return floor;
237}
238
239
// Completion fired when a background osdmap->pg mapping job finishes:
// logs how long the mapping took, refreshes the creating-pgs set, and
// wakes any pg-create subscribers.  'start' is also read by callers
// (see start_mapping) for logging.
struct C_UpdateCreatingPGs : public Context {
  OSDMonitor *osdmon;
  utime_t start;   // time the mapping job was kicked off
  epoch_t epoch;   // osdmap epoch the job was computed against
  C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
    osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
  void finish(int r) override {
    if (r >= 0) {
      // r < 0 means the job was aborted; do nothing in that case
      utime_t end = ceph_clock_now();
      dout(10) << "osdmap epoch " << epoch << " mapping took "
	       << (end - start) << " seconds" << dendl;
      osdmon->update_creating_pgs();
      osdmon->check_pg_creates_subs();
    }
  }
};
256
#undef dout_prefix
#define dout_prefix _prefix(_dout, mon, osdmap)
// Debug-log prefix for this service:
// "mon.<name>@<rank>(<state>).osd e<epoch> ".
static ostream& _prefix(std::ostream *_dout, Monitor *mon, const OSDMap& osdmap) {
  return *_dout << "mon." << mon->name << "@" << mon->rank
		<< "(" << mon->get_state_name()
		<< ").osd e" << osdmap.get_epoch() << " ";
}
264
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor *mn,
  Paxos *p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf->mon_osd_cache_size),   // LRU cache of incremental maps
   full_osd_cache(g_conf->mon_osd_cache_size),  // LRU cache of full maps
   mapper(mn->cct, &mn->cpu_tp),                // parallel pg->osd mapping helper
   op_tracker(cct, true, 1)
{}
277
278bool OSDMonitor::_have_pending_crush()
279{
280 return pending_inc.crush.length() > 0;
281}
282
283CrushWrapper &OSDMonitor::_get_stable_crush()
284{
285 return *osdmap.crush;
286}
287
288void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush)
289{
290 bufferlist bl;
291 if (pending_inc.crush.length())
292 bl = pending_inc.crush;
293 else
294 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
295
296 bufferlist::iterator p = bl.begin();
297 newcrush.decode(p);
298}
299
// Build the epoch-1 osdmap for a brand-new cluster and stage it into the
// pending incremental as a full map.  Uses an "mkfs" osdmap from the store
// if one was supplied at mkfs time, otherwise builds a trivial empty map.
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon->monmap->fsid << dendl;

  OSDMap newmap;

  bufferlist bl;
  mon->store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    newmap.decode(bl);
    newmap.set_fsid(mon->monmap->fsid);
  } else {
    newmap.build_simple(g_ceph_context, 0, mon->monmap->fsid, 0);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  // new cluster should require latest by default
  if (g_conf->mon_debug_no_require_luminous) {
    newmap.require_osd_release = CEPH_RELEASE_KRAKEN;
    derr << __func__ << " mon_debug_no_require_luminous=true" << dendl;
  } else {
    newmap.require_osd_release = CEPH_RELEASE_LUMINOUS;
    newmap.flags |=
      CEPH_OSDMAP_RECOVERY_DELETES |
      CEPH_OSDMAP_PURGED_SNAPDIRS;
    // ratios may be configured as percentages (>1); normalize into [0,1]
    newmap.full_ratio = g_conf->mon_osd_full_ratio;
    if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
    newmap.backfillfull_ratio = g_conf->mon_osd_backfillfull_ratio;
    if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
    newmap.nearfull_ratio = g_conf->mon_osd_nearfull_ratio;
    if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;
    int r = ceph_release_from_name(
      g_conf->mon_osd_initial_require_min_compat_client.c_str());
    if (r <= 0) {
      assert(0 == "mon_osd_initial_require_min_compat_client is not valid");
    }
    newmap.require_min_compat_client = r;
  }

  // encode into pending incremental
  uint64_t features = newmap.get_encoding_features();
  newmap.encode(pending_inc.fullmap,
		features | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
351
352void OSDMonitor::get_store_prefixes(std::set<string>& s)
353{
354 s.insert(service_name);
355 s.insert(OSD_PG_CREATING_PREFIX);
3efd9988 356 s.insert(OSD_METADATA_PREFIX);
7c673cae
FG
357}
358
// Bring the in-memory osdmap up to the latest committed paxos version:
// cancel any stale mapping job, locate/load the newest stashed full map,
// then replay committed incrementals one epoch at a time, persisting full
// maps and keeping ancillary state (creating_pgs, down_pending_out,
// pgservice selection, msgr feature bits, subscriptions) in sync.
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
	   << ", my e " << osdmap.epoch << dendl;

  // any mapping job computed against the old epoch is now useless
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
	      << mapping_job.get() << " did not complete, "
	      << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  load_health();

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'. This is only done when we are building the
   * full maps from the incremental versions. But don't panic! We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
	     << " [" << fc << ", " << lc << "]" << dendl;

    // scan newest-to-oldest for the first stashed full map
    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon->store->exists(get_service_name(), full_key)) {
	dout(10) << __func__ << " found latest full map v " << v << dendl;
	latest_full = v;
	break;
      }
    }

    assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon->store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
	     << latest_full << dendl;
  }

  // fast-forward by decoding the newest full map when it is ahead of us
  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap.decode(latest_bl);
  }

  // load persisted creating_pgs state (luminous+ clusters only)
  if (mon->monmap->get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    bufferlist bl;
    if (!mon->store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
      auto p = bl.begin();
      std::lock_guard<std::mutex> l(creating_pgs_lock);
      creating_pgs.decode(p);
      dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
	      << creating_pgs.last_scan_epoch
	      << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
    } else {
      dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
	      << dendl;
    }
  }

  // make sure we're using the right pg service.. remove me post-luminous!
  if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
    dout(10) << __func__ << " pgservice is mgrstat" << dendl;
    mon->pgservice = mon->mgrstatmon()->get_pg_stat_service();
  } else {
    dout(10) << __func__ << " pgservice is pg" << dendl;
    mon->pgservice = mon->pgmon()->get_pg_stat_service();
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    assert(err == 0);
    assert(inc_bl.length());

    dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
	    << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs. Encode the full
    // map with the same features as the incremental. If we don't
    // know, use the quorum features. If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon->get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
	// This will happen if the mons were running mixed versions in
	// the past or some other circumstance made the full encoded
	// maps divergent. Reloading here will bring us back into
	// sync with the primary for this and all future maps. OSDs
	// will also be brought back into sync when they discover the
	// crc mismatch and request a full map from a mon.
	derr << __func__ << " full map CRC mismatch, resetting to canonical"
	     << dendl;
	osdmap = OSDMap();
	osdmap.decode(orig_full_bl);
      }
    } else {
      assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    // the mkfs seed map is no longer needed once epoch 1 is applied
    if (osdmap.epoch == 1) {
      t->erase("mkfs", "osdmap");
    }

    // make sure we're using the right pg service.. remove me post-luminous!
    if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
      dout(10) << __func__ << " pgservice is mgrstat" << dendl;
      mon->pgservice = mon->mgrstatmon()->get_pg_stat_service();
    } else {
      dout(10) << __func__ << " pgservice is pg" << dendl;
      mon->pgservice = mon->pgmon()->get_pg_stat_service();
    }

    // flush the accumulated transaction before it grows too large
    if (tx_size > g_conf->mon_sync_max_payload_size*2) {
      mon->store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    if (mon->monmap->get_required_features().contains_all(
	  ceph::features::mon::FEATURE_LUMINOUS)) {
      for (const auto &osd_state : inc.new_state) {
	if (osd_state.second & CEPH_OSD_UP) {
	  // could be marked up *or* down, but we're too lazy to check which
	  last_osd_report.erase(osd_state.first);
	}
	if (osd_state.second & CEPH_OSD_EXISTS) {
	  // could be created *or* destroyed, but we can safely drop it
	  osd_epochs.erase(osd_state.first);
	}
      }
    }
  }

  if (t) {
    mon->store->apply_transaction(t);
  }

  // reconcile the down->out countdown map with the new osd states
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
	dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
	down_pending_out[o] = ceph_clock_now();
      }
    } else {
      if (found != down_pending_out.end()) {
	dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
	down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  if (mon->is_leader()) {
    // kick pgmon, make sure it's seen the latest map
    mon->pgmon()->check_osd_map(osdmap.epoch);
  }

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();

  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon->is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
}
592
593void OSDMonitor::start_mapping()
594{
595 // initiate mapping job
596 if (mapping_job) {
597 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
598 << dendl;
599 mapping_job->abort();
600 }
224ce89b
WB
601 if (!osdmap.get_pools().empty()) {
602 auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
603 mapping_job = mapping.start_update(osdmap, mapper,
604 g_conf->mon_osd_mapping_pgs_per_chunk);
605 dout(10) << __func__ << " started mapping job " << mapping_job.get()
606 << " at " << fin->start << dendl;
607 mapping_job->set_finish_event(fin);
608 } else {
609 dout(10) << __func__ << " no pools, no mapping job" << dendl;
610 mapping_job = nullptr;
611 }
7c673cae
FG
612}
613
614void OSDMonitor::update_msgr_features()
615{
616 set<int> types;
617 types.insert((int)entity_name_t::TYPE_OSD);
618 types.insert((int)entity_name_t::TYPE_CLIENT);
619 types.insert((int)entity_name_t::TYPE_MDS);
620 types.insert((int)entity_name_t::TYPE_MON);
621 for (set<int>::iterator q = types.begin(); q != types.end(); ++q) {
622 uint64_t mask;
623 uint64_t features = osdmap.get_features(*q, &mask);
624 if ((mon->messenger->get_policy(*q).features_required & mask) != features) {
625 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
626 Messenger::Policy p = mon->messenger->get_policy(*q);
627 p.features_required = (p.features_required & ~mask) | features;
628 mon->messenger->set_policy(*q, p);
629 }
630 }
631}
632
633void OSDMonitor::on_active()
634{
635 update_logger();
636
637 if (mon->is_leader()) {
224ce89b 638 mon->clog->debug() << "osdmap " << osdmap;
7c673cae
FG
639 } else {
640 list<MonOpRequestRef> ls;
641 take_all_failures(ls);
642 while (!ls.empty()) {
643 MonOpRequestRef op = ls.front();
644 op->mark_osdmon_event(__func__);
645 dispatch(op);
646 ls.pop_front();
647 }
648 }
649 start_mapping();
650}
651
// On restart/election, drop stale per-OSD report timestamps; they will be
// repopulated as OSDs check in again.
void OSDMonitor::on_restart()
{
  last_osd_report.clear();
}
656
657void OSDMonitor::on_shutdown()
658{
659 dout(10) << __func__ << dendl;
660 if (mapping_job) {
661 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
662 << dendl;
663 mapping_job->abort();
664 }
665
666 // discard failure info, waiters
667 list<MonOpRequestRef> ls;
668 take_all_failures(ls);
669 ls.clear();
670}
671
672void OSDMonitor::update_logger()
673{
674 dout(10) << "update_logger" << dendl;
675
676 mon->cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
677 mon->cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
678 mon->cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
679 mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
680}
681
7c673cae
FG
// Start a fresh pending incremental for epoch+1: clean stale pg/primary
// temps, seed full/nearfull/backfillfull ratios (migrating them from the
// PGMap on pre-luminous upgrades), and rewrite legacy CRUSH "ruleset" IDs
// into positional rule ids.
void OSDMonitor::create_pending()
{
  pending_inc = OSDMap::Incremental(osdmap.epoch+1);
  pending_inc.fsid = mon->monmap->fsid;

  dout(10) << "create_pending e " << pending_inc.epoch << dendl;

  // clean up pg_temp, primary_temp
  OSDMap::clean_temps(g_ceph_context, osdmap, &pending_inc);
  dout(10) << "create_pending did clean_temps" << dendl;

  // On upgrade OSDMap has new field set by mon_osd_backfillfull_ratio config
  // instead of osd_backfill_full_ratio config
  if (osdmap.backfillfull_ratio <= 0) {
    pending_inc.new_backfillfull_ratio = g_conf->mon_osd_backfillfull_ratio;
    if (pending_inc.new_backfillfull_ratio > 1.0)
      pending_inc.new_backfillfull_ratio /= 100;
    dout(1) << __func__ << " setting backfillfull_ratio = "
	    << pending_inc.new_backfillfull_ratio << dendl;
  }
  if (osdmap.get_epoch() > 0 &&
      osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
    // transition full ratios from PGMap to OSDMap (on upgrade)
    float full_ratio = mon->pgservice->get_full_ratio();
    float nearfull_ratio = mon->pgservice->get_nearfull_ratio();
    if (osdmap.full_ratio != full_ratio) {
      dout(10) << __func__ << " full_ratio " << osdmap.full_ratio
	       << " -> " << full_ratio << " (from pgmap)" << dendl;
      pending_inc.new_full_ratio = full_ratio;
    }
    if (osdmap.nearfull_ratio != nearfull_ratio) {
      dout(10) << __func__ << " nearfull_ratio " << osdmap.nearfull_ratio
	       << " -> " << nearfull_ratio << " (from pgmap)" << dendl;
      pending_inc.new_nearfull_ratio = nearfull_ratio;
    }
  } else {
    // safety check (this shouldn't really happen)
    if (osdmap.full_ratio <= 0) {
      pending_inc.new_full_ratio = g_conf->mon_osd_full_ratio;
      if (pending_inc.new_full_ratio > 1.0)
	pending_inc.new_full_ratio /= 100;
      dout(1) << __func__ << " setting full_ratio = "
	      << pending_inc.new_full_ratio << dendl;
    }
    if (osdmap.nearfull_ratio <= 0) {
      pending_inc.new_nearfull_ratio = g_conf->mon_osd_nearfull_ratio;
      if (pending_inc.new_nearfull_ratio > 1.0)
	pending_inc.new_nearfull_ratio /= 100;
      dout(1) << __func__ << " setting nearfull_ratio = "
	      << pending_inc.new_nearfull_ratio << dendl;
    }
  }

  // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
  // structure.
  if (osdmap.crush->has_legacy_rule_ids()) {
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    // First, for all pools, work out which rule they really used
    // by resolving ruleset to rule.
    for (const auto &i : osdmap.get_pools()) {
      const auto pool_id = i.first;
      const auto &pool = i.second;
      int new_rule_id = newcrush.find_rule(pool.crush_rule,
					   pool.type, pool.size);

      dout(1) << __func__ << " rewriting pool "
	      << osdmap.get_pool_name(pool_id) << " crush ruleset "
	      << pool.crush_rule << " -> rule id " << new_rule_id << dendl;
      if (pending_inc.new_pools.count(pool_id) == 0) {
	pending_inc.new_pools[pool_id] = pool;
      }
      pending_inc.new_pools[pool_id].crush_rule = new_rule_id;
    }

    // Now, go ahead and renumber all the rules so that their
    // rule_id field corresponds to their position in the array
    auto old_to_new = newcrush.renumber_rules();
    dout(1) << __func__ << " Rewrote " << old_to_new << " crush IDs:" << dendl;
    for (const auto &i : old_to_new) {
      dout(1) << __func__ << " " << i.first << " -> " << i.second << dendl;
    }
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
  }
}
769
// Compute the next creating_pgs state for the incremental 'inc' that is
// about to commit, against the resulting map 'nextmap': pick up pgs from
// new pools, drop pools that were deleted, trim pgs already reported
// created, filter pgs that no longer exist, and move queued pg ranges into
// the active creating set (bounded by mon_osd_max_creating_pgs).
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
			       const OSDMap& nextmap)
{
  dout(10) << __func__ << dendl;
  creating_pgs_t pending_creatings;
  {
    // work on a private copy; creating_pgs is shared with other threads
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    // pre-luminous: pgmap may still know about creating pgs we don't
    if (osdmap.get_epoch() &&
	osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
      auto added =
	mon->pgservice->maybe_add_creating_pgs(creating_pgs.last_scan_epoch,
					       osdmap.get_pools(),
					       &pending_creatings);
      dout(7) << __func__ << " " << added << " pgs added from pgmap" << dendl;
    }
    unsigned queued = 0;
    queued += scan_for_creating_pgs(osdmap.get_pools(),
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    queued += scan_for_creating_pgs(inc.new_pools,
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
	       << " pg removed because containing pool deleted: "
	       << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
	     << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // filter out any pgs that shouldn't exist.
  {
    auto i = pending_creatings.pgs.begin();
    while (i != pending_creatings.pgs.end()) {
      if (!nextmap.pg_exists(i->first)) {
	dout(10) << __func__ << " removing pg " << i->first
		 << " which should not exist" << dendl;
	i = pending_creatings.pgs.erase(i);
      } else {
	++i;
      }
    }
  }

  // process queue: promote queued pg ranges into the active creating set,
  // but never track more than 'max' creating pgs at once
  unsigned max = MAX(1, g_conf->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  while (pending_creatings.pgs.size() < max &&
	 !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
	     << " created " << p->second.created
	     << " modified " << p->second.modified
	     << " [" << p->second.start << "-" << p->second.end << ")"
	     << dendl;
    int n = MIN(max - pending_creatings.pgs.size(),
		p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(pgid, make_pair(inc.epoch,
						    p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
	       << " now [" << p->second.start << "-" << p->second.end << ")"
	       << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
	   << " pools" << dendl;
  dout(10) << __func__
	   << " " << (pending_creatings.pgs.size() - total)
	   << "/" << pending_creatings.pgs.size()
	   << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
879
// Pre-populate pg_temp entries in the pending incremental for pgs whose
// mapping will change with it, so clients/OSDs don't thrash while the new
// map propagates.  Primes everything ("all") on crush changes, newly-up
// osds, or weight increases; otherwise only the pgs touching the affected
// osds, and only within a configured time budget.
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  if (pending_inc.crush.length()) {
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    if ((p->second & CEPH_OSD_UP) &&
	osdmap.is_up(p->first)) {
      // an up osd whose UP bit is toggling is going down
      osds.insert(p->first);
    }
  }
  for (map<int32_t,uint32_t>::iterator p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
	       << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;

  if (!all) {
    // estimate the number of affected pgs; if it's a large fraction of the
    // total, priming everything is cheaper than per-osd work
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
	g_conf->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds >= "
	       << g_conf->mon_osd_prime_pg_temp_max_estimate << " of total "
	       << mapping.get_num_pgs() << " pgs, all"
	       << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds" << dendl;
    }
  }

  // the map as it will look after the pending incremental commits
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    // prime every pg via the parallel mapper, bounded by a wall-clock limit
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf->mon_osd_mapping_pgs_per_chunk);
    if (job.wait_for(g_conf->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
	       << g_conf->mon_osd_prime_pg_temp_max_time
	       << ", stopping" << dendl;
      job.abort();
    }
  } else {
    // prime only pgs touching the interesting osds, checking the clock
    // every 'chunk' pgs so we respect the same time budget
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf->mon_osd_prime_pg_temp_max_time;
    const int chunk = 1000;
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
	if (!did_pgs.insert(pgid).second) {
	  continue;
	}
	prime_pg_temp(next, pgid);
	if (--n <= 0) {
	  n = chunk;
	  if (ceph_clock_now() > stop) {
	    dout(10) << __func__ << " consumed more than "
		     << g_conf->mon_osd_prime_pg_temp_max_time
		     << " seconds, stopping"
		     << dendl;
	    return;
	  }
	}
      }
    }
  }
}
982
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  // Pre-populate pending_inc.new_pg_temp for a single pg so the current
  // acting set can keep serving the pg while the new mapping (computed
  // against `next`, i.e. osdmap + pending_inc) peers.  NOTE(review): the
  // prime_pg_temp_lock at the bottom suggests this may be invoked
  // concurrently from mapper worker threads — confirm against callers.
  if (mon->monmap->get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    // TODO: remove this creating_pgs direct access?
    if (creating_pgs.pgs.count(pgid)) {
      // pg is still being created; nothing to prime yet
      return;
    }
  } else {
    // pre-luminous: ask the pg service instead of our local set
    if (mon->pgservice->is_creating_pg(pgid)) {
      return;
    }
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  // current up/acting per the committed osdmap's mapping job
  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  // up/acting under the pending (next) osdmap
  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
			    &next_acting, &next_acting_primary);
  if (acting == next_acting &&
      !(up != acting && next_up == next_acting))
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  if (next_up == next_acting) {
    // priming an empty vector requests removal of any existing pg_temp
    acting.clear();
    dout(20) << __func__ << "next_up === next_acting now, clear pg_temp"
	     << dendl;
  }

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
	   << " -> " << next_up << "/" << next_acting
	   << ", priming " << acting
	   << dendl;
  {
    Mutex::Locker l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    // (emplace leaves an existing entry for this pgid untouched)
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
1037
/**
 * Finalize pending_inc and serialize the next osdmap epoch into the
 * given transaction: full map, incremental, osd metadata updates,
 * pending pg creations and the health check snapshot.
 *
 * @note receiving a transaction in this function gives a fair amount of
 * freedom to the service implementation if it does need it. It shouldn't.
 */
void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
{
  dout(10) << "encode_pending e " << pending_inc.epoch
	   << dendl;

  // finalize up pending_inc
  pending_inc.modified = ceph_clock_now();

  int r = pending_inc.propagate_snaps_to_tiers(g_ceph_context, osdmap);
  assert(r == 0);

  // prime pg_temp only when the background mapping job both finished
  // and was computed against the current epoch; otherwise skip it.
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
	      << mapping_job.get() << " did not complete, "
	      << mapping_job->shards << " left" << dendl;
      mapping_job->abort();
    } else if (mapping.get_epoch() < osdmap.get_epoch()) {
      dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
	      << mapping_job.get() << " is prior epoch "
	      << mapping.get_epoch() << dendl;
    } else {
      if (g_conf->mon_osd_prime_pg_temp) {
	maybe_prime_pg_temp();
      }
    }
  } else if (g_conf->mon_osd_prime_pg_temp) {
    dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
	    << dendl;
  }
  mapping_job.reset();

  // ensure we don't have blank new_state updates. these are interpreted as
  // CEPH_OSD_UP (and almost certainly not what we want!).
  auto p = pending_inc.new_state.begin();
  while (p != pending_inc.new_state.end()) {
    if (p->second == 0) {
      dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
      p = pending_inc.new_state.erase(p);
    } else {
      ++p;
    }
  }

  bufferlist bl;

  {
    // preview the map that pending_inc will produce so we can derive
    // per-pool full/backfillfull/nearfull flags and upgrade fixups
    OSDMap tmp;
    tmp.deepish_copy_from(osdmap);
    tmp.apply_incremental(pending_inc);

    if (tmp.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
      // remove any legacy osdmap nearfull/full flags
      {
	if (tmp.test_flag(CEPH_OSDMAP_FULL | CEPH_OSDMAP_NEARFULL)) {
	  dout(10) << __func__ << " clearing legacy osdmap nearfull/full flag"
		   << dendl;
	  remove_flag(CEPH_OSDMAP_NEARFULL);
	  remove_flag(CEPH_OSDMAP_FULL);
	}
      }
      // collect which pools are currently affected by
      // the near/backfill/full osd(s),
      // and set per-pool near/backfill/full flag instead
      set<int64_t> full_pool_ids;
      set<int64_t> backfillfull_pool_ids;
      set<int64_t> nearfull_pool_ids;
      tmp.get_full_pools(g_ceph_context,
			 &full_pool_ids,
			 &backfillfull_pool_ids,
			 &nearfull_pool_ids);
      // at least one category is empty: clear stale flags for exactly
      // those categories (each inner check re-verifies emptiness)
      if (full_pool_ids.empty() ||
	  backfillfull_pool_ids.empty() ||
	  nearfull_pool_ids.empty()) {
	// normal case - no nearfull, backfillfull or full osds
	// try cancel any improper nearfull/backfillfull/full pool
	// flags first
	for (auto &pool: tmp.get_pools()) {
	  auto p = pool.first;
	  if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
	      nearfull_pool_ids.empty()) {
	    dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
		     << "'s nearfull flag" << dendl;
	    if (pending_inc.new_pools.count(p) == 0) {
	      // load original pool info first!
	      pending_inc.new_pools[p] = pool.second;
	    }
	    pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
	  }
	  if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
	      backfillfull_pool_ids.empty()) {
	    dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
		     << "'s backfillfull flag" << dendl;
	    if (pending_inc.new_pools.count(p) == 0) {
	      pending_inc.new_pools[p] = pool.second;
	    }
	    pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
	  }
	  if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
	      full_pool_ids.empty()) {
	    if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
	      // set by EQUOTA, skipping
	      continue;
	    }
	    dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
		     << "'s full flag" << dendl;
	    if (pending_inc.new_pools.count(p) == 0) {
	      pending_inc.new_pools[p] = pool.second;
	    }
	    pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
	  }
	}
      }
      if (!full_pool_ids.empty()) {
	dout(10) << __func__ << " marking pool(s) " << full_pool_ids
		 << " as full" << dendl;
	for (auto &p: full_pool_ids) {
	  if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
	    continue;
	  }
	  if (pending_inc.new_pools.count(p) == 0) {
	    pending_inc.new_pools[p] = tmp.pools[p];
	  }
	  // FULL supersedes the weaker flags
	  pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
	  pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
	  pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
	}
	// cancel FLAG_FULL for pools which are no longer full too
	for (auto &pool: tmp.get_pools()) {
	  auto p = pool.first;
	  if (full_pool_ids.count(p)) {
	    // skip pools we have just marked as full above
	    continue;
	  }
	  if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
	      tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
	    // don't touch if currently is not full
	    // or is running out of quota (and hence considered as full)
	    continue;
	  }
	  dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
		   << "'s full flag" << dendl;
	  if (pending_inc.new_pools.count(p) == 0) {
	    pending_inc.new_pools[p] = pool.second;
	  }
	  pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
	}
      }
      if (!backfillfull_pool_ids.empty()) {
	for (auto &p: backfillfull_pool_ids) {
	  if (full_pool_ids.count(p)) {
	    // skip pools we have already considered as full above
	    continue;
	  }
	  if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
	    // make sure FLAG_FULL is truly set, so we are safe not
	    // to set a extra (redundant) FLAG_BACKFILLFULL flag
	    assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
	    continue;
	  }
	  if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
	    // don't bother if pool is already marked as backfillfull
	    continue;
	  }
	  dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
		   << "'s as backfillfull" << dendl;
	  if (pending_inc.new_pools.count(p) == 0) {
	    pending_inc.new_pools[p] = tmp.pools[p];
	  }
	  pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
	  pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
	}
	// cancel FLAG_BACKFILLFULL for pools
	// which are no longer backfillfull too
	for (auto &pool: tmp.get_pools()) {
	  auto p = pool.first;
	  if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
	    // skip pools we have just marked as backfillfull/full above
	    continue;
	  }
	  if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
	    // and don't touch if currently is not backfillfull
	    continue;
	  }
	  dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
		   << "'s backfillfull flag" << dendl;
	  if (pending_inc.new_pools.count(p) == 0) {
	    pending_inc.new_pools[p] = pool.second;
	  }
	  pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
	}
      }
      if (!nearfull_pool_ids.empty()) {
	for (auto &p: nearfull_pool_ids) {
	  if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
	    continue;
	  }
	  if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
	    // make sure FLAG_FULL is truly set, so we are safe not
	    // to set a extra (redundant) FLAG_NEARFULL flag
	    assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
	    continue;
	  }
	  if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
	    // don't bother if pool is already marked as nearfull
	    continue;
	  }
	  dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
		   << "'s as nearfull" << dendl;
	  if (pending_inc.new_pools.count(p) == 0) {
	    pending_inc.new_pools[p] = tmp.pools[p];
	  }
	  pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
	}
	// cancel FLAG_NEARFULL for pools
	// which are no longer nearfull too
	for (auto &pool: tmp.get_pools()) {
	  auto p = pool.first;
	  if (full_pool_ids.count(p) ||
	      backfillfull_pool_ids.count(p) ||
	      nearfull_pool_ids.count(p)) {
	    // skip pools we have just marked as
	    // nearfull/backfillfull/full above
	    continue;
	  }
	  if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
	    // and don't touch if currently is not nearfull
	    continue;
	  }
	  dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
		   << "'s nearfull flag" << dendl;
	  if (pending_inc.new_pools.count(p) == 0) {
	    pending_inc.new_pools[p] = pool.second;
	  }
	  pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
	}
      }

      // min_compat_client?
      if (tmp.require_min_compat_client == 0) {
	// not yet set: pin it to what the map currently requires
	auto mv = tmp.get_min_compat_client();
	dout(1) << __func__ << " setting require_min_compat_client to currently "
		<< "required " << ceph_release_name(mv) << dendl;
	mon->clog->info() << "setting require_min_compat_client to currently "
			  << "required " << ceph_release_name(mv);
	pending_inc.new_require_min_compat_client = mv;
      }

      // one-time conversions while upgrading to luminous
      if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
	// convert ec profile ruleset-* -> crush-*
	for (auto& p : tmp.erasure_code_profiles) {
	  bool changed = false;
	  map<string,string> newprofile;
	  for (auto& q : p.second) {
	    if (q.first.find("ruleset-") == 0) {
	      string key = "crush-";
	      key += q.first.substr(8);
	      newprofile[key] = q.second;
	      changed = true;
	      dout(20) << " updating ec profile " << p.first
		       << " key " << q.first << " -> " << key << dendl;
	    } else {
	      newprofile[q.first] = q.second;
	    }
	  }
	  if (changed) {
	    dout(10) << " updated ec profile " << p.first << ": "
		     << newprofile << dendl;
	    pending_inc.new_erasure_code_profiles[p.first] = newprofile;
	  }
	}

	// auto-enable pool applications upon upgrade
	// NOTE: this can be removed post-Luminous assuming upgrades need to
	// proceed through Luminous
	for (auto &pool_pair : tmp.pools) {
	  int64_t pool_id = pool_pair.first;
	  pg_pool_t pg_pool = pool_pair.second;
	  if (pg_pool.is_tier()) {
	    continue;
	  }

	  std::string pool_name = tmp.get_pool_name(pool_id);
	  uint32_t match_count = 0;

	  // CephFS
	  const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
	  if (pending_fsmap.pool_in_use(pool_id)) {
	    dout(10) << __func__ << " auto-enabling CephFS on pool '"
		     << pool_name << "'" << dendl;
	    pg_pool.application_metadata.insert(
	      {pg_pool_t::APPLICATION_NAME_CEPHFS, {}});
	    ++match_count;
	  }

	  // RBD heuristics (default OpenStack pool names from docs and
	  // ceph-ansible)
	  if (boost::algorithm::contains(pool_name, "rbd") ||
	      pool_name == "images" || pool_name == "volumes" ||
	      pool_name == "backups" || pool_name == "vms") {
	    dout(10) << __func__ << " auto-enabling RBD on pool '"
		     << pool_name << "'" << dendl;
	    pg_pool.application_metadata.insert(
	      {pg_pool_t::APPLICATION_NAME_RBD, {}});
	    ++match_count;
	  }

	  // RGW heuristics
	  if (boost::algorithm::contains(pool_name, ".rgw") ||
	      boost::algorithm::contains(pool_name, ".log") ||
	      boost::algorithm::contains(pool_name, ".intent-log") ||
	      boost::algorithm::contains(pool_name, ".usage") ||
	      boost::algorithm::contains(pool_name, ".users")) {
	    dout(10) << __func__ << " auto-enabling RGW on pool '"
		     << pool_name << "'" << dendl;
	    pg_pool.application_metadata.insert(
	      {pg_pool_t::APPLICATION_NAME_RGW, {}});
	    ++match_count;
	  }

	  // OpenStack gnocchi (from ceph-ansible)
	  if (pool_name == "metrics" && match_count == 0) {
	    dout(10) << __func__ << " auto-enabling OpenStack Gnocchi on pool '"
		     << pool_name << "'" << dendl;
	    pg_pool.application_metadata.insert({"openstack_gnocchi", {}});
	    ++match_count;
	  }

	  // only auto-enable on an unambiguous (single) match
	  if (match_count == 1) {
	    pg_pool.last_change = pending_inc.epoch;
	    pending_inc.new_pools[pool_id] = pg_pool;
	  } else if (match_count > 1) {
	    auto pstat = mon->pgservice->get_pool_stat(pool_id);
	    if (pstat != nullptr && pstat->stats.sum.num_objects > 0) {
	      mon->clog->info() << "unable to auto-enable application for pool "
				<< "'" << pool_name << "'";
	    }
	  }
	}
      }
    }
  }

  // tell me about it
  for (auto i = pending_inc.new_state.begin();
       i != pending_inc.new_state.end();
       ++i) {
    // new_state XORs into the existing state; 0 means CEPH_OSD_UP
    int s = i->second ? i->second : CEPH_OSD_UP;
    if (s & CEPH_OSD_UP)
      dout(2) << " osd." << i->first << " DOWN" << dendl;
    if (s & CEPH_OSD_EXISTS)
      dout(2) << " osd." << i->first << " DNE" << dendl;
  }
  for (map<int32_t,entity_addr_t>::iterator i = pending_inc.new_up_client.begin();
       i != pending_inc.new_up_client.end();
       ++i) {
    //FIXME: insert cluster addresses too
    dout(2) << " osd." << i->first << " UP " << i->second << dendl;
  }
  for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
       i != pending_inc.new_weight.end();
       ++i) {
    if (i->second == CEPH_OSD_OUT) {
      dout(2) << " osd." << i->first << " OUT" << dendl;
    } else if (i->second == CEPH_OSD_IN) {
      dout(2) << " osd." << i->first << " IN" << dendl;
    } else {
      dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
    }
  }

  // clean inappropriate pg_upmap/pg_upmap_items (if any)
  osdmap.maybe_remove_pg_upmaps(cct, osdmap, &pending_inc);

  // features for osdmap and its incremental
  uint64_t features;

  // encode full map and determine its crc
  OSDMap tmp;
  {
    tmp.deepish_copy_from(osdmap);
    tmp.apply_incremental(pending_inc);

    // determine appropriate features
    features = tmp.get_encoding_features();
    dout(10) << __func__ << " encoding full map with "
	     << ceph_release_name(tmp.require_osd_release)
	     << " features " << features << dendl;

    // the features should be a subset of the mon quorum's features!
    assert((features & ~mon->get_quorum_con_features()) == 0);

    bufferlist fullbl;
    ::encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
    pending_inc.full_crc = tmp.get_crc();

    // include full map in the txn. note that old monitors will
    // overwrite this. new ones will now skip the local full map
    // encode and reload from this.
    put_version_full(t, pending_inc.epoch, fullbl);
  }

  // encode
  assert(get_last_committed() + 1 == pending_inc.epoch);
  ::encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);

  dout(20) << " full_crc " << tmp.get_crc()
	   << " inc_crc " << pending_inc.inc_crc << dendl;

  /* put everything in the transaction */
  put_version(t, pending_inc.epoch, bl);
  put_last_committed(t, pending_inc.epoch);

  // metadata, too!
  for (map<int,bufferlist>::iterator p = pending_metadata.begin();
       p != pending_metadata.end();
       ++p)
    t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
  for (set<int>::iterator p = pending_metadata_rm.begin();
       p != pending_metadata_rm.end();
       ++p)
    t->erase(OSD_METADATA_PREFIX, stringify(*p));
  pending_metadata.clear();
  pending_metadata_rm.clear();

  // and pg creating, also!
  if (mon->monmap->get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    auto pending_creatings = update_pending_pgs(pending_inc, tmp);
    if (osdmap.get_epoch() &&
	osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
      dout(7) << __func__ << " in the middle of upgrading, "
	      << " trimming pending creating_pgs using pgmap" << dendl;
      mon->pgservice->maybe_trim_creating_pgs(&pending_creatings);
    }
    bufferlist creatings_bl;
    ::encode(pending_creatings, creatings_bl);
    t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
  }

  // health
  health_check_map_t next;
  tmp.check_health(&next);
  encode_health(next, t);
}
1487
1488void OSDMonitor::trim_creating_pgs(creating_pgs_t* creating_pgs,
31f18b77 1489 const ceph::unordered_map<pg_t,pg_stat_t>& pg_stat)
7c673cae
FG
1490{
1491 auto p = creating_pgs->pgs.begin();
1492 while (p != creating_pgs->pgs.end()) {
31f18b77
FG
1493 auto q = pg_stat.find(p->first);
1494 if (q != pg_stat.end() &&
7c673cae
FG
1495 !(q->second.state & PG_STATE_CREATING)) {
1496 dout(20) << __func__ << " pgmap shows " << p->first << " is created"
1497 << dendl;
1498 p = creating_pgs->pgs.erase(p);
7c673cae
FG
1499 } else {
1500 ++p;
1501 }
1502 }
1503}
1504
1505int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
1506{
1507 bufferlist bl;
1508 int r = mon->store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
1509 if (r < 0)
1510 return r;
1511 try {
1512 bufferlist::iterator p = bl.begin();
1513 ::decode(m, p);
1514 }
1515 catch (buffer::error& e) {
1516 if (err)
1517 *err << "osd." << osd << " metadata is corrupt";
1518 return -EIO;
1519 }
1520 return 0;
1521}
1522
c07f9fc5 1523void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
31f18b77 1524{
31f18b77
FG
1525 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
1526 if (osdmap.is_up(osd)) {
1527 map<string,string> meta;
1528 load_metadata(osd, meta, nullptr);
1529 auto p = meta.find(field);
1530 if (p == meta.end()) {
c07f9fc5 1531 (*out)["unknown"]++;
31f18b77 1532 } else {
c07f9fc5 1533 (*out)[p->second]++;
31f18b77
FG
1534 }
1535 }
1536 }
c07f9fc5
FG
1537}
1538
1539void OSDMonitor::count_metadata(const string& field, Formatter *f)
1540{
1541 map<string,int> by_val;
1542 count_metadata(field, &by_val);
31f18b77
FG
1543 f->open_object_section(field.c_str());
1544 for (auto& p : by_val) {
1545 f->dump_int(p.first.c_str(), p.second);
1546 }
1547 f->close_section();
1548}
1549
7c673cae
FG
1550int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
1551{
1552 map<string, string> metadata;
1553 int r = load_metadata(osd, metadata, nullptr);
1554 if (r < 0)
1555 return r;
1556
1557 auto it = metadata.find("osd_objectstore");
1558 if (it == metadata.end())
1559 return -ENOENT;
1560 *type = it->second;
1561 return 0;
1562}
1563
1564bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
1565 const pg_pool_t &pool,
1566 ostream *err)
1567{
1568 // just check a few pgs for efficiency - this can't give a guarantee anyway,
1569 // since filestore osds could always join the pool later
1570 set<int> checked_osds;
1571 for (unsigned ps = 0; ps < MIN(8, pool.get_pg_num()); ++ps) {
1572 vector<int> up, acting;
1573 pg_t pgid(ps, pool_id, -1);
1574 osdmap.pg_to_up_acting_osds(pgid, up, acting);
1575 for (int osd : up) {
1576 if (checked_osds.find(osd) != checked_osds.end())
1577 continue;
1578 string objectstore_type;
1579 int r = get_osd_objectstore_type(osd, &objectstore_type);
1580 // allow with missing metadata, e.g. due to an osd never booting yet
1581 if (r < 0 || objectstore_type == "bluestore") {
1582 checked_osds.insert(osd);
1583 continue;
1584 }
1585 *err << "osd." << osd << " uses " << objectstore_type;
1586 return false;
1587 }
1588 }
1589 return true;
1590}
1591
1592int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
1593{
1594 map<string,string> m;
1595 if (int r = load_metadata(osd, m, err))
1596 return r;
1597 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
1598 f->dump_string(p->first.c_str(), p->second);
1599 return 0;
1600}
1601
1602void OSDMonitor::print_nodes(Formatter *f)
1603{
1604 // group OSDs by their hosts
1605 map<string, list<int> > osds; // hostname => osd
1606 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
1607 map<string, string> m;
1608 if (load_metadata(osd, m, NULL)) {
1609 continue;
1610 }
1611 map<string, string>::iterator hostname = m.find("hostname");
1612 if (hostname == m.end()) {
1613 // not likely though
1614 continue;
1615 }
1616 osds[hostname->second].push_back(osd);
1617 }
1618
1619 dump_services(f, osds, "osd");
1620}
1621
1622void OSDMonitor::share_map_with_random_osd()
1623{
1624 if (osdmap.get_num_up_osds() == 0) {
1625 dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
1626 return;
1627 }
1628
1629 MonSession *s = mon->session_map.get_random_osd_session(&osdmap);
1630 if (!s) {
1631 dout(10) << __func__ << " no up osd on our session map" << dendl;
1632 return;
1633 }
1634
1635 dout(10) << "committed, telling random " << s->inst << " all about it" << dendl;
28e407b8
AA
1636
1637 // get feature of the peer
1638 // use quorum_con_features, if it's an anonymous connection.
1639 uint64_t features = s->con_features ? s->con_features :
1640 mon->get_quorum_con_features();
7c673cae 1641 // whatev, they'll request more if they need it
28e407b8 1642 MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
7c673cae
FG
1643 s->con->send_message(m);
1644 // NOTE: do *not* record osd has up to this epoch (as we do
1645 // elsewhere) as they may still need to request older values.
1646}
1647
version_t OSDMonitor::get_trim_to()
{
  // Return the highest osdmap epoch the history may be trimmed up to,
  // or 0 when trimming is not currently safe.
  if (mon->get_quorum().empty()) {
    dout(10) << __func__ << ": quorum not formed" << dendl;
    return 0;
  }

  epoch_t floor;
  if (mon->monmap->get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    {
      // TODO: Get this hidden in PGStatService
      std::lock_guard<std::mutex> l(creating_pgs_lock);
      if (!creating_pgs.pgs.empty()) {
	// pgs still being created reference older maps; don't trim
	return 0;
      }
    }
    floor = get_min_last_epoch_clean();
  } else {
    // pre-luminous: defer to the pg service's view
    if (!mon->pgservice->is_readable())
      return 0;
    if (mon->pgservice->have_creating_pgs()) {
      return 0;
    }
    floor = mon->pgservice->get_min_last_epoch_clean();
  }
  {
    dout(10) << " min_last_epoch_clean " << floor << dendl;
    // explicit operator override of the trim point
    if (g_conf->mon_osd_force_trim_to > 0 &&
	g_conf->mon_osd_force_trim_to < (int)get_last_committed()) {
      floor = g_conf->mon_osd_force_trim_to;
      dout(10) << " explicit mon_osd_force_trim_to = " << floor << dendl;
    }
    // always retain at least mon_min_osdmap_epochs epochs of history
    unsigned min = g_conf->mon_min_osdmap_epochs;
    if (floor + min > get_last_committed()) {
      if (min < get_last_committed())
	floor = get_last_committed() - min;
      else
	floor = 0;
    }
    // only report a trim point if it actually advances past what is
    // already the oldest committed epoch
    if (floor > get_first_committed())
      return floor;
  }
  return 0;
}
1693
1694epoch_t OSDMonitor::get_min_last_epoch_clean() const
1695{
1696 auto floor = last_epoch_clean.get_lower_bound(osdmap);
1697 // also scan osd epochs
1698 // don't trim past the oldest reported osd epoch
1699 for (auto& osd_epoch : osd_epochs) {
1700 if (osd_epoch.second < floor) {
1701 floor = osd_epoch.second;
1702 }
1703 }
1704 return floor;
1705}
1706
1707void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
1708 version_t first)
1709{
1710 dout(10) << __func__ << " including full map for e " << first << dendl;
1711 bufferlist bl;
1712 get_version_full(first, bl);
1713 put_version_full(tx, first, bl);
1714}
1715
1716// -------------
1717
bool OSDMonitor::preprocess_query(MonOpRequestRef op)
{
  // First-phase dispatch: answer read-only requests here and return
  // true when the message is fully handled; return false to pass the
  // message on to prepare_update() for a map change.
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // READs
  case MSG_MON_COMMAND:
    try {
      return preprocess_command(op);
    }
    catch (const bad_cmd_get& e) {
      // malformed command arguments: reply EINVAL to the sender
      bufferlist bl;
      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }
  case CEPH_MSG_MON_GET_OSDMAP:
    return preprocess_get_osdmap(op);

    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return preprocess_mark_me_down(op);
  case MSG_OSD_FULL:
    return preprocess_full(op);
  case MSG_OSD_FAILURE:
    return preprocess_failure(op);
  case MSG_OSD_BOOT:
    return preprocess_boot(op);
  case MSG_OSD_ALIVE:
    return preprocess_alive(op);
  case MSG_OSD_PG_CREATED:
    return preprocess_pg_created(op);
  case MSG_OSD_PGTEMP:
    return preprocess_pgtemp(op);
  case MSG_OSD_BEACON:
    return preprocess_beacon(op);

  case CEPH_MSG_POOLOP:
    return preprocess_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return preprocess_remove_snaps(op);

  default:
    // every message type routed to this service must be listed above
    ceph_abort();
    return true;
  }
}
1767
bool OSDMonitor::prepare_update(MonOpRequestRef op)
{
  // Second-phase dispatch for messages preprocess_query() declined to
  // answer: each handler may mutate pending_inc.  Returns true if the
  // update should be proposed.
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return prepare_mark_me_down(op);
  case MSG_OSD_FULL:
    return prepare_full(op);
  case MSG_OSD_FAILURE:
    return prepare_failure(op);
  case MSG_OSD_BOOT:
    return prepare_boot(op);
  case MSG_OSD_ALIVE:
    return prepare_alive(op);
  case MSG_OSD_PG_CREATED:
    return prepare_pg_created(op);
  case MSG_OSD_PGTEMP:
    return prepare_pgtemp(op);
  case MSG_OSD_BEACON:
    return prepare_beacon(op);

  case MSG_MON_COMMAND:
    try {
      return prepare_command(op);
    }
    catch (const bad_cmd_get& e) {
      // malformed command arguments: reply EINVAL to the sender
      bufferlist bl;
      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }

  case CEPH_MSG_POOLOP:
    return prepare_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return prepare_remove_snaps(op);


  default:
    // every message type routed to this service must be listed above
    ceph_abort();
  }

  return false;
}
1816
1817bool OSDMonitor::should_propose(double& delay)
1818{
1819 dout(10) << "should_propose" << dendl;
1820
1821 // if full map, propose immediately! any subsequent changes will be clobbered.
1822 if (pending_inc.fullmap.length())
1823 return true;
1824
1825 // adjust osd weights?
1826 if (!osd_weight.empty() &&
1827 osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
1828 dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
1829 osdmap.adjust_osd_weights(osd_weight, pending_inc);
1830 delay = 0.0;
1831 osd_weight.clear();
1832 return true;
1833 }
1834
7c673cae
FG
1835 return PaxosService::should_propose(delay);
1836}
1837
1838
1839
1840// ---------------------------
1841// READs
1842
bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
{
  // Answer an explicit osdmap request with a batch of full and
  // incremental maps.  Read-only, so always fully handled here.
  op->mark_osdmon_event(__func__);
  MMonGetOSDMap *m = static_cast<MMonGetOSDMap*>(op->get_req());

  // encode for the requester's connection features when known,
  // otherwise fall back to the quorum's feature set
  uint64_t features = mon->get_quorum_con_features();
  if (m->get_session() && m->get_session()->con_features)
    features = m->get_session()->con_features;

  dout(10) << __func__ << " " << *m << dendl;
  MOSDMap *reply = new MOSDMap(mon->monmap->fsid, features);
  epoch_t first = get_first_committed();
  epoch_t last = osdmap.get_epoch();
  // bound the total number of maps per message (shared budget between
  // full and incremental maps)
  int max = g_conf->osd_map_message_max;
  for (epoch_t e = MAX(first, m->get_full_first());
       e <= MIN(last, m->get_full_last()) && max > 0;
       ++e, --max) {
    int r = get_version_full(e, features, reply->maps[e]);
    assert(r >= 0);
  }
  for (epoch_t e = MAX(first, m->get_inc_first());
       e <= MIN(last, m->get_inc_last()) && max > 0;
       ++e, --max) {
    int r = get_version(e, features, reply->incremental_maps[e]);
    assert(r >= 0);
  }
  // tell the requester what range we actually hold
  reply->oldest_map = first;
  reply->newest_map = last;
  mon->send_reply(op, reply);
  return true;
}
1874
1875
1876// ---------------------------
1877// UPDATEs
1878
1879// failure --
1880
1881bool OSDMonitor::check_source(PaxosServiceMessage *m, uuid_d fsid) {
1882 // check permissions
1883 MonSession *session = m->get_session();
1884 if (!session)
1885 return true;
1886 if (!session->is_capable("osd", MON_CAP_X)) {
1887 dout(0) << "got MOSDFailure from entity with insufficient caps "
1888 << session->caps << dendl;
1889 return true;
1890 }
1891 if (fsid != mon->monmap->fsid) {
1892 dout(0) << "check_source: on fsid " << fsid
1893 << " != " << mon->monmap->fsid << dendl;
1894 return true;
1895 }
1896 return false;
1897}
1898
1899
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  // Vet an OSD failure report.  Returns true when the report is fully
  // handled (rejected, stale or ignored) here; returning false hands
  // it to prepare_failure() to actually record the failure.
  op->mark_osdmon_event(__func__);
  MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
  // who is target_osd
  int badboy = m->get_target().name.num();

  // check permissions
  if (check_source(m, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
	osdmap.get_addr(from) != m->get_orig_source_inst().addr ||
	(osdmap.is_down(from) && m->if_osd_failed())) {
      // reporter is unknown or stale; send it newer maps so it catches up
      dout(5) << "preprocess_failure from dead osd." << from << ", ignoring" << dendl;
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?
  if (osdmap.is_down(badboy)) {
    // target already down; just make sure the reporter has a recent map
    dout(5) << "preprocess_failure dne(/dup?): " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  if (osdmap.get_inst(badboy) != m->get_target()) {
    // reported instance doesn't match the map's view of this osd
    dout(5) << "preprocess_failure wrong osd: report " << m->get_target() << " != map's " << osdmap.get_inst(badboy)
	    << ", from " << m->get_orig_source_inst() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    dout(5) << "preprocess_failure dup/old: " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  if (!can_mark_down(badboy)) {
    // presumably a mon policy/flag forbids marking this osd down —
    // see can_mark_down() for the exact conditions
    dout(5) << "preprocess_failure ignoring report of " << m->get_target() << " from " << m->get_orig_source_inst() << dendl;
    goto didit;
  }

  dout(10) << "preprocess_failure new: " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
  return false;

 didit:
  // report handled here; make sure no reply is left pending for the op
  mon->no_reply(op);
  return true;
}
1960
1961class C_AckMarkedDown : public C_MonOp {
1962 OSDMonitor *osdmon;
1963public:
1964 C_AckMarkedDown(
1965 OSDMonitor *osdmon,
1966 MonOpRequestRef op)
1967 : C_MonOp(op), osdmon(osdmon) {}
1968
1969 void _finish(int) override {
1970 MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
1971 osdmon->mon->send_reply(
1972 op,
1973 new MOSDMarkMeDown(
1974 m->fsid,
1975 m->get_target(),
1976 m->get_epoch(),
1977 false)); // ACK itself does not request an ack
1978 }
1979 ~C_AckMarkedDown() override {
1980 }
1981};
1982
1983bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
1984{
1985 op->mark_osdmon_event(__func__);
1986 MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
1987 int requesting_down = m->get_target().name.num();
1988 int from = m->get_orig_source().num();
1989
1990 // check permissions
1991 if (check_source(m, m->fsid))
1992 goto reply;
1993
1994 // first, verify the reporting host is valid
1995 if (!m->get_orig_source().is_osd())
1996 goto reply;
1997
1998 if (!osdmap.exists(from) ||
1999 osdmap.is_down(from) ||
2000 osdmap.get_addr(from) != m->get_target().addr) {
2001 dout(5) << "preprocess_mark_me_down from dead osd."
2002 << from << ", ignoring" << dendl;
2003 send_incremental(op, m->get_epoch()+1);
2004 goto reply;
2005 }
2006
2007 // no down might be set
2008 if (!can_mark_down(requesting_down))
2009 goto reply;
2010
2011 dout(10) << "MOSDMarkMeDown for: " << m->get_target() << dendl;
2012 return false;
2013
2014 reply:
2015 if (m->request_ack) {
2016 Context *c(new C_AckMarkedDown(this, op));
2017 c->complete(0);
2018 }
2019 return true;
2020}
2021
2022bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
2023{
2024 op->mark_osdmon_event(__func__);
2025 MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
2026 int target_osd = m->get_target().name.num();
2027
2028 assert(osdmap.is_up(target_osd));
2029 assert(osdmap.get_addr(target_osd) == m->get_target().addr);
2030
2031 mon->clog->info() << "osd." << target_osd << " marked itself down";
2032 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
2033 if (m->request_ack)
2034 wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
2035 return true;
2036}
2037
2038bool OSDMonitor::can_mark_down(int i)
2039{
2040 if (osdmap.test_flag(CEPH_OSDMAP_NODOWN)) {
31f18b77
FG
2041 dout(5) << __func__ << " NODOWN flag set, will not mark osd." << i
2042 << " down" << dendl;
2043 return false;
2044 }
2045
2046 if (osdmap.is_nodown(i)) {
2047 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
2048 << "will not mark it down" << dendl;
7c673cae
FG
2049 return false;
2050 }
31f18b77 2051
7c673cae
FG
2052 int num_osds = osdmap.get_num_osds();
2053 if (num_osds == 0) {
31f18b77 2054 dout(5) << __func__ << " no osds" << dendl;
7c673cae
FG
2055 return false;
2056 }
2057 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
2058 float up_ratio = (float)up / (float)num_osds;
2059 if (up_ratio < g_conf->mon_osd_min_up_ratio) {
31f18b77 2060 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
7c673cae
FG
2061 << g_conf->mon_osd_min_up_ratio
2062 << ", will not mark osd." << i << " down" << dendl;
2063 return false;
2064 }
2065 return true;
2066}
2067
2068bool OSDMonitor::can_mark_up(int i)
2069{
2070 if (osdmap.test_flag(CEPH_OSDMAP_NOUP)) {
31f18b77
FG
2071 dout(5) << __func__ << " NOUP flag set, will not mark osd." << i
2072 << " up" << dendl;
2073 return false;
2074 }
2075
2076 if (osdmap.is_noup(i)) {
2077 dout(5) << __func__ << " osd." << i << " is marked as noup, "
2078 << "will not mark it up" << dendl;
7c673cae
FG
2079 return false;
2080 }
31f18b77 2081
7c673cae
FG
2082 return true;
2083}
2084
2085/**
2086 * @note the parameter @p i apparently only exists here so we can output the
2087 * osd's id on messages.
2088 */
2089bool OSDMonitor::can_mark_out(int i)
2090{
2091 if (osdmap.test_flag(CEPH_OSDMAP_NOOUT)) {
2092 dout(5) << __func__ << " NOOUT flag set, will not mark osds out" << dendl;
2093 return false;
2094 }
31f18b77
FG
2095
2096 if (osdmap.is_noout(i)) {
2097 dout(5) << __func__ << " osd." << i << " is marked as noout, "
2098 << "will not mark it out" << dendl;
2099 return false;
2100 }
2101
7c673cae
FG
2102 int num_osds = osdmap.get_num_osds();
2103 if (num_osds == 0) {
2104 dout(5) << __func__ << " no osds" << dendl;
2105 return false;
2106 }
2107 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
2108 float in_ratio = (float)in / (float)num_osds;
2109 if (in_ratio < g_conf->mon_osd_min_in_ratio) {
2110 if (i >= 0)
2111 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
2112 << g_conf->mon_osd_min_in_ratio
2113 << ", will not mark osd." << i << " out" << dendl;
2114 else
2115 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
2116 << g_conf->mon_osd_min_in_ratio
2117 << ", will not mark osds out" << dendl;
2118 return false;
2119 }
2120
2121 return true;
2122}
2123
2124bool OSDMonitor::can_mark_in(int i)
2125{
2126 if (osdmap.test_flag(CEPH_OSDMAP_NOIN)) {
31f18b77
FG
2127 dout(5) << __func__ << " NOIN flag set, will not mark osd." << i
2128 << " in" << dendl;
2129 return false;
2130 }
2131
2132 if (osdmap.is_noin(i)) {
2133 dout(5) << __func__ << " osd." << i << " is marked as noin, "
2134 << "will not mark it in" << dendl;
7c673cae
FG
2135 return false;
2136 }
31f18b77 2137
7c673cae
FG
2138 return true;
2139}
2140
2141bool OSDMonitor::check_failures(utime_t now)
2142{
2143 bool found_failure = false;
2144 for (map<int,failure_info_t>::iterator p = failure_info.begin();
2145 p != failure_info.end();
2146 ++p) {
2147 if (can_mark_down(p->first)) {
2148 found_failure |= check_failure(now, p->first, p->second);
2149 }
2150 }
2151 return found_failure;
2152}
2153
2154bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
2155{
2156 // already pending failure?
2157 if (pending_inc.new_state.count(target_osd) &&
2158 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
2159 dout(10) << " already pending failure" << dendl;
2160 return true;
2161 }
2162
2163 set<string> reporters_by_subtree;
2164 string reporter_subtree_level = g_conf->mon_osd_reporter_subtree_level;
2165 utime_t orig_grace(g_conf->osd_heartbeat_grace, 0);
2166 utime_t max_failed_since = fi.get_failed_since();
2167 utime_t failed_for = now - max_failed_since;
2168
2169 utime_t grace = orig_grace;
2170 double my_grace = 0, peer_grace = 0;
2171 double decay_k = 0;
2172 if (g_conf->mon_osd_adjust_heartbeat_grace) {
2173 double halflife = (double)g_conf->mon_osd_laggy_halflife;
2174 decay_k = ::log(.5) / halflife;
2175
2176 // scale grace period based on historical probability of 'lagginess'
2177 // (false positive failures due to slowness).
2178 const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
2179 double decay = exp((double)failed_for * decay_k);
2180 dout(20) << " halflife " << halflife << " decay_k " << decay_k
2181 << " failed_for " << failed_for << " decay " << decay << dendl;
2182 my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
2183 grace += my_grace;
2184 }
2185
2186 // consider the peers reporting a failure a proxy for a potential
2187 // 'subcluster' over the overall cluster that is similarly
2188 // laggy. this is clearly not true in all cases, but will sometimes
2189 // help us localize the grace correction to a subset of the system
2190 // (say, a rack with a bad switch) that is unhappy.
2191 assert(fi.reporters.size());
2192 for (map<int,failure_reporter_t>::iterator p = fi.reporters.begin();
2193 p != fi.reporters.end();
2194 ++p) {
2195 // get the parent bucket whose type matches with "reporter_subtree_level".
2196 // fall back to OSD if the level doesn't exist.
2197 map<string, string> reporter_loc = osdmap.crush->get_full_location(p->first);
2198 map<string, string>::iterator iter = reporter_loc.find(reporter_subtree_level);
2199 if (iter == reporter_loc.end()) {
2200 reporters_by_subtree.insert("osd." + to_string(p->first));
2201 } else {
2202 reporters_by_subtree.insert(iter->second);
2203 }
2204 if (g_conf->mon_osd_adjust_heartbeat_grace) {
2205 const osd_xinfo_t& xi = osdmap.get_xinfo(p->first);
2206 utime_t elapsed = now - xi.down_stamp;
2207 double decay = exp((double)elapsed * decay_k);
2208 peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
2209 }
2210 }
2211
2212 if (g_conf->mon_osd_adjust_heartbeat_grace) {
2213 peer_grace /= (double)fi.reporters.size();
2214 grace += peer_grace;
2215 }
2216
2217 dout(10) << " osd." << target_osd << " has "
2218 << fi.reporters.size() << " reporters, "
2219 << grace << " grace (" << orig_grace << " + " << my_grace
2220 << " + " << peer_grace << "), max_failed_since " << max_failed_since
2221 << dendl;
2222
2223 if (failed_for >= grace &&
2224 (int)reporters_by_subtree.size() >= g_conf->mon_osd_min_down_reporters) {
2225 dout(1) << " we have enough reporters to mark osd." << target_osd
2226 << " down" << dendl;
2227 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
2228
31f18b77
FG
2229 mon->clog->info() << "osd." << target_osd << " failed ("
2230 << osdmap.crush->get_full_location_ordered_string(
2231 target_osd)
2232 << ") ("
2233 << (int)reporters_by_subtree.size()
2234 << " reporters from different "
7c673cae
FG
2235 << reporter_subtree_level << " after "
2236 << failed_for << " >= grace " << grace << ")";
2237 return true;
2238 }
2239 return false;
2240}
2241
224ce89b 2242void OSDMonitor::force_failure(int target_osd, int by)
7c673cae
FG
2243{
2244 // already pending failure?
2245 if (pending_inc.new_state.count(target_osd) &&
2246 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
2247 dout(10) << " already pending failure" << dendl;
2248 return;
2249 }
2250
2251 dout(1) << " we're forcing failure of osd." << target_osd << dendl;
2252 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
2253
31f18b77
FG
2254 mon->clog->info() << "osd." << target_osd << " failed ("
2255 << osdmap.crush->get_full_location_ordered_string(target_osd)
2256 << ") (connection refused reported by osd." << by << ")";
7c673cae
FG
2257 return;
2258}
2259
2260bool OSDMonitor::prepare_failure(MonOpRequestRef op)
2261{
2262 op->mark_osdmon_event(__func__);
2263 MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
2264 dout(1) << "prepare_failure " << m->get_target()
2265 << " from " << m->get_orig_source_inst()
2266 << " is reporting failure:" << m->if_osd_failed() << dendl;
2267
2268 int target_osd = m->get_target().name.num();
2269 int reporter = m->get_orig_source().num();
2270 assert(osdmap.is_up(target_osd));
2271 assert(osdmap.get_addr(target_osd) == m->get_target().addr);
2272
2273 if (m->if_osd_failed()) {
2274 // calculate failure time
2275 utime_t now = ceph_clock_now();
2276 utime_t failed_since =
2277 m->get_recv_stamp() - utime_t(m->failed_for, 0);
2278
2279 // add a report
2280 if (m->is_immediate()) {
2281 mon->clog->debug() << m->get_target() << " reported immediately failed by "
2282 << m->get_orig_source_inst();
224ce89b 2283 force_failure(target_osd, reporter);
94b18763 2284 mon->no_reply(op);
7c673cae
FG
2285 return true;
2286 }
2287 mon->clog->debug() << m->get_target() << " reported failed by "
2288 << m->get_orig_source_inst();
2289
2290 failure_info_t& fi = failure_info[target_osd];
2291 MonOpRequestRef old_op = fi.add_report(reporter, failed_since, op);
2292 if (old_op) {
2293 mon->no_reply(old_op);
2294 }
2295
2296 return check_failure(now, target_osd, fi);
2297 } else {
2298 // remove the report
2299 mon->clog->debug() << m->get_target() << " failure report canceled by "
2300 << m->get_orig_source_inst();
2301 if (failure_info.count(target_osd)) {
2302 failure_info_t& fi = failure_info[target_osd];
2303 MonOpRequestRef report_op = fi.cancel_report(reporter);
2304 if (report_op) {
2305 mon->no_reply(report_op);
2306 }
2307 if (fi.reporters.empty()) {
2308 dout(10) << " removing last failure_info for osd." << target_osd
2309 << dendl;
2310 failure_info.erase(target_osd);
2311 } else {
2312 dout(10) << " failure_info for osd." << target_osd << " now "
2313 << fi.reporters.size() << " reporters" << dendl;
2314 }
2315 } else {
2316 dout(10) << " no failure_info for osd." << target_osd << dendl;
2317 }
2318 mon->no_reply(op);
2319 }
2320
2321 return false;
2322}
2323
2324void OSDMonitor::process_failures()
2325{
2326 map<int,failure_info_t>::iterator p = failure_info.begin();
2327 while (p != failure_info.end()) {
2328 if (osdmap.is_up(p->first)) {
2329 ++p;
2330 } else {
2331 dout(10) << "process_failures osd." << p->first << dendl;
2332 list<MonOpRequestRef> ls;
2333 p->second.take_report_messages(ls);
2334 failure_info.erase(p++);
2335
2336 while (!ls.empty()) {
2337 MonOpRequestRef o = ls.front();
2338 if (o) {
2339 o->mark_event(__func__);
2340 MOSDFailure *m = o->get_req<MOSDFailure>();
2341 send_latest(o, m->get_epoch());
28e407b8 2342 mon->no_reply(o);
7c673cae
FG
2343 }
2344 ls.pop_front();
2345 }
2346 }
2347 }
2348}
2349
2350void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
2351{
2352 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
2353
2354 for (map<int,failure_info_t>::iterator p = failure_info.begin();
2355 p != failure_info.end();
2356 ++p) {
2357 p->second.take_report_messages(ls);
2358 }
2359 failure_info.clear();
2360}
2361
2362
2363// boot --
2364
2365bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
2366{
2367 op->mark_osdmon_event(__func__);
2368 MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
2369 int from = m->get_orig_source_inst().name.num();
2370
2371 // check permissions, ignore if failed (no response expected)
2372 MonSession *session = m->get_session();
2373 if (!session)
2374 goto ignore;
2375 if (!session->is_capable("osd", MON_CAP_X)) {
2376 dout(0) << "got preprocess_boot message from entity with insufficient caps"
2377 << session->caps << dendl;
2378 goto ignore;
2379 }
2380
2381 if (m->sb.cluster_fsid != mon->monmap->fsid) {
2382 dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
2383 << " != " << mon->monmap->fsid << dendl;
2384 goto ignore;
2385 }
2386
2387 if (m->get_orig_source_inst().addr.is_blank_ip()) {
2388 dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
2389 goto ignore;
2390 }
2391
2392 assert(m->get_orig_source_inst().name.is_osd());
2393
2394 // check if osd has required features to boot
2395 if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
2396 CEPH_FEATURE_OSD_ERASURE_CODES) &&
2397 !(m->get_connection()->get_features() & CEPH_FEATURE_OSD_ERASURE_CODES)) {
2398 dout(0) << __func__ << " osdmap requires erasure code but osd at "
2399 << m->get_orig_source_inst()
2400 << " doesn't announce support -- ignore" << dendl;
2401 goto ignore;
2402 }
2403
2404 if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
2405 CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2) &&
2406 !(m->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2)) {
2407 dout(0) << __func__ << " osdmap requires erasure code plugins v2 but osd at "
2408 << m->get_orig_source_inst()
2409 << " doesn't announce support -- ignore" << dendl;
2410 goto ignore;
2411 }
2412
2413 if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
2414 CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3) &&
2415 !(m->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3)) {
2416 dout(0) << __func__ << " osdmap requires erasure code plugins v3 but osd at "
2417 << m->get_orig_source_inst()
2418 << " doesn't announce support -- ignore" << dendl;
2419 goto ignore;
2420 }
2421
31f18b77 2422 if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
7c673cae
FG
2423 !HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) {
2424 mon->clog->info() << "disallowing boot of OSD "
2425 << m->get_orig_source_inst()
2426 << " because the osdmap requires"
2427 << " CEPH_FEATURE_SERVER_LUMINOUS"
2428 << " but the osd lacks CEPH_FEATURE_SERVER_LUMINOUS";
2429 goto ignore;
2430 }
2431
31f18b77 2432 if (osdmap.require_osd_release >= CEPH_RELEASE_JEWEL &&
7c673cae
FG
2433 !(m->osd_features & CEPH_FEATURE_SERVER_JEWEL)) {
2434 mon->clog->info() << "disallowing boot of OSD "
2435 << m->get_orig_source_inst()
2436 << " because the osdmap requires"
2437 << " CEPH_FEATURE_SERVER_JEWEL"
2438 << " but the osd lacks CEPH_FEATURE_SERVER_JEWEL";
2439 goto ignore;
2440 }
2441
31f18b77 2442 if (osdmap.require_osd_release >= CEPH_RELEASE_KRAKEN &&
7c673cae
FG
2443 !HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) {
2444 mon->clog->info() << "disallowing boot of OSD "
2445 << m->get_orig_source_inst()
2446 << " because the osdmap requires"
2447 << " CEPH_FEATURE_SERVER_KRAKEN"
2448 << " but the osd lacks CEPH_FEATURE_SERVER_KRAKEN";
2449 goto ignore;
2450 }
2451
2452 if (osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE) &&
2453 !(m->osd_features & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)) {
2454 mon->clog->info() << "disallowing boot of OSD "
2455 << m->get_orig_source_inst()
2456 << " because 'sortbitwise' osdmap flag is set and OSD lacks the OSD_BITWISE_HOBJ_SORT feature";
2457 goto ignore;
2458 }
2459
c07f9fc5
FG
2460 if (osdmap.test_flag(CEPH_OSDMAP_RECOVERY_DELETES) &&
2461 !(m->osd_features & CEPH_FEATURE_OSD_RECOVERY_DELETES)) {
2462 mon->clog->info() << "disallowing boot of OSD "
2463 << m->get_orig_source_inst()
2464 << " because 'recovery_deletes' osdmap flag is set and OSD lacks the OSD_RECOVERY_DELETES feature";
2465 goto ignore;
2466 }
2467
7c673cae
FG
2468 if (any_of(osdmap.get_pools().begin(),
2469 osdmap.get_pools().end(),
2470 [](const std::pair<int64_t,pg_pool_t>& pool)
2471 { return pool.second.use_gmt_hitset; })) {
2472 assert(osdmap.get_num_up_osds() == 0 ||
2473 osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT);
2474 if (!(m->osd_features & CEPH_FEATURE_OSD_HITSET_GMT)) {
2475 dout(0) << __func__ << " one or more pools uses GMT hitsets but osd at "
2476 << m->get_orig_source_inst()
2477 << " doesn't announce support -- ignore" << dendl;
2478 goto ignore;
2479 }
2480 }
2481
2482 // make sure upgrades stop at luminous
2483 if (HAVE_FEATURE(m->osd_features, SERVER_M) &&
31f18b77 2484 osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
7c673cae
FG
2485 mon->clog->info() << "disallowing boot of post-luminous OSD "
2486 << m->get_orig_source_inst()
31f18b77 2487 << " because require_osd_release < luminous";
7c673cae
FG
2488 goto ignore;
2489 }
2490
2491 // make sure upgrades stop at jewel
2492 if (HAVE_FEATURE(m->osd_features, SERVER_KRAKEN) &&
31f18b77 2493 osdmap.require_osd_release < CEPH_RELEASE_JEWEL) {
7c673cae
FG
2494 mon->clog->info() << "disallowing boot of post-jewel OSD "
2495 << m->get_orig_source_inst()
31f18b77 2496 << " because require_osd_release < jewel";
7c673cae
FG
2497 goto ignore;
2498 }
2499
2500 // make sure upgrades stop at hammer
2501 // * HAMMER_0_94_4 is the required hammer feature
2502 // * MON_METADATA is the first post-hammer feature
2503 if (osdmap.get_num_up_osds() > 0) {
2504 if ((m->osd_features & CEPH_FEATURE_MON_METADATA) &&
2505 !(osdmap.get_up_osd_features() & CEPH_FEATURE_HAMMER_0_94_4)) {
2506 mon->clog->info() << "disallowing boot of post-hammer OSD "
2507 << m->get_orig_source_inst()
2508 << " because one or more up OSDs is pre-hammer v0.94.4";
2509 goto ignore;
2510 }
2511 if (!(m->osd_features & CEPH_FEATURE_HAMMER_0_94_4) &&
2512 (osdmap.get_up_osd_features() & CEPH_FEATURE_MON_METADATA)) {
2513 mon->clog->info() << "disallowing boot of pre-hammer v0.94.4 OSD "
2514 << m->get_orig_source_inst()
2515 << " because all up OSDs are post-hammer";
2516 goto ignore;
2517 }
2518 }
2519
f64942e4
AA
2520 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
2521 // we are reusing a jewel feature bit that was retired in luminous.
2522 if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
2523 osdmap.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT) &&
2524 !(m->osd_features & CEPH_FEATURE_OSD_PGLOG_HARDLIMIT)) {
2525 mon->clog->info() << "disallowing boot of OSD "
2526 << m->get_orig_source_inst()
2527 << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
2528 goto ignore;
2529 }
2530
7c673cae
FG
2531 // already booted?
2532 if (osdmap.is_up(from) &&
2533 osdmap.get_inst(from) == m->get_orig_source_inst() &&
2534 osdmap.get_cluster_addr(from) == m->cluster_addr) {
2535 // yup.
2536 dout(7) << "preprocess_boot dup from " << m->get_orig_source_inst()
2537 << " == " << osdmap.get_inst(from) << dendl;
2538 _booted(op, false);
2539 return true;
2540 }
2541
2542 if (osdmap.exists(from) &&
2543 !osdmap.get_uuid(from).is_zero() &&
2544 osdmap.get_uuid(from) != m->sb.osd_fsid) {
2545 dout(7) << __func__ << " from " << m->get_orig_source_inst()
2546 << " clashes with existing osd: different fsid"
2547 << " (ours: " << osdmap.get_uuid(from)
2548 << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
2549 goto ignore;
2550 }
2551
2552 if (osdmap.exists(from) &&
2553 osdmap.get_info(from).up_from > m->version &&
2554 osdmap.get_most_recent_inst(from) == m->get_orig_source_inst()) {
2555 dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
2556 send_latest(op, m->sb.current_epoch+1);
2557 return true;
2558 }
2559
2560 // noup?
2561 if (!can_mark_up(from)) {
2562 dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
2563 send_latest(op, m->sb.current_epoch+1);
2564 return true;
2565 }
2566
2567 dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
2568 return false;
2569
2570 ignore:
2571 return true;
2572}
2573
2574bool OSDMonitor::prepare_boot(MonOpRequestRef op)
2575{
2576 op->mark_osdmon_event(__func__);
2577 MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
2578 dout(7) << __func__ << " from " << m->get_orig_source_inst() << " sb " << m->sb
2579 << " cluster_addr " << m->cluster_addr
2580 << " hb_back_addr " << m->hb_back_addr
2581 << " hb_front_addr " << m->hb_front_addr
2582 << dendl;
2583
2584 assert(m->get_orig_source().is_osd());
2585 int from = m->get_orig_source().num();
2586
2587 // does this osd exist?
2588 if (from >= osdmap.get_max_osd()) {
2589 dout(1) << "boot from osd." << from << " >= max_osd "
2590 << osdmap.get_max_osd() << dendl;
2591 return false;
2592 }
2593
2594 int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
2595 if (pending_inc.new_state.count(from))
2596 oldstate ^= pending_inc.new_state[from];
2597
2598 // already up? mark down first?
2599 if (osdmap.is_up(from)) {
2600 dout(7) << __func__ << " was up, first marking down "
2601 << osdmap.get_inst(from) << dendl;
2602 // preprocess should have caught these; if not, assert.
2603 assert(osdmap.get_inst(from) != m->get_orig_source_inst() ||
2604 osdmap.get_cluster_addr(from) != m->cluster_addr);
2605 assert(osdmap.get_uuid(from) == m->sb.osd_fsid);
2606
2607 if (pending_inc.new_state.count(from) == 0 ||
2608 (pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
2609 // mark previous guy down
2610 pending_inc.new_state[from] = CEPH_OSD_UP;
2611 }
2612 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
2613 } else if (pending_inc.new_up_client.count(from)) {
2614 // already prepared, just wait
2615 dout(7) << __func__ << " already prepared, waiting on "
2616 << m->get_orig_source_addr() << dendl;
2617 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
2618 } else {
2619 // mark new guy up.
2620 pending_inc.new_up_client[from] = m->get_orig_source_addr();
2621 if (!m->cluster_addr.is_blank_ip())
2622 pending_inc.new_up_cluster[from] = m->cluster_addr;
2623 pending_inc.new_hb_back_up[from] = m->hb_back_addr;
2624 if (!m->hb_front_addr.is_blank_ip())
2625 pending_inc.new_hb_front_up[from] = m->hb_front_addr;
2626
2627 down_pending_out.erase(from); // if any
2628
2629 if (m->sb.weight)
2630 osd_weight[from] = m->sb.weight;
2631
2632 // set uuid?
2633 dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
2634 << dendl;
2635 if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
2636 // preprocess should have caught this; if not, assert.
2637 assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
2638 pending_inc.new_uuid[from] = m->sb.osd_fsid;
2639 }
2640
2641 // fresh osd?
2642 if (m->sb.newest_map == 0 && osdmap.exists(from)) {
2643 const osd_info_t& i = osdmap.get_info(from);
2644 if (i.up_from > i.lost_at) {
2645 dout(10) << " fresh osd; marking lost_at too" << dendl;
2646 pending_inc.new_lost[from] = osdmap.get_epoch();
2647 }
2648 }
2649
2650 // metadata
2651 bufferlist osd_metadata;
2652 ::encode(m->metadata, osd_metadata);
2653 pending_metadata[from] = osd_metadata;
31f18b77 2654 pending_metadata_rm.erase(from);
7c673cae
FG
2655
2656 // adjust last clean unmount epoch?
2657 const osd_info_t& info = osdmap.get_info(from);
2658 dout(10) << " old osd_info: " << info << dendl;
2659 if (m->sb.mounted > info.last_clean_begin ||
2660 (m->sb.mounted == info.last_clean_begin &&
2661 m->sb.clean_thru > info.last_clean_end)) {
2662 epoch_t begin = m->sb.mounted;
2663 epoch_t end = m->sb.clean_thru;
2664
2665 dout(10) << __func__ << " osd." << from << " last_clean_interval "
2666 << "[" << info.last_clean_begin << "," << info.last_clean_end
2667 << ") -> [" << begin << "-" << end << ")"
2668 << dendl;
2669 pending_inc.new_last_clean_interval[from] =
2670 pair<epoch_t,epoch_t>(begin, end);
2671 }
2672
2673 osd_xinfo_t xi = osdmap.get_xinfo(from);
2674 if (m->boot_epoch == 0) {
2675 xi.laggy_probability *= (1.0 - g_conf->mon_osd_laggy_weight);
2676 xi.laggy_interval *= (1.0 - g_conf->mon_osd_laggy_weight);
2677 dout(10) << " not laggy, new xi " << xi << dendl;
2678 } else {
2679 if (xi.down_stamp.sec()) {
2680 int interval = ceph_clock_now().sec() -
2681 xi.down_stamp.sec();
2682 if (g_conf->mon_osd_laggy_max_interval &&
2683 (interval > g_conf->mon_osd_laggy_max_interval)) {
2684 interval = g_conf->mon_osd_laggy_max_interval;
2685 }
2686 xi.laggy_interval =
2687 interval * g_conf->mon_osd_laggy_weight +
2688 xi.laggy_interval * (1.0 - g_conf->mon_osd_laggy_weight);
2689 }
2690 xi.laggy_probability =
2691 g_conf->mon_osd_laggy_weight +
2692 xi.laggy_probability * (1.0 - g_conf->mon_osd_laggy_weight);
2693 dout(10) << " laggy, now xi " << xi << dendl;
2694 }
2695
2696 // set features shared by the osd
2697 if (m->osd_features)
2698 xi.features = m->osd_features;
2699 else
2700 xi.features = m->get_connection()->get_features();
2701
2702 // mark in?
2703 if ((g_conf->mon_osd_auto_mark_auto_out_in &&
2704 (oldstate & CEPH_OSD_AUTOOUT)) ||
2705 (g_conf->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
2706 (g_conf->mon_osd_auto_mark_in)) {
2707 if (can_mark_in(from)) {
2708 if (osdmap.osd_xinfo[from].old_weight > 0) {
2709 pending_inc.new_weight[from] = osdmap.osd_xinfo[from].old_weight;
2710 xi.old_weight = 0;
2711 } else {
2712 pending_inc.new_weight[from] = CEPH_OSD_IN;
2713 }
2714 } else {
2715 dout(7) << __func__ << " NOIN set, will not mark in "
2716 << m->get_orig_source_addr() << dendl;
2717 }
2718 }
2719
2720 pending_inc.new_xinfo[from] = xi;
2721
2722 // wait
2723 wait_for_finished_proposal(op, new C_Booted(this, op));
2724 }
2725 return true;
2726}
2727
2728void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
2729{
2730 op->mark_osdmon_event(__func__);
2731 MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
2732 dout(7) << "_booted " << m->get_orig_source_inst()
2733 << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
2734
2735 if (logit) {
2736 mon->clog->info() << m->get_orig_source_inst() << " boot";
2737 }
2738
2739 send_latest(op, m->sb.current_epoch+1);
2740}
2741
2742
2743// -------------
2744// full
2745
2746bool OSDMonitor::preprocess_full(MonOpRequestRef op)
2747{
2748 op->mark_osdmon_event(__func__);
2749 MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
2750 int from = m->get_orig_source().num();
2751 set<string> state;
2752 unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
2753
2754 // check permissions, ignore if failed
2755 MonSession *session = m->get_session();
2756 if (!session)
2757 goto ignore;
2758 if (!session->is_capable("osd", MON_CAP_X)) {
2759 dout(0) << "MOSDFull from entity with insufficient privileges:"
2760 << session->caps << dendl;
2761 goto ignore;
2762 }
2763
2764 // ignore a full message from the osd instance that already went down
2765 if (!osdmap.exists(from)) {
2766 dout(7) << __func__ << " ignoring full message from nonexistent "
2767 << m->get_orig_source_inst() << dendl;
2768 goto ignore;
2769 }
2770 if ((!osdmap.is_up(from) &&
2771 osdmap.get_most_recent_inst(from) == m->get_orig_source_inst()) ||
2772 (osdmap.is_up(from) &&
2773 osdmap.get_inst(from) != m->get_orig_source_inst())) {
2774 dout(7) << __func__ << " ignoring full message from down "
2775 << m->get_orig_source_inst() << dendl;
2776 goto ignore;
2777 }
2778
2779 OSDMap::calc_state_set(osdmap.get_state(from), state);
2780
2781 if ((osdmap.get_state(from) & mask) == m->state) {
2782 dout(7) << __func__ << " state already " << state << " for osd." << from
2783 << " " << m->get_orig_source_inst() << dendl;
2784 _reply_map(op, m->version);
2785 goto ignore;
2786 }
2787
2788 dout(10) << __func__ << " want state " << state << " for osd." << from
2789 << " " << m->get_orig_source_inst() << dendl;
2790 return false;
2791
2792 ignore:
2793 return true;
2794}
2795
// Record a pending fullness-state change for an OSD (leader only).
// Always returns true (a proposal is needed or already in flight); the
// requester is answered once the next map commits.
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
  const int from = m->get_orig_source().num();

  // only the fullness-related state bits may be toggled by this message
  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask;  // safety first

  // pending_inc.new_state holds XOR masks of bits to flip; fold any pending
  // flip into the committed state to get the effective current state.
  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    if (p != pending_inc.new_state.end()) {
      // discard any previously queued flips of these bits; recomputed below
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    // committed-state XOR wanted-state == exactly the bits that must flip
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " -> " << want_state_set << dendl;
  } else {
    // the desired change is already pending; just wait for it to commit
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
2833
2834// -------------
2835// alive
2836
2837bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
2838{
2839 op->mark_osdmon_event(__func__);
2840 MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
2841 int from = m->get_orig_source().num();
2842
2843 // check permissions, ignore if failed
2844 MonSession *session = m->get_session();
2845 if (!session)
2846 goto ignore;
2847 if (!session->is_capable("osd", MON_CAP_X)) {
2848 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
2849 << session->caps << dendl;
2850 goto ignore;
2851 }
2852
2853 if (!osdmap.is_up(from) ||
2854 osdmap.get_inst(from) != m->get_orig_source_inst()) {
2855 dout(7) << "preprocess_alive ignoring alive message from down " << m->get_orig_source_inst() << dendl;
2856 goto ignore;
2857 }
2858
2859 if (osdmap.get_up_thru(from) >= m->want) {
2860 // yup.
2861 dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
2862 _reply_map(op, m->version);
2863 return true;
2864 }
2865
2866 dout(10) << "preprocess_alive want up_thru " << m->want
2867 << " from " << m->get_orig_source_inst() << dendl;
2868 return false;
2869
2870 ignore:
2871 return true;
2872}
2873
2874bool OSDMonitor::prepare_alive(MonOpRequestRef op)
2875{
2876 op->mark_osdmon_event(__func__);
2877 MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
2878 int from = m->get_orig_source().num();
2879
2880 if (0) { // we probably don't care much about these
2881 mon->clog->debug() << m->get_orig_source_inst() << " alive";
2882 }
2883
2884 dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
2885 << " from " << m->get_orig_source_inst() << dendl;
2886
2887 update_up_thru(from, m->version); // set to the latest map the OSD has
2888 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
2889 return true;
2890}
2891
2892void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
2893{
2894 op->mark_osdmon_event(__func__);
2895 dout(7) << "_reply_map " << e
2896 << " from " << op->get_req()->get_orig_source_inst()
2897 << dendl;
2898 send_latest(op, e);
2899}
2900
2901// pg_created
// Read-only screening of MOSDPGCreated acks: drop unauthenticated or
// under-privileged senders, otherwise forward to the leader.
bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = static_cast<MOSDPGCreated*>(op->get_req());
  dout(10) << __func__ << " " << *m << dendl;
  auto session = m->get_session();
  // pg-created acks are one-way; never send a reply for this op
  mon->no_reply(op);
  if (!session) {
    dout(10) << __func__ << ": no monitor session!" << dendl;
    return true;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
         << "with insufficient privileges " << session->caps << dendl;
    return true;
  }
  // always forward the "created!" to the leader
  return false;
}
2921
// Leader-side handling of a pg-created ack: remember the pgid so the next
// map update can drop it from the creating set.
bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = static_cast<MOSDPGCreated*>(op->get_req());
  dout(10) << __func__ << " " << *m << dendl;
  auto src = m->get_orig_source();
  auto from = src.num();
  if (!src.is_osd() ||
      !mon->osdmon()->osdmap.is_up(from) ||
      m->get_orig_source_inst() != mon->osdmon()->osdmap.get_inst(from)) {
    dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
    // NOTE(review): returns false (no pending change) when ignoring; the op
    // was already marked no-reply in preprocess_pg_created — confirm no
    // caller expects a proposal here.
    return false;
  }
  // consumed when the next pending map is encoded
  pending_created_pgs.push_back(m->pgid);
  return true;
}
2938
2939// -------------
2940// pg_temp changes
2941
// Read-only screening of MOSDPGTemp: returns true when the message can be
// answered or dropped without a map change, false when prepare_pgtemp must
// record at least one pg_temp update.
bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
{
  MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
  dout(10) << "preprocess_pgtemp " << *m << dendl;
  mempool::osdmap::vector<int> empty;
  int from = m->get_orig_source().num();
  size_t ignore_cnt = 0;

  // check caps
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  // sender must be the registered up instance of this osd
  if (!osdmap.is_up(from) ||
      osdmap.get_inst(from) != m->get_orig_source_inst()) {
    dout(7) << "ignoring pgtemp message from down " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  // forced requests always go through prepare (skip the dedup logic below)
  if (m->forced) {
    return false;
  }

  for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    dout(20) << " " << p->first
	     << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
	     << " -> " << p->second << dendl;

    // does the pool exist?
    if (!osdmap.have_pg_pool(p->first.pool())) {
      /*
       * 1. If the osdmap does not have the pool, it means the pool has been
       *    removed in-between the osd sending this message and us handling it.
       * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
       *    not exist in the pending either, as the osds would not send a
       *    message about a pool they know nothing about (yet).
       * 3. However, if the pool does exist in the pending, then it must be a
       *    new pool, and not relevant to this message (see 1).
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
               << ": pool has been removed" << dendl;
      ignore_cnt++;
      continue;
    }

    int acting_primary = -1;
    osdmap.pg_to_up_acting_osds(
      p->first, nullptr, nullptr, nullptr, &acting_primary);
    if (acting_primary != from) {
      /* If the source isn't the primary based on the current osdmap, we know
       * that the interval changed and that we can discard this message.
       * Indeed, we must do so to avoid 16127 since we can't otherwise determine
       * which of two pg temp mappings on the same pg is more recent.
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": primary has changed" << dendl;
      ignore_cnt++;
      continue;
    }

    // removal?
    if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
			      osdmap.primary_temp->count(p->first)))
      return false;
    // change?
    // NOTE: we assume that this will clear pg_primary, so consider
    //       an existing pg_primary field to imply a change
    if (p->second.size() &&
	(osdmap.pg_temp->count(p->first) == 0 ||
	 !vectors_equal(osdmap.pg_temp->get(p->first), p->second) ||
	 osdmap.primary_temp->count(p->first)))
      return false;
  }

  // should we ignore all the pgs?
  if (ignore_cnt == m->pg_temp.size())
    goto ignore;

  // nothing to change; reply with the current map epoch
  dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
  _reply_map(op, m->map_epoch);
  return true;

 ignore:
  return true;
}
3032
3033void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
3034{
3035 epoch_t old_up_thru = osdmap.get_up_thru(from);
3036 auto ut = pending_inc.new_up_thru.find(from);
3037 if (ut != pending_inc.new_up_thru.end()) {
3038 old_up_thru = ut->second;
3039 }
3040 if (up_thru > old_up_thru) {
3041 // set up_thru too, so the osd doesn't have to ask again
3042 pending_inc.new_up_thru[from] = up_thru;
3043 }
3044}
3045
// Record the requested pg_temp mappings in the pending incremental and
// reply with the osd's map epoch once the proposal commits.
bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
  int from = m->get_orig_source().num();
  dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
  for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    uint64_t pool = p->first.pool();
    // skip pools that are queued for removal in this very proposal
    if (pending_inc.old_pools.count(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool pending removal" << dendl;
      continue;
    }
    // ... and pools already removed from the committed map
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool has been removed" << dendl;
      continue;
    }
    pending_inc.new_pg_temp[p->first] =
      mempool::osdmap::vector<int>(p->second.begin(), p->second.end());

    // unconditionally clear pg_primary (until this message can encode
    // a change for that, too.. at which point we need to also fix
    // preprocess_pg_temp)
    if (osdmap.primary_temp->count(p->first) ||
	pending_inc.new_primary_temp.count(p->first))
      pending_inc.new_primary_temp[p->first] = -1;
  }

  // set up_thru too, so the osd doesn't have to ask again
  update_up_thru(from, m->map_epoch);

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
  return true;
}
3081
3082
3083// ---
3084
// Read-only screening of MRemoveSnaps: returns true (drop) unless at least
// one listed snap still needs to be recorded as removed, in which case the
// message goes to prepare_remove_snaps on the leader.
bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
  dout(7) << "preprocess_remove_snaps " << *m << dendl;

  // check privilege, ignore if failed
  MonSession *session = m->get_session();
  // MRemoveSnaps gets no reply either way
  mon->no_reply(op);
  if (!session)
    goto ignore;
  if (!session->caps.is_capable(
	g_ceph_context,
	CEPH_ENTITY_TYPE_MON,
	session->entity_name,
        "osd", "osd pool rmsnap", {}, true, true, false)) {
    dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
       q != m->snaps.end();
       ++q) {
    if (!osdmap.have_pg_pool(q->first)) {
      dout(10) << " ignoring removed_snaps " << q->second << " on non-existent pool " << q->first << dendl;
      continue;
    }
    const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
    for (vector<snapid_t>::iterator p = q->second.begin();
	 p != q->second.end();
	 ++p) {
      // a snap newer than snap_seq or not yet in removed_snaps still needs
      // to be processed by the leader
      if (*p > pi->get_snap_seq() ||
	  !pi->removed_snaps.contains(*p))
	return false;
    }
  }

 ignore:
  return true;
}
3126
// Fold the reported removed snaps into the pending pool definitions,
// advancing snap_seq/snap_epoch as needed. Always proposes (returns true).
bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
  dout(7) << "prepare_remove_snaps " << *m << dendl;

  for (map<int, vector<snapid_t> >::iterator p = m->snaps.begin();
       p != m->snaps.end();
       ++p) {

    if (!osdmap.have_pg_pool(p->first)) {
      dout(10) << " ignoring removed_snaps " << p->second << " on non-existent pool " << p->first << dendl;
      continue;
    }

    pg_pool_t& pi = osdmap.pools[p->first];
    for (vector<snapid_t>::iterator q = p->second.begin();
	 q != p->second.end();
	 ++q) {
      // skip snaps already removed in the committed map or the pending pool
      if (!pi.removed_snaps.contains(*q) &&
	  (!pending_inc.new_pools.count(p->first) ||
	   !pending_inc.new_pools[p->first].removed_snaps.contains(*q))) {
	// copy-on-write: get (or create) the pending copy of this pool
	pg_pool_t *newpi = pending_inc.get_new_pool(p->first, &pi);
	newpi->removed_snaps.insert(*q);
	dout(10) << " pool " << p->first << " removed_snaps added " << *q
		 << " (now " << newpi->removed_snaps << ")" << dendl;
	// keep snap_seq at least as high as any removed snap id
	if (*q > newpi->get_snap_seq()) {
	  dout(10) << " pool " << p->first << " snap_seq " << newpi->get_snap_seq() << " -> " << *q << dendl;
	  newpi->set_snap_seq(*q);
	}
	newpi->set_snap_epoch(pending_inc.epoch);
      }
    }
  }
  return true;
}
3163
3164// osd beacon
3165bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
3166{
3167 op->mark_osdmon_event(__func__);
3168 auto beacon = static_cast<MOSDBeacon*>(op->get_req());
3169 // check caps
3170 auto session = beacon->get_session();
94b18763 3171 mon->no_reply(op);
7c673cae
FG
3172 if (!session) {
3173 dout(10) << __func__ << " no monitor session!" << dendl;
3174 return true;
3175 }
3176 if (!session->is_capable("osd", MON_CAP_X)) {
3177 derr << __func__ << " received from entity "
3178 << "with insufficient privileges " << session->caps << dendl;
3179 return true;
3180 }
3181 // Always forward the beacon to the leader, even if they are the same as
3182 // the old one. The leader will mark as down osds that haven't sent
3183 // beacon for a few minutes.
3184 return false;
3185}
3186
// Leader-side beacon handling: refresh the osd's liveness bookkeeping.
// Never touches pending_inc, so it always returns false (no proposal).
bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const auto beacon = static_cast<MOSDBeacon*>(op->get_req());
  const auto src = beacon->get_orig_source();
  dout(10) << __func__ << " " << *beacon
	   << " from " << src << dendl;
  int from = src.num();

  // only accept beacons from the currently-registered up instance
  if (!src.is_osd() ||
      !osdmap.is_up(from) ||
      beacon->get_orig_source_inst() != osdmap.get_inst(from)) {
    dout(1) << " ignoring beacon from non-active osd." << dendl;
    return false;
  }

  // record when we last heard from this osd and which map it has
  last_osd_report[from] = ceph_clock_now();
  osd_epochs[from] = beacon->version;

  // fold the reported pgs into the last-epoch-clean tracker
  for (const auto& pg : beacon->pgs) {
    last_epoch_clean.report(pg, beacon->min_last_epoch_clean);
  }
  return false;
}
3211
3212// ---------------
3213// map helpers
3214
3215void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
3216{
3217 op->mark_osdmon_event(__func__);
3218 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
3219 << " start " << start << dendl;
3220 if (start == 0)
3221 send_full(op);
3222 else
3223 send_incremental(op, start);
3224}
3225
3226
28e407b8 3227MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
7c673cae 3228{
28e407b8
AA
3229 MOSDMap *r = new MOSDMap(mon->monmap->fsid, features);
3230 get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
7c673cae
FG
3231 r->oldest_map = get_first_committed();
3232 r->newest_map = osdmap.get_epoch();
3233 return r;
3234}
3235
28e407b8 3236MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
7c673cae 3237{
28e407b8
AA
3238 dout(10) << "build_incremental [" << from << ".." << to << "] with features " << std::hex << features << dendl;
3239 MOSDMap *m = new MOSDMap(mon->monmap->fsid, features);
7c673cae
FG
3240 m->oldest_map = get_first_committed();
3241 m->newest_map = osdmap.get_epoch();
3242
3243 for (epoch_t e = to; e >= from && e > 0; e--) {
3244 bufferlist bl;
28e407b8 3245 int err = get_version(e, features, bl);
7c673cae
FG
3246 if (err == 0) {
3247 assert(bl.length());
3248 // if (get_version(e, bl) > 0) {
3249 dout(20) << "build_incremental inc " << e << " "
3250 << bl.length() << " bytes" << dendl;
3251 m->incremental_maps[e] = bl;
3252 } else {
3253 assert(err == -ENOENT);
3254 assert(!bl.length());
28e407b8 3255 get_version_full(e, features, bl);
7c673cae
FG
3256 if (bl.length() > 0) {
3257 //else if (get_version("full", e, bl) > 0) {
3258 dout(20) << "build_incremental full " << e << " "
3259 << bl.length() << " bytes" << dendl;
3260 m->maps[e] = bl;
3261 } else {
3262 ceph_abort(); // we should have all maps.
3263 }
3264 }
3265 }
3266 return m;
3267}
3268
// Reply to the op with the complete current osdmap, encoded for the
// requester's connection features.
void OSDMonitor::send_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
  mon->send_reply(op, build_latest_full(op->get_session()->con_features));
}
3275
// Send incremental maps starting at 'first' in response to op. If the op
// was proxied by a peon that supports osdmap routing, delegate the send to
// the proxying monitor instead of doing it here.
void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
{
  op->mark_osdmon_event(__func__);

  MonSession *s = op->get_session();
  assert(s);

  if (s->proxy_con &&
      s->proxy_con->has_feature(CEPH_FEATURE_MON_ROUTE_OSDMAP)) {
    // oh, we can tell the other mon to do it
    dout(10) << __func__ << " asking proxying mon to send_incremental from "
	     << first << dendl;
    MRoute *r = new MRoute(s->proxy_tid, NULL);
    r->send_osdmap_first = first;
    s->proxy_con->send_message(r);
    op->mark_event("reply: send routed send_osdmap_first reply");
  } else {
    // do it ourselves
    send_incremental(first, s, false, op);
  }
}
3297
// Stream maps [first..current] to a session. If 'req' is set, exactly one
// message is sent as a routed reply; if 'onetime', at most one batch is
// sent. Updates session->osd_epoch as maps go out so we never resend.
void OSDMonitor::send_incremental(epoch_t first,
				  MonSession *session,
				  bool onetime,
				  MonOpRequestRef req)
{
  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
	  << " to " << session->inst << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = session->con_features ? session->con_features :
    mon->get_quorum_con_features();

  // skip epochs the session is already known to have
  if (first <= session->osd_epoch) {
    dout(10) << __func__ << " " << session->inst << " should already have epoch "
	     << session->osd_epoch << dendl;
    first = session->osd_epoch + 1;
  }

  if (first < get_first_committed()) {
    // requested epochs have been trimmed; start from a full map instead
    first = get_first_committed();
    bufferlist bl;
    int err = get_version_full(first, features, bl);
    assert(err == 0);
    assert(bl.length());

    dout(20) << "send_incremental starting with base full "
	     << first << " " << bl.length() << " bytes" << dendl;

    MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
    m->oldest_map = get_first_committed();
    m->newest_map = osdmap.get_epoch();
    m->maps[first] = bl;

    if (req) {
      // routed reply: one message only, we're done
      mon->send_reply(req, m);
      session->osd_epoch = first;
      return;
    } else {
      session->con->send_message(m);
      session->osd_epoch = first;
    }
    first++;
  }

  while (first <= osdmap.get_epoch()) {
    // batch at most osd_map_message_max epochs per message
    epoch_t last = std::min<epoch_t>(first + g_conf->osd_map_message_max - 1,
				     osdmap.get_epoch());
    MOSDMap *m = build_incremental(first, last, features);

    if (req) {
      // send some maps.  it may not be all of them, but it will get them
      // started.
      mon->send_reply(req, m);
    } else {
      session->con->send_message(m);
      first = last + 1;
    }
    session->osd_epoch = last;
    if (onetime || req)
      break;
  }
}
3361
3362int OSDMonitor::get_version(version_t ver, bufferlist& bl)
3363{
28e407b8
AA
3364 return get_version(ver, mon->get_quorum_con_features(), bl);
3365}
3366
// Re-encode an incremental map blob in-place for an older peer, using the
// intersection of the peer's features and the incremental's own canonical
// encode features. Any embedded full map or crush map is re-encoded too.
void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
{
  OSDMap::Incremental inc;
  bufferlist::iterator q = bl.begin();
  inc.decode(q);
  // always encode with subset of osdmap's canonical features
  uint64_t f = features & inc.encode_features;
  dout(20) << __func__ << " " << inc.epoch << " with features " << f
	   << dendl;
  bl.clear();
  if (inc.fullmap.length()) {
    // embedded full map?
    OSDMap m;
    m.decode(inc.fullmap);
    inc.fullmap.clear();
    m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
  }
  if (inc.crush.length()) {
    // embedded crush map
    CrushWrapper c;
    auto p = inc.crush.begin();
    c.decode(p);
    inc.crush.clear();
    c.encode(inc.crush, f);
  }
  inc.encode(bl, f | CEPH_FEATURE_RESERVED);
}
3394
3395void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
3396{
3397 OSDMap m;
3398 bufferlist::iterator q = bl.begin();
3399 m.decode(q);
3400 // always encode with subset of osdmap's canonical features
3401 uint64_t f = features & m.get_encoding_features();
3402 dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
3403 << dendl;
3404 bl.clear();
3405 m.encode(bl, f | CEPH_FEATURE_RESERVED);
3406}
3407
// Fetch incremental map 'ver' encoded for 'features', with a per-feature-
// signature cache in front of the Paxos store. Returns 0 or a negative
// errno from PaxosService::get_version.
int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
{
  // cache key uses only the bits that change the encoding
  uint64_t significant_features = OSDMap::get_significant_features(features);
  if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
    return 0;
  }
  int ret = PaxosService::get_version(ver, bl);
  if (ret < 0) {
    return ret;
  }
  // NOTE: this check is imprecise; the OSDMap encoding features may
  // be a subset of the latest mon quorum features, but worst case we
  // reencode once and then cache the (identical) result under both
  // feature masks.
  if (significant_features !=
      OSDMap::get_significant_features(mon->get_quorum_con_features())) {
    reencode_incremental_map(bl, features);
  }
  inc_osd_cache.add({ver, significant_features}, bl);
  return 0;
}
3429
3430int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
3431{
28e407b8
AA
3432 return get_version_full(ver, mon->get_quorum_con_features(), bl);
3433}
3434
// Fetch full map 'ver' encoded for 'features', with a per-feature-signature
// cache in front of the Paxos store. Returns 0 or a negative errno from
// PaxosService::get_version_full.
int OSDMonitor::get_version_full(version_t ver, uint64_t features,
				 bufferlist& bl)
{
  // cache key uses only the bits that change the encoding
  uint64_t significant_features = OSDMap::get_significant_features(features);
  if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
    return 0;
  }
  int ret = PaxosService::get_version_full(ver, bl);
  if (ret < 0) {
    return ret;
  }
  // NOTE: this check is imprecise; the OSDMap encoding features may
  // be a subset of the latest mon quorum features, but worst case we
  // reencode once and then cache the (identical) result under both
  // feature masks.
  if (significant_features !=
      OSDMap::get_significant_features(mon->get_quorum_con_features())) {
    reencode_full_map(bl, features);
  }
  full_osd_cache.add({ver, significant_features}, bl);
  return 0;
}
3457
// Queue a blacklist entry for address 'a' expiring at 'until'; returns the
// epoch of the pending incremental that will carry it (the change is not
// effective until that map commits).
epoch_t OSDMonitor::blacklist(const entity_addr_t& a, utime_t until)
{
  dout(10) << "blacklist " << a << " until " << until << dendl;
  pending_inc.new_blacklist[a] = until;
  return pending_inc.epoch;
}
3464
3465
3466void OSDMonitor::check_osdmap_subs()
3467{
3468 dout(10) << __func__ << dendl;
3469 if (!osdmap.get_epoch()) {
3470 return;
3471 }
3472 auto osdmap_subs = mon->session_map.subs.find("osdmap");
3473 if (osdmap_subs == mon->session_map.subs.end()) {
3474 return;
3475 }
3476 auto p = osdmap_subs->second->begin();
3477 while (!p.end()) {
3478 auto sub = *p;
3479 ++p;
3480 check_osdmap_sub(sub);
3481 }
3482}
3483
// Satisfy one "osdmap" subscription: send incrementals from sub->next (or a
// full map when next == 0), then either drop a onetime sub or advance its
// next epoch past the current map.
void OSDMonitor::check_osdmap_sub(Subscription *sub)
{
  dout(10) << __func__ << " " << sub << " next " << sub->next
	   << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
  if (sub->next <= osdmap.get_epoch()) {
    if (sub->next >= 1)
      send_incremental(sub->next, sub->session, sub->incremental_onetime);
    else
      sub->session->con->send_message(build_latest_full(sub->session->con_features));
    if (sub->onetime)
      mon->session_map.remove_sub(sub);
    else
      sub->next = osdmap.get_epoch() + 1;
  }
}
3499
// Walk every "osd_pg_creates" subscription and push pending pg-create
// messages (luminous+ only; PGMonitor handles the pre-luminous path).
void OSDMonitor::check_pg_creates_subs()
{
  if (!mon->monmap->get_required_features().contains_all(
        ceph::features::mon::FEATURE_LUMINOUS)) {
    // PGMonitor takes care of this in pre-luminous era.
    return;
  }
  if (!osdmap.get_num_up_osds()) {
    return;
  }
  assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
  mon->with_session_map([this](const MonSessionMap& session_map) {
      auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
      if (pg_creates_subs == session_map.subs.end()) {
	return;
      }
      for (auto sub : *pg_creates_subs->second) {
	check_pg_creates_sub(sub);
      }
    });
}
3521
// Satisfy one "osd_pg_creates" subscription, advancing sub->next past the
// epochs whose create messages were sent.
void OSDMonitor::check_pg_creates_sub(Subscription *sub)
{
  dout(20) << __func__ << " .. " << sub->session->inst << dendl;
  assert(sub->type == "osd_pg_creates");
  // only send these if the OSD is up.  we will check_subs() when they do
  // come up so they will get the creates then.
  if (sub->session->inst.name.is_osd() &&
      mon->osdmon()->osdmap.is_up(sub->session->inst.name.num())) {
    sub->next = send_pg_creates(sub->session->inst.name.num(),
				sub->session->con.get(),
				sub->next);
  }
}
3535
c07f9fc5
FG
3536void OSDMonitor::do_application_enable(int64_t pool_id,
3537 const std::string &app_name)
3538{
35e4c445 3539 assert(paxos->is_plugged() && is_writeable());
c07f9fc5
FG
3540
3541 dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
3542 << dendl;
3543
35e4c445
FG
3544 assert(osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS ||
3545 pending_inc.new_require_osd_release >= CEPH_RELEASE_LUMINOUS);
3546
c07f9fc5
FG
3547 auto pp = osdmap.get_pg_pool(pool_id);
3548 assert(pp != nullptr);
3549
3550 pg_pool_t p = *pp;
3551 if (pending_inc.new_pools.count(pool_id)) {
3552 p = pending_inc.new_pools[pool_id];
3553 }
3554
3555 p.application_metadata.insert({app_name, {}});
3556 p.last_change = pending_inc.epoch;
3557 pending_inc.new_pools[pool_id] = p;
3558}
3559
31f18b77 3560unsigned OSDMonitor::scan_for_creating_pgs(
7c673cae
FG
3561 const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
3562 const mempool::osdmap::set<int64_t>& removed_pools,
3563 utime_t modified,
3564 creating_pgs_t* creating_pgs) const
3565{
31f18b77 3566 unsigned queued = 0;
7c673cae
FG
3567 for (auto& p : pools) {
3568 int64_t poolid = p.first;
3569 const pg_pool_t& pool = p.second;
31f18b77 3570 int ruleno = osdmap.crush->find_rule(pool.get_crush_rule(),
7c673cae
FG
3571 pool.get_type(), pool.get_size());
3572 if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
3573 continue;
3574
3575 const auto last_scan_epoch = creating_pgs->last_scan_epoch;
3576 const auto created = pool.get_last_change();
3577 if (last_scan_epoch && created <= last_scan_epoch) {
3578 dout(10) << __func__ << " no change in pool " << poolid
3579 << " " << pool << dendl;
3580 continue;
3581 }
3582 if (removed_pools.count(poolid)) {
3583 dout(10) << __func__ << " pool is being removed: " << poolid
3584 << " " << pool << dendl;
3585 continue;
3586 }
31f18b77 3587 dout(10) << __func__ << " queueing pool create for " << poolid
7c673cae 3588 << " " << pool << dendl;
31f18b77
FG
3589 if (creating_pgs->create_pool(poolid, pool.get_pg_num(),
3590 created, modified)) {
3591 queued++;
7c673cae
FG
3592 }
3593 }
31f18b77 3594 return queued;
7c673cae
FG
3595}
3596
// Rebuild creating_pgs_by_osd_epoch from creating_pgs: for each creating pg,
// compute its current acting primary and the epoch to advertise the create
// at, carrying the old epoch forward when the primary hasn't changed (so we
// don't resend creates needlessly).
void OSDMonitor::update_creating_pgs()
{
  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
	   << creating_pgs.queue.size() << " pools in queue" << dendl;
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (const auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    if (!osdmap.pg_exists(pgid)) {
      dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
	       << dendl;
      continue;
    }
    // default epoch: the pg's creation epoch
    auto mapped = pg.second.first;
    dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
    mapping.get(pgid, nullptr, nullptr, nullptr, &acting_primary);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
	if (pgs.second.count(pgid)) {
	  if (last_acting_primary == acting_primary) {
	    // same target: keep the previously advertised epoch
	    mapped = pgs.first;
	  } else {
	    dout(20) << __func__ << " " << pgid << " "
		     << " acting_primary:" << last_acting_primary
		     << " -> " << acting_primary << dendl;
	    // note epoch if the target of the create message changed.
	    mapped = mapping.get_epoch();
	  }
	  break;
	} else {
	  // newly creating
	  mapped = mapping.get_epoch();
	}
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
	     << " to create " << pgid << "@" << mapped << dendl;
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(pgid);
  }
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
3643
c07f9fc5 3644epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
7c673cae
FG
3645{
3646 dout(30) << __func__ << " osd." << osd << " next=" << next
3647 << " " << creating_pgs_by_osd_epoch << dendl;
3648 std::lock_guard<std::mutex> l(creating_pgs_lock);
b5b8bbf5
FG
3649 if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
3650 dout(20) << __func__
3651 << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
3652 // the subscribers will be updated when the mapping is completed anyway
3653 return next;
3654 }
7c673cae
FG
3655 auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
3656 if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
3657 return next;
3658 assert(!creating_pgs_by_epoch->second.empty());
3659
3660 MOSDPGCreate *m = nullptr;
3661 epoch_t last = 0;
3662 for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
3663 epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
3664 auto epoch = epoch_pgs->first;
3665 auto& pgs = epoch_pgs->second;
3666 dout(20) << __func__ << " osd." << osd << " from " << next
3667 << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
3668 last = epoch;
3669 for (auto& pg : pgs) {
3670 if (!m)
3671 m = new MOSDPGCreate(creating_pgs_epoch);
3672 // Need the create time from the monitor using its clock to set
3673 // last_scrub_stamp upon pg creation.
c07f9fc5
FG
3674 auto create = creating_pgs.pgs.find(pg);
3675 assert(create != creating_pgs.pgs.end());
3676 m->mkpg.emplace(pg, pg_create_t{create->second.first, pg, 0});
3677 m->ctimes.emplace(pg, create->second.second);
7c673cae 3678 dout(20) << __func__ << " will create " << pg
c07f9fc5 3679 << " at " << create->second.first << dendl;
7c673cae
FG
3680 }
3681 }
3682 if (!m) {
3683 dout(20) << __func__ << " osd." << osd << " from " << next
3684 << " has nothing to send" << dendl;
3685 return next;
3686 }
3687 con->send_message(m);
3688 // sub is current through last + 1
3689 return last + 1;
3690}
3691
3692// TICK
3693
3694
// Periodic maintenance for the OSD map, driven by the monitor's tick timer.
// Only the leader acts here: it ages down->out timers, expires blacklist
// entries, maintains pre-luminous full/nearfull compat flags, and proposes a
// new map epoch if anything changed.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // Everything below mutates pending_inc / proposes; peons do nothing.
  if (!mon->is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  // Beacon-based liveness checks only apply once the whole cluster
  // (osds and mons) is luminous-capable.
  if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
      mon->monmap->get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    if (handle_osd_timeouts(now, last_osd_report)) {
      do_propose = true;
    }
  }
  // One-shot flag: once mgr stats confirm all legacy snapsets are converted,
  // record PURGED_SNAPDIRS so later code can rely on it.
  if (!osdmap.test_flag(CEPH_OSDMAP_PURGED_SNAPDIRS) &&
      osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
      mon->mgrstatmon()->is_readable() &&
      mon->mgrstatmon()->definitely_converted_snapsets()) {
    dout(1) << __func__ << " all snapsets converted, setting purged_snapdirs"
	    << dendl;
    add_flag(CEPH_OSDMAP_PURGED_SNAPDIRS);
    do_propose = true;
  }

  // mark osds down?
  if (check_failures(now))
    do_propose = true;

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      utime_t down = now;
      down -= i->second;          // how long this osd has been down
      ++i;                        // advance before possible erase(o) below

      if (osdmap.is_down(o) &&
	  osdmap.is_in(o) &&
	  can_mark_out(o)) {
	utime_t orig_grace(g_conf->mon_osd_down_out_interval, 0);
	utime_t grace = orig_grace;
	double my_grace = 0.0;

	if (g_conf->mon_osd_adjust_down_out_interval) {
	  // scale grace period the same way we do the heartbeat grace.
	  const osd_xinfo_t& xi = osdmap.get_xinfo(o);
	  double halflife = (double)g_conf->mon_osd_laggy_halflife;
	  double decay_k = ::log(.5) / halflife;
	  double decay = exp((double)down * decay_k);
	  dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
		   << " down for " << down << " decay " << decay << dendl;
	  my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
	  grace += my_grace;
	}

	// is this an entire large subtree down?
	if (g_conf->mon_osd_down_out_subtree_limit.length()) {
	  int type = osdmap.crush->get_type_id(g_conf->mon_osd_down_out_subtree_limit);
	  if (type > 0) {
	    if (osdmap.containing_subtree_is_down(g_ceph_context, o, type, &down_cache)) {
	      dout(10) << "tick entire containing " << g_conf->mon_osd_down_out_subtree_limit
		       << " subtree for osd." << o << " is down; resetting timer" << dendl;
	      // reset timer, too.
	      down_pending_out[o] = now;
	      continue;
	    }
	  }
	}

	// Non-destroyed osds wait out the (possibly laggy-adjusted) grace;
	// destroyed osds use the separate destroyed_out interval.
	bool down_out = !osdmap.is_destroyed(o) &&
	  g_conf->mon_osd_down_out_interval > 0 && down.sec() >= grace;
	bool destroyed_out = osdmap.is_destroyed(o) &&
	  g_conf->mon_osd_destroyed_out_interval > 0 &&
	  // this is not precise enough as we did not make a note when this osd
	  // was marked as destroyed, but let's not bother with that
	  // complexity for now.
	  down.sec() >= g_conf->mon_osd_destroyed_out_interval;
	if (down_out || destroyed_out) {
	  dout(10) << "tick marking osd." << o << " OUT after " << down
		   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
	  pending_inc.new_weight[o] = CEPH_OSD_OUT;

	  // set the AUTOOUT bit.
	  if (pending_inc.new_state.count(o) == 0)
	    pending_inc.new_state[o] = 0;
	  pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

	  // remember previous weight
	  if (pending_inc.new_xinfo.count(o) == 0)
	    pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

	  do_propose = true;

	  mon->clog->info() << "Marking osd." << o << " out (has been down for "
			    << int(down.sec()) << " seconds)";
	} else
	  continue;   // not yet expired: keep its down_pending_out entry
      }

      // reached when the osd was marked out, or is no longer down+in
      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blacklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
       p != osdmap.blacklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blacklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blacklist.push_back(p->first);
      do_propose = true;
    }
  }

  // if map full setting has changed, get that info out there!
  if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS &&
      mon->pgservice->is_readable()) {
    // for pre-luminous compat only!
    if (mon->pgservice->have_full_osds()) {
      dout(5) << "There are full osds, setting full flag" << dendl;
      add_flag(CEPH_OSDMAP_FULL);
    } else if (osdmap.test_flag(CEPH_OSDMAP_FULL)){
      dout(10) << "No full osds, removing full flag" << dendl;
      remove_flag(CEPH_OSDMAP_FULL);
    }

    if (mon->pgservice->have_nearfull_osds()) {
      dout(5) << "There are near full osds, setting nearfull flag" << dendl;
      add_flag(CEPH_OSDMAP_NEARFULL);
    } else if (osdmap.test_flag(CEPH_OSDMAP_NEARFULL)){
      dout(10) << "No near full osds, removing nearfull flag" << dendl;
      remove_flag(CEPH_OSDMAP_NEARFULL);
    }
    // Propose only if the FULL/NEARFULL bits actually flipped relative to
    // the committed map (new_flags is -1 when untouched this round).
    if (pending_inc.new_flags != -1 &&
	(pending_inc.new_flags ^ osdmap.flags) & (CEPH_OSDMAP_FULL | CEPH_OSDMAP_NEARFULL)) {
      dout(1) << "New setting for" <<
	(pending_inc.new_flags & CEPH_OSDMAP_FULL ? " CEPH_OSDMAP_FULL" : "") <<
	(pending_inc.new_flags & CEPH_OSDMAP_NEARFULL ? " CEPH_OSDMAP_NEARFULL" : "")
	<< " -- doing propose" << dendl;
      do_propose = true;
    }
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty())  // also propose if we adjusted pg_temp
    propose_pending();
}
3861
3862bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
3863 std::map<int,utime_t> &last_osd_report)
3864{
3865 utime_t timeo(g_conf->mon_osd_report_timeout, 0);
3866 if (now - mon->get_leader_since() < timeo) {
3867 // We haven't been the leader for long enough to consider OSD timeouts
3868 return false;
3869 }
3870
3871 int max_osd = osdmap.get_max_osd();
3872 bool new_down = false;
3873
3874 for (int i=0; i < max_osd; ++i) {
3875 dout(30) << __func__ << ": checking up on osd " << i << dendl;
c07f9fc5
FG
3876 if (!osdmap.exists(i)) {
3877 last_osd_report.erase(i); // if any
3878 continue;
3879 }
7c673cae
FG
3880 if (!osdmap.is_up(i))
3881 continue;
3882 const std::map<int,utime_t>::const_iterator t = last_osd_report.find(i);
3883 if (t == last_osd_report.end()) {
3884 // it wasn't in the map; start the timer.
3885 last_osd_report[i] = now;
3886 } else if (can_mark_down(i)) {
3887 utime_t diff = now - t->second;
3888 if (diff > timeo) {
31f18b77
FG
3889 mon->clog->info() << "osd." << i << " marked down after no beacon for "
3890 << diff << " seconds";
3891 derr << "no beacon from osd." << i << " since " << t->second
3892 << ", " << diff << " seconds ago. marking down" << dendl;
7c673cae
FG
3893 pending_inc.new_state[i] = CEPH_OSD_UP;
3894 new_down = true;
3895 }
3896 }
3897 }
3898 return new_down;
3899}
3900
// Populate legacy health output for the OSD subsystem.
// 'summary' receives top-line items (always); 'detail', if non-NULL,
// receives per-item elaborations.  Covers: down osds grouped by crush
// subtree, stray crush entries, warning flags, legacy tunables, cache-pool
// misconfiguration, sortbitwise, mon_osd_down_out_interval==0, missing
// require_osd_release upgrades, and full pools.  'cct' is unused here.
void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
			    list<pair<health_status_t,string> > *detail,
			    CephContext *cct) const
{
  int num_osds = osdmap.get_num_osds();

  if (num_osds == 0) {
    summary.push_back(make_pair(HEALTH_ERR, "no osds"));
  } else {
    int num_in_osds = 0;
    int num_down_in_osds = 0;
    set<int> osds;                 // in crush but not in the osdmap (strays)
    set<int> down_in_osds;
    set<int> up_in_osds;
    set<int> subtree_up;           // crush buckets known to have an up osd
    // crush type id -> buckets of that type that are entirely down
    unordered_map<int, set<int> > subtree_type_down;
    // bucket id -> number of osds under it that are down
    unordered_map<int, int> num_osds_subtree;
    int max_type = osdmap.crush->get_max_type_id();

    for (int i = 0; i < osdmap.get_max_osd(); i++) {
      if (!osdmap.exists(i)) {
	if (osdmap.crush->item_exists(i)) {
	  osds.insert(i);
	}
	continue;
      }
      if (osdmap.is_out(i))
	continue;
      ++num_in_osds;
      // already classified while walking an earlier osd's ancestors
      if (down_in_osds.count(i) || up_in_osds.count(i))
	continue;
      if (!osdmap.is_up(i)) {
	down_in_osds.insert(i);
	int parent_id = 0;
	int current = i;
	// Walk up the crush hierarchy, marking each fully-down ancestor
	// bucket in subtree_type_down; stop at the first ancestor that is
	// not fully down (or already known up).
	for (int type = 0; type <= max_type; type++) {
	  if (!osdmap.crush->get_type_name(type))
	    continue;
	  int r = osdmap.crush->get_immediate_parent_id(current, &parent_id);
	  if (r == -ENOENT)
	    break;
	  // break early if this parent is already marked as up
	  if (subtree_up.count(parent_id))
	    break;
	  type = osdmap.crush->get_bucket_type(parent_id);
	  if (!osdmap.subtree_type_is_down(
		g_ceph_context, parent_id, type,
		&down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
	    break;
	  current = parent_id;
	}
      }
    }

    // calculate the number of down osds in each down subtree and
    // store it in num_osds_subtree
    for (int type = 1; type <= max_type; type++) {
      if (!osdmap.crush->get_type_name(type))
	continue;
      for (auto j = subtree_type_down[type].begin();
	   j != subtree_type_down[type].end();
	   ++j) {
	if (type == 1) {
	  // type 1 buckets hold osds directly; count their children
	  list<int> children;
	  int num = osdmap.crush->get_children(*j, &children);
	  num_osds_subtree[*j] = num;
	} else {
	  // higher-level buckets: sum the already-computed child counts
	  // (relies on iterating types in ascending order)
	  list<int> children;
	  int num = 0;
	  int num_children = osdmap.crush->get_children(*j, &children);
	  if (num_children == 0)
	    continue;
	  for (auto l = children.begin(); l != children.end(); ++l) {
	    if (num_osds_subtree[*l] > 0) {
	      num = num + num_osds_subtree[*l];
	    }
	  }
	  num_osds_subtree[*j] = num;
	}
      }
    }
    num_down_in_osds = down_in_osds.size();
    assert(num_down_in_osds <= num_in_osds);
    if (num_down_in_osds > 0) {
      // summary of down subtree types and osds
      for (int type = max_type; type > 0; type--) {
	if (!osdmap.crush->get_type_name(type))
	  continue;
	if (subtree_type_down[type].size() > 0) {
	  ostringstream ss;
	  ss << subtree_type_down[type].size() << " "
	     << osdmap.crush->get_type_name(type);
	  if (subtree_type_down[type].size() > 1) {
	    ss << "s";
	  }
	  int sum_down_osds = 0;
	  for (auto j = subtree_type_down[type].begin();
	       j != subtree_type_down[type].end();
	       ++j) {
	    sum_down_osds = sum_down_osds + num_osds_subtree[*j];
	  }
	  ss << " (" << sum_down_osds << " osds) down";
	  summary.push_back(make_pair(HEALTH_WARN, ss.str()));
	}
      }
      ostringstream ss;
      ss << down_in_osds.size() << " osds down";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));

      if (detail) {
	// details of down subtree types
	for (int type = max_type; type > 0; type--) {
	  if (!osdmap.crush->get_type_name(type))
	    continue;
	  for (auto j = subtree_type_down[type].rbegin();
	       j != subtree_type_down[type].rend();
	       ++j) {
	    ostringstream ss;
	    ss << osdmap.crush->get_type_name(type);
	    ss << " ";
	    ss << osdmap.crush->get_item_name(*j);
	    // at the top level, do not print location
	    if (type != max_type) {
	      ss << " (";
	      ss << osdmap.crush->get_full_location_ordered_string(*j);
	      ss << ")";
	    }
	    int num = num_osds_subtree[*j];
	    ss << " (" << num << " osds)";
	    ss << " is down";
	    detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	  }
	}
	// details of down osds
	for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
	  ostringstream ss;
	  ss << "osd." << *it << " (";
	  ss << osdmap.crush->get_full_location_ordered_string(*it);
	  ss << ") is down";
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	}
      }
    }

    if (!osds.empty()) {
      ostringstream ss;
      ss << osds.size() << " osds exist in the crush map but not in the osdmap";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      if (detail) {
	// note: intentionally reuses ss, so the detail line is the summary
	// line with the osd list appended
	ss << " (osds: " << osds << ")";
	detail->push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }

    // note: we leave it to ceph-mgr to generate details health warnings
    // with actual osd utilizations

    // warn about flags
    uint64_t warn_flags =
      CEPH_OSDMAP_FULL |
      CEPH_OSDMAP_PAUSERD |
      CEPH_OSDMAP_PAUSEWR |
      CEPH_OSDMAP_PAUSEREC |
      CEPH_OSDMAP_NOUP |
      CEPH_OSDMAP_NODOWN |
      CEPH_OSDMAP_NOIN |
      CEPH_OSDMAP_NOOUT |
      CEPH_OSDMAP_NOBACKFILL |
      CEPH_OSDMAP_NORECOVER |
      CEPH_OSDMAP_NOSCRUB |
      CEPH_OSDMAP_NODEEP_SCRUB |
      CEPH_OSDMAP_NOTIERAGENT |
      CEPH_OSDMAP_NOREBALANCE;
    if (osdmap.test_flag(warn_flags)) {
      ostringstream ss;
      ss << osdmap.get_flag_string(osdmap.get_flags() & warn_flags)
	 << " flag(s) set";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      if (detail)
	detail->push_back(make_pair(HEALTH_WARN, ss.str()));
    }

    // old crush tunables?
    if (g_conf->mon_warn_on_legacy_crush_tunables) {
      string min = osdmap.crush->get_min_required_version();
      if (min < g_conf->mon_crush_min_required_version) {
	ostringstream ss;
	ss << "crush map has legacy tunables (require " << min
	   << ", min is " << g_conf->mon_crush_min_required_version << ")";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
	if (detail) {
	  ss << "; see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables";
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	}
      }
    }
    if (g_conf->mon_warn_on_crush_straw_calc_version_zero) {
      if (osdmap.crush->get_straw_calc_version() == 0) {
	ostringstream ss;
	ss << "crush map has straw_calc_version=0";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
	if (detail) {
	  ss << "; see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables";
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	}
      }
    }

    // hit_set-less cache_mode?
    if (g_conf->mon_warn_on_cache_pools_without_hit_sets) {
      int problem_cache_pools = 0;
      for (map<int64_t, pg_pool_t>::const_iterator p = osdmap.pools.begin();
	   p != osdmap.pools.end();
	   ++p) {
	const pg_pool_t& info = p->second;
	if (info.cache_mode_requires_hit_set() &&
	    info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
	  ++problem_cache_pools;
	  if (detail) {
	    ostringstream ss;
	    ss << "pool '" << osdmap.get_pool_name(p->first)
	       << "' with cache_mode " << info.get_cache_mode_name()
	       << " needs hit_set_type to be set but it is not";
	    detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	  }
	}
      }
      if (problem_cache_pools) {
	ostringstream ss;
	ss << problem_cache_pools << " cache pools are missing hit_sets";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }

    // Not using 'sortbitwise' and should be?
    if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE) &&
	(osdmap.get_up_osd_features() &
	 CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)) {
      ostringstream ss;
      ss << "no legacy OSD present but 'sortbitwise' flag is not set";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
    }

    // Warn if 'mon_osd_down_out_interval' is set to zero.
    // Having this option set to zero on the leader acts much like the
    // 'noout' flag. It's hard to figure out what's going wrong with clusters
    // without the 'noout' flag set but acting like that just the same, so
    // we report a HEALTH_WARN in case this option is set to zero.
    // This is an ugly hack to get the warning out, but until we find a way
    // to spread global options throughout the mon cluster and have all mons
    // using a base set of the same options, we need to work around this sort
    // of things.
    // There's also the obvious drawback that if this is set on a single
    // monitor on a 3-monitor cluster, this warning will only be shown every
    // third monitor connection.
    if (g_conf->mon_warn_on_osd_down_out_interval_zero &&
	g_conf->mon_osd_down_out_interval == 0) {
      ostringstream ss;
      ss << "mon." << mon->name << " has mon_osd_down_out_interval set to 0";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      if (detail) {
	ss << "; this has the same effect as the 'noout' flag";
	detail->push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }

    // warn about upgrade flags that can be set but are not.
    if (g_conf->mon_debug_no_require_luminous) {
      // ignore these checks
    } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_LUMINOUS) &&
	       osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
      string msg = "all OSDs are running luminous or later but"
	" require_osd_release < luminous";
      summary.push_back(make_pair(HEALTH_WARN, msg));
      if (detail) {
	detail->push_back(make_pair(HEALTH_WARN, msg));
      }
    } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_KRAKEN) &&
	       osdmap.require_osd_release < CEPH_RELEASE_KRAKEN) {
      string msg = "all OSDs are running kraken or later but"
	" require_osd_release < kraken";
      summary.push_back(make_pair(HEALTH_WARN, msg));
      if (detail) {
	detail->push_back(make_pair(HEALTH_WARN, msg));
      }
    } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_JEWEL) &&
	       osdmap.require_osd_release < CEPH_RELEASE_JEWEL) {
      string msg = "all OSDs are running jewel or later but"
	" require_osd_release < jewel";
      summary.push_back(make_pair(HEALTH_WARN, msg));
      if (detail) {
	detail->push_back(make_pair(HEALTH_WARN, msg));
      }
    }

    // per-pool FULL flag warnings
    for (auto it : osdmap.get_pools()) {
      const pg_pool_t &pool = it.second;
      if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
	const string& pool_name = osdmap.get_pool_name(it.first);
	stringstream ss;
	ss << "pool '" << pool_name << "' is full";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
	if (detail)
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }
  }
}
4209
4210void OSDMonitor::dump_info(Formatter *f)
4211{
4212 f->open_object_section("osdmap");
4213 osdmap.dump(f);
4214 f->close_section();
4215
4216 f->open_array_section("osd_metadata");
4217 for (int i=0; i<osdmap.get_max_osd(); ++i) {
4218 if (osdmap.exists(i)) {
4219 f->open_object_section("osd");
4220 f->dump_unsigned("id", i);
4221 dump_osd_metadata(i, f, NULL);
4222 f->close_section();
4223 }
4224 }
4225 f->close_section();
4226
4227 f->dump_unsigned("osdmap_first_committed", get_first_committed());
4228 f->dump_unsigned("osdmap_last_committed", get_last_committed());
4229
4230 f->open_object_section("crushmap");
4231 osdmap.crush->dump(f);
4232 f->close_section();
4233}
4234
namespace {
  // All properties that 'osd pool get' can report.  The enumerator order is
  // part of the internal contract (sets of these are compared/differenced),
  // so it must not be reordered.
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE, CRASH_REPLAY_INTERVAL,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, AUID, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK };

  // Set difference: the elements of 'first' that do not appear in 'second'.
  std::set<osd_pool_get_choices>
  subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			     const std::set<osd_pool_get_choices>& second)
  {
    std::set<osd_pool_get_choices> result;
    for (const auto& choice : first) {
      if (second.count(choice) == 0) {
	// 'first' iterates in ascending order, so hinting at end() is O(1)
	result.insert(result.end(), choice);
      }
    }
    return result;
  }
}
4266
4267
4268bool OSDMonitor::preprocess_command(MonOpRequestRef op)
4269{
4270 op->mark_osdmon_event(__func__);
4271 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
4272 int r = 0;
4273 bufferlist rdata;
4274 stringstream ss, ds;
4275
4276 map<string, cmd_vartype> cmdmap;
4277 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
4278 string rs = ss.str();
4279 mon->reply_command(op, -EINVAL, rs, get_last_committed());
4280 return true;
4281 }
4282
4283 MonSession *session = m->get_session();
4284 if (!session) {
4285 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
4286 return true;
4287 }
4288
4289 string prefix;
f64942e4 4290 cmd_getval_throws(g_ceph_context, cmdmap, "prefix", prefix);
7c673cae
FG
4291
4292 string format;
f64942e4 4293 cmd_getval_throws(g_ceph_context, cmdmap, "format", format, string("plain"));
7c673cae
FG
4294 boost::scoped_ptr<Formatter> f(Formatter::create(format));
4295
4296 if (prefix == "osd stat") {
224ce89b 4297 osdmap.print_summary(f.get(), ds, "");
7c673cae
FG
4298 if (f)
4299 f->flush(rdata);
4300 else
4301 rdata.append(ds);
4302 }
4303 else if (prefix == "osd perf" ||
4304 prefix == "osd blocked-by") {
31f18b77
FG
4305 r = mon->pgservice->process_pg_command(prefix, cmdmap,
4306 osdmap, f.get(), &ss, &rdata);
7c673cae
FG
4307 }
4308 else if (prefix == "osd dump" ||
4309 prefix == "osd tree" ||
4310 prefix == "osd ls" ||
4311 prefix == "osd getmap" ||
31f18b77
FG
4312 prefix == "osd getcrushmap" ||
4313 prefix == "osd ls-tree") {
7c673cae
FG
4314 string val;
4315
4316 epoch_t epoch = 0;
4317 int64_t epochnum;
f64942e4 4318 cmd_getval_throws(g_ceph_context, cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
7c673cae
FG
4319 epoch = epochnum;
4320
4321 bufferlist osdmap_bl;
4322 int err = get_version_full(epoch, osdmap_bl);
4323 if (err == -ENOENT) {
4324 r = -ENOENT;
4325 ss << "there is no map for epoch " << epoch;
4326 goto reply;
4327 }
4328 assert(err == 0);
4329 assert(osdmap_bl.length());
4330
4331 OSDMap *p;
4332 if (epoch == osdmap.get_epoch()) {
4333 p = &osdmap;
4334 } else {
4335 p = new OSDMap;
4336 p->decode(osdmap_bl);
4337 }
4338
224ce89b
WB
4339 auto sg = make_scope_guard([&] {
4340 if (p != &osdmap) {
4341 delete p;
4342 }
4343 });
4344
7c673cae
FG
4345 if (prefix == "osd dump") {
4346 stringstream ds;
4347 if (f) {
4348 f->open_object_section("osdmap");
4349 p->dump(f.get());
4350 f->close_section();
4351 f->flush(ds);
4352 } else {
4353 p->print(ds);
4354 }
4355 rdata.append(ds);
4356 if (!f)
4357 ds << " ";
4358 } else if (prefix == "osd ls") {
4359 if (f) {
4360 f->open_array_section("osds");
4361 for (int i = 0; i < osdmap.get_max_osd(); i++) {
4362 if (osdmap.exists(i)) {
4363 f->dump_int("osd", i);
4364 }
4365 }
4366 f->close_section();
4367 f->flush(ds);
4368 } else {
4369 bool first = true;
4370 for (int i = 0; i < osdmap.get_max_osd(); i++) {
4371 if (osdmap.exists(i)) {
4372 if (!first)
4373 ds << "\n";
4374 first = false;
4375 ds << i;
4376 }
4377 }
4378 }
4379 rdata.append(ds);
4380 } else if (prefix == "osd tree") {
31f18b77 4381 vector<string> states;
f64942e4 4382 cmd_getval_throws(g_ceph_context, cmdmap, "states", states);
31f18b77
FG
4383 unsigned filter = 0;
4384 for (auto& s : states) {
4385 if (s == "up") {
4386 filter |= OSDMap::DUMP_UP;
4387 } else if (s == "down") {
4388 filter |= OSDMap::DUMP_DOWN;
4389 } else if (s == "in") {
4390 filter |= OSDMap::DUMP_IN;
4391 } else if (s == "out") {
4392 filter |= OSDMap::DUMP_OUT;
c07f9fc5
FG
4393 } else if (s == "destroyed") {
4394 filter |= OSDMap::DUMP_DESTROYED;
31f18b77
FG
4395 } else {
4396 ss << "unrecognized state '" << s << "'";
4397 r = -EINVAL;
4398 goto reply;
4399 }
4400 }
4401 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
c07f9fc5
FG
4402 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
4403 ss << "cannot specify both 'in' and 'out'";
4404 r = -EINVAL;
4405 goto reply;
4406 }
4407 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
4408 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
4409 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
4410 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
4411 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
4412 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
4413 ss << "can specify only one of 'up', 'down' and 'destroyed'";
31f18b77
FG
4414 r = -EINVAL;
4415 goto reply;
4416 }
7c673cae
FG
4417 if (f) {
4418 f->open_object_section("tree");
31f18b77 4419 p->print_tree(f.get(), NULL, filter);
7c673cae
FG
4420 f->close_section();
4421 f->flush(ds);
4422 } else {
31f18b77 4423 p->print_tree(NULL, &ds, filter);
7c673cae
FG
4424 }
4425 rdata.append(ds);
4426 } else if (prefix == "osd getmap") {
4427 rdata.append(osdmap_bl);
4428 ss << "got osdmap epoch " << p->get_epoch();
4429 } else if (prefix == "osd getcrushmap") {
4430 p->crush->encode(rdata, mon->get_quorum_con_features());
31f18b77
FG
4431 ss << p->get_crush_version();
4432 } else if (prefix == "osd ls-tree") {
4433 string bucket_name;
f64942e4 4434 cmd_getval_throws(g_ceph_context, cmdmap, "name", bucket_name);
31f18b77
FG
4435 set<int> osds;
4436 r = p->get_osds_by_bucket_name(bucket_name, &osds);
4437 if (r == -ENOENT) {
4438 ss << "\"" << bucket_name << "\" does not exist";
4439 goto reply;
4440 } else if (r < 0) {
4441 ss << "can not parse bucket name:\"" << bucket_name << "\"";
4442 goto reply;
4443 }
4444
4445 if (f) {
4446 f->open_array_section("osds");
4447 for (auto &i : osds) {
4448 if (osdmap.exists(i)) {
4449 f->dump_int("osd", i);
4450 }
4451 }
4452 f->close_section();
4453 f->flush(ds);
4454 } else {
4455 bool first = true;
4456 for (auto &i : osds) {
4457 if (osdmap.exists(i)) {
4458 if (!first)
4459 ds << "\n";
4460 first = false;
4461 ds << i;
4462 }
4463 }
4464 }
4465
4466 rdata.append(ds);
7c673cae 4467 }
7c673cae
FG
4468 } else if (prefix == "osd df") {
4469 string method;
4470 cmd_getval(g_ceph_context, cmdmap, "output_method", method);
31f18b77
FG
4471 print_osd_utilization(osdmap, mon->pgservice, ds,
4472 f.get(), method == "tree");
7c673cae
FG
4473 rdata.append(ds);
4474 } else if (prefix == "osd getmaxosd") {
4475 if (f) {
4476 f->open_object_section("getmaxosd");
4477 f->dump_unsigned("epoch", osdmap.get_epoch());
4478 f->dump_int("max_osd", osdmap.get_max_osd());
4479 f->close_section();
4480 f->flush(rdata);
4481 } else {
4482 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
4483 rdata.append(ds);
4484 }
4485 } else if (prefix == "osd utilization") {
4486 string out;
4487 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
4488 if (f)
4489 f->flush(rdata);
4490 else
4491 rdata.append(out);
4492 r = 0;
4493 goto reply;
4494 } else if (prefix == "osd find") {
4495 int64_t osd;
f64942e4 4496 if (!cmd_getval_throws(g_ceph_context, cmdmap, "id", osd)) {
7c673cae
FG
4497 ss << "unable to parse osd id value '"
4498 << cmd_vartype_stringify(cmdmap["id"]) << "'";
4499 r = -EINVAL;
4500 goto reply;
4501 }
4502 if (!osdmap.exists(osd)) {
4503 ss << "osd." << osd << " does not exist";
4504 r = -ENOENT;
4505 goto reply;
4506 }
4507 string format;
f64942e4 4508 cmd_getval_throws(g_ceph_context, cmdmap, "format", format);
7c673cae
FG
4509 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4510 f->open_object_section("osd_location");
4511 f->dump_int("osd", osd);
4512 f->dump_stream("ip") << osdmap.get_addr(osd);
f64942e4 4513 f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
7c673cae
FG
4514 f->open_object_section("crush_location");
4515 map<string,string> loc = osdmap.crush->get_full_location(osd);
4516 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
4517 f->dump_string(p->first.c_str(), p->second);
4518 f->close_section();
4519 f->close_section();
4520 f->flush(rdata);
4521 } else if (prefix == "osd metadata") {
4522 int64_t osd = -1;
4523 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
f64942e4 4524 !cmd_getval_throws(g_ceph_context, cmdmap, "id", osd)) {
7c673cae
FG
4525 ss << "unable to parse osd id value '"
4526 << cmd_vartype_stringify(cmdmap["id"]) << "'";
4527 r = -EINVAL;
4528 goto reply;
4529 }
4530 if (osd >= 0 && !osdmap.exists(osd)) {
4531 ss << "osd." << osd << " does not exist";
4532 r = -ENOENT;
4533 goto reply;
4534 }
4535 string format;
f64942e4 4536 cmd_getval_throws(g_ceph_context, cmdmap, "format", format);
7c673cae
FG
4537 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4538 if (osd >= 0) {
4539 f->open_object_section("osd_metadata");
4540 f->dump_unsigned("id", osd);
4541 r = dump_osd_metadata(osd, f.get(), &ss);
4542 if (r < 0)
4543 goto reply;
4544 f->close_section();
4545 } else {
4546 r = 0;
4547 f->open_array_section("osd_metadata");
4548 for (int i=0; i<osdmap.get_max_osd(); ++i) {
4549 if (osdmap.exists(i)) {
4550 f->open_object_section("osd");
4551 f->dump_unsigned("id", i);
4552 r = dump_osd_metadata(i, f.get(), NULL);
4553 if (r == -EINVAL || r == -ENOENT) {
4554 // Drop error, continue to get other daemons' metadata
4555 dout(4) << "No metadata for osd." << i << dendl;
4556 r = 0;
4557 } else if (r < 0) {
4558 // Unexpected error
4559 goto reply;
4560 }
4561 f->close_section();
4562 }
4563 }
4564 f->close_section();
4565 }
4566 f->flush(rdata);
31f18b77
FG
4567 } else if (prefix == "osd versions") {
4568 if (!f)
4569 f.reset(Formatter::create("json-pretty"));
4570 count_metadata("ceph_version", f.get());
4571 f->flush(rdata);
4572 r = 0;
4573 } else if (prefix == "osd count-metadata") {
4574 if (!f)
4575 f.reset(Formatter::create("json-pretty"));
4576 string field;
f64942e4 4577 cmd_getval_throws(g_ceph_context, cmdmap, "property", field);
31f18b77
FG
4578 count_metadata(field, f.get());
4579 f->flush(rdata);
4580 r = 0;
7c673cae
FG
4581 } else if (prefix == "osd map") {
4582 string poolstr, objstr, namespacestr;
f64942e4
AA
4583 cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
4584 cmd_getval_throws(g_ceph_context, cmdmap, "object", objstr);
4585 cmd_getval_throws(g_ceph_context, cmdmap, "nspace", namespacestr);
7c673cae
FG
4586
4587 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
4588 if (pool < 0) {
4589 ss << "pool " << poolstr << " does not exist";
4590 r = -ENOENT;
4591 goto reply;
4592 }
4593 object_locator_t oloc(pool, namespacestr);
4594 object_t oid(objstr);
4595 pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
4596 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
4597 vector<int> up, acting;
4598 int up_p, acting_p;
4599 osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
4600
4601 string fullobjname;
4602 if (!namespacestr.empty())
4603 fullobjname = namespacestr + string("/") + oid.name;
4604 else
4605 fullobjname = oid.name;
4606 if (f) {
4607 f->open_object_section("osd_map");
4608 f->dump_unsigned("epoch", osdmap.get_epoch());
4609 f->dump_string("pool", poolstr);
4610 f->dump_int("pool_id", pool);
4611 f->dump_stream("objname") << fullobjname;
4612 f->dump_stream("raw_pgid") << pgid;
4613 f->dump_stream("pgid") << mpgid;
4614 f->open_array_section("up");
4615 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
4616 f->dump_int("osd", *p);
4617 f->close_section();
4618 f->dump_int("up_primary", up_p);
4619 f->open_array_section("acting");
4620 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
4621 f->dump_int("osd", *p);
4622 f->close_section();
4623 f->dump_int("acting_primary", acting_p);
4624 f->close_section(); // osd_map
4625 f->flush(rdata);
4626 } else {
4627 ds << "osdmap e" << osdmap.get_epoch()
4628 << " pool '" << poolstr << "' (" << pool << ")"
4629 << " object '" << fullobjname << "' ->"
4630 << " pg " << pgid << " (" << mpgid << ")"
4631 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
4632 << pg_vector_string(acting) << ", p" << acting_p << ")";
4633 rdata.append(ds);
4634 }
4635
4636 } else if (prefix == "pg map") {
4637 pg_t pgid;
4638 string pgidstr;
f64942e4 4639 cmd_getval_throws(g_ceph_context, cmdmap, "pgid", pgidstr);
7c673cae
FG
4640 if (!pgid.parse(pgidstr.c_str())) {
4641 ss << "invalid pgid '" << pgidstr << "'";
4642 r = -EINVAL;
4643 goto reply;
4644 }
4645 vector<int> up, acting;
4646 if (!osdmap.have_pg_pool(pgid.pool())) {
4647 ss << "pg '" << pgidstr << "' does not exist";
4648 r = -ENOENT;
4649 goto reply;
4650 }
4651 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
4652 osdmap.pg_to_up_acting_osds(pgid, up, acting);
4653 if (f) {
4654 f->open_object_section("pg_map");
4655 f->dump_unsigned("epoch", osdmap.get_epoch());
4656 f->dump_stream("raw_pgid") << pgid;
4657 f->dump_stream("pgid") << mpgid;
4658 f->open_array_section("up");
4659 for (auto osd : up) {
4660 f->dump_int("up_osd", osd);
4661 }
4662 f->close_section();
4663 f->open_array_section("acting");
4664 for (auto osd : acting) {
4665 f->dump_int("acting_osd", osd);
4666 }
4667 f->close_section();
4668 f->close_section();
4669 f->flush(rdata);
4670 } else {
4671 ds << "osdmap e" << osdmap.get_epoch()
4672 << " pg " << pgid << " (" << mpgid << ")"
4673 << " -> up " << up << " acting " << acting;
4674 rdata.append(ds);
4675 }
4676 goto reply;
4677
224ce89b
WB
4678 } else if (prefix == "osd scrub" ||
4679 prefix == "osd deep-scrub" ||
4680 prefix == "osd repair") {
7c673cae
FG
4681 string whostr;
4682 cmd_getval(g_ceph_context, cmdmap, "who", whostr);
4683 vector<string> pvec;
4684 get_str_vec(prefix, pvec);
4685
224ce89b 4686 if (whostr == "*" || whostr == "all" || whostr == "any") {
7c673cae
FG
4687 ss << "osds ";
4688 int c = 0;
4689 for (int i = 0; i < osdmap.get_max_osd(); i++)
4690 if (osdmap.is_up(i)) {
4691 ss << (c++ ? "," : "") << i;
4692 mon->try_send_message(new MOSDScrub(osdmap.get_fsid(),
4693 pvec.back() == "repair",
4694 pvec.back() == "deep-scrub"),
4695 osdmap.get_inst(i));
4696 }
4697 r = 0;
4698 ss << " instructed to " << pvec.back();
4699 } else {
4700 long osd = parse_osd_id(whostr.c_str(), &ss);
4701 if (osd < 0) {
4702 r = -EINVAL;
4703 } else if (osdmap.is_up(osd)) {
4704 mon->try_send_message(new MOSDScrub(osdmap.get_fsid(),
4705 pvec.back() == "repair",
4706 pvec.back() == "deep-scrub"),
4707 osdmap.get_inst(osd));
4708 ss << "osd." << osd << " instructed to " << pvec.back();
4709 } else {
4710 ss << "osd." << osd << " is not up";
4711 r = -EAGAIN;
4712 }
4713 }
4714 } else if (prefix == "osd lspools") {
4715 int64_t auid;
f64942e4 4716 cmd_getval_throws(g_ceph_context, cmdmap, "auid", auid, int64_t(0));
7c673cae
FG
4717 if (f)
4718 f->open_array_section("pools");
4719 for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
4720 p != osdmap.pools.end();
4721 ++p) {
4722 if (!auid || p->second.auid == (uint64_t)auid) {
4723 if (f) {
4724 f->open_object_section("pool");
4725 f->dump_int("poolnum", p->first);
4726 f->dump_string("poolname", osdmap.pool_name[p->first]);
4727 f->close_section();
4728 } else {
4729 ds << p->first << ' ' << osdmap.pool_name[p->first] << ',';
4730 }
4731 }
4732 }
4733 if (f) {
4734 f->close_section();
4735 f->flush(ds);
4736 }
4737 rdata.append(ds);
4738 } else if (prefix == "osd blacklist ls") {
4739 if (f)
4740 f->open_array_section("blacklist");
4741
4742 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
4743 p != osdmap.blacklist.end();
4744 ++p) {
4745 if (f) {
4746 f->open_object_section("entry");
4747 f->dump_stream("addr") << p->first;
4748 f->dump_stream("until") << p->second;
4749 f->close_section();
4750 } else {
4751 stringstream ss;
4752 string s;
4753 ss << p->first << " " << p->second;
4754 getline(ss, s);
4755 s += "\n";
4756 rdata.append(s);
4757 }
4758 }
4759 if (f) {
4760 f->close_section();
4761 f->flush(rdata);
4762 }
4763 ss << "listed " << osdmap.blacklist.size() << " entries";
4764
4765 } else if (prefix == "osd pool ls") {
4766 string detail;
f64942e4 4767 cmd_getval_throws(g_ceph_context, cmdmap, "detail", detail);
7c673cae
FG
4768 if (!f && detail == "detail") {
4769 ostringstream ss;
4770 osdmap.print_pools(ss);
4771 rdata.append(ss.str());
4772 } else {
4773 if (f)
4774 f->open_array_section("pools");
4775 for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
4776 it != osdmap.get_pools().end();
4777 ++it) {
4778 if (f) {
4779 if (detail == "detail") {
4780 f->open_object_section("pool");
4781 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
4782 it->second.dump(f.get());
4783 f->close_section();
4784 } else {
4785 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
4786 }
4787 } else {
4788 rdata.append(osdmap.get_pool_name(it->first) + "\n");
4789 }
4790 }
4791 if (f) {
4792 f->close_section();
4793 f->flush(rdata);
4794 }
4795 }
4796
4797 } else if (prefix == "osd crush get-tunable") {
4798 string tunable;
f64942e4 4799 cmd_getval_throws(g_ceph_context, cmdmap, "tunable", tunable);
7c673cae
FG
4800 ostringstream rss;
4801 if (f)
4802 f->open_object_section("tunable");
4803 if (tunable == "straw_calc_version") {
4804 if (f)
4805 f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
4806 else
4807 rss << osdmap.crush->get_straw_calc_version() << "\n";
4808 } else {
4809 r = -EINVAL;
4810 goto reply;
4811 }
4812 if (f) {
4813 f->close_section();
4814 f->flush(rdata);
4815 } else {
4816 rdata.append(rss.str());
4817 }
4818 r = 0;
4819
4820 } else if (prefix == "osd pool get") {
4821 string poolstr;
f64942e4 4822 cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
7c673cae
FG
4823 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
4824 if (pool < 0) {
4825 ss << "unrecognized pool '" << poolstr << "'";
4826 r = -ENOENT;
4827 goto reply;
4828 }
4829
4830 const pg_pool_t *p = osdmap.get_pg_pool(pool);
4831 string var;
f64942e4 4832 cmd_getval_throws(g_ceph_context, cmdmap, "var", var);
7c673cae
FG
4833
4834 typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
4835 const choices_map_t ALL_CHOICES = {
4836 {"size", SIZE},
4837 {"min_size", MIN_SIZE},
4838 {"crash_replay_interval", CRASH_REPLAY_INTERVAL},
4839 {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
28e407b8
AA
4840 {"crush_rule", CRUSH_RULE}, {"hashpspool", HASHPSPOOL},
4841 {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
7c673cae
FG
4842 {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
4843 {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
4844 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
4845 {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
4846 {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
4847 {"use_gmt_hitset", USE_GMT_HITSET},
4848 {"auid", AUID}, {"target_max_objects", TARGET_MAX_OBJECTS},
4849 {"target_max_bytes", TARGET_MAX_BYTES},
4850 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
4851 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
4852 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
4853 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
4854 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
4855 {"erasure_code_profile", ERASURE_CODE_PROFILE},
4856 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
4857 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
4858 {"fast_read", FAST_READ},
4859 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
4860 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
4861 {"scrub_min_interval", SCRUB_MIN_INTERVAL},
4862 {"scrub_max_interval", SCRUB_MAX_INTERVAL},
4863 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
4864 {"recovery_priority", RECOVERY_PRIORITY},
4865 {"recovery_op_priority", RECOVERY_OP_PRIORITY},
4866 {"scrub_priority", SCRUB_PRIORITY},
4867 {"compression_mode", COMPRESSION_MODE},
4868 {"compression_algorithm", COMPRESSION_ALGORITHM},
4869 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
4870 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
4871 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
4872 {"csum_type", CSUM_TYPE},
4873 {"csum_max_block", CSUM_MAX_BLOCK},
4874 {"csum_min_block", CSUM_MIN_BLOCK},
4875 };
4876
4877 typedef std::set<osd_pool_get_choices> choices_set_t;
4878
4879 const choices_set_t ONLY_TIER_CHOICES = {
4880 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
4881 TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
4882 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
4883 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
4884 MIN_READ_RECENCY_FOR_PROMOTE,
c07f9fc5 4885 MIN_WRITE_RECENCY_FOR_PROMOTE,
7c673cae
FG
4886 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
4887 };
4888 const choices_set_t ONLY_ERASURE_CHOICES = {
28e407b8 4889 EC_OVERWRITES, ERASURE_CODE_PROFILE
7c673cae
FG
4890 };
4891
4892 choices_set_t selected_choices;
4893 if (var == "all") {
4894 for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
4895 it != ALL_CHOICES.end(); ++it) {
4896 selected_choices.insert(it->second);
4897 }
4898
4899 if(!p->is_tier()) {
4900 selected_choices = subtract_second_from_first(selected_choices,
4901 ONLY_TIER_CHOICES);
4902 }
4903
4904 if(!p->is_erasure()) {
4905 selected_choices = subtract_second_from_first(selected_choices,
4906 ONLY_ERASURE_CHOICES);
4907 }
4908 } else /* var != "all" */ {
4909 choices_map_t::const_iterator found = ALL_CHOICES.find(var);
4910 osd_pool_get_choices selected = found->second;
4911
4912 if (!p->is_tier() &&
4913 ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
4914 ss << "pool '" << poolstr
4915 << "' is not a tier pool: variable not applicable";
4916 r = -EACCES;
4917 goto reply;
4918 }
4919
4920 if (!p->is_erasure() &&
4921 ONLY_ERASURE_CHOICES.find(selected)
4922 != ONLY_ERASURE_CHOICES.end()) {
4923 ss << "pool '" << poolstr
4924 << "' is not a erasure pool: variable not applicable";
4925 r = -EACCES;
4926 goto reply;
4927 }
4928
94b18763
FG
4929 if (pool_opts_t::is_opt_name(var) &&
4930 !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
4931 ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
4932 r = -ENOENT;
4933 goto reply;
4934 }
4935
7c673cae
FG
4936 selected_choices.insert(selected);
4937 }
4938
4939 if (f) {
94b18763
FG
4940 f->open_object_section("pool");
4941 f->dump_string("pool", poolstr);
4942 f->dump_int("pool_id", pool);
7c673cae
FG
4943 for(choices_set_t::const_iterator it = selected_choices.begin();
4944 it != selected_choices.end(); ++it) {
4945 choices_map_t::const_iterator i;
c07f9fc5
FG
4946 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
4947 if (i->second == *it) {
4948 break;
4949 }
4950 }
4951 assert(i != ALL_CHOICES.end());
7c673cae
FG
4952 switch(*it) {
4953 case PG_NUM:
4954 f->dump_int("pg_num", p->get_pg_num());
4955 break;
4956 case PGP_NUM:
4957 f->dump_int("pgp_num", p->get_pgp_num());
4958 break;
4959 case AUID:
4960 f->dump_int("auid", p->get_auid());
4961 break;
4962 case SIZE:
4963 f->dump_int("size", p->get_size());
4964 break;
4965 case MIN_SIZE:
4966 f->dump_int("min_size", p->get_min_size());
4967 break;
4968 case CRASH_REPLAY_INTERVAL:
4969 f->dump_int("crash_replay_interval",
4970 p->get_crash_replay_interval());
4971 break;
4972 case CRUSH_RULE:
31f18b77 4973 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
7c673cae 4974 f->dump_string("crush_rule", osdmap.crush->get_rule_name(
31f18b77 4975 p->get_crush_rule()));
7c673cae 4976 } else {
31f18b77 4977 f->dump_string("crush_rule", stringify(p->get_crush_rule()));
7c673cae
FG
4978 }
4979 break;
28e407b8
AA
4980 case EC_OVERWRITES:
4981 f->dump_bool("allow_ec_overwrites",
4982 p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
4983 break;
7c673cae
FG
4984 case HASHPSPOOL:
4985 case NODELETE:
4986 case NOPGCHANGE:
4987 case NOSIZECHANGE:
4988 case WRITE_FADVISE_DONTNEED:
4989 case NOSCRUB:
4990 case NODEEP_SCRUB:
94b18763
FG
4991 f->dump_bool(i->first.c_str(),
4992 p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
7c673cae
FG
4993 break;
4994 case HIT_SET_PERIOD:
4995 f->dump_int("hit_set_period", p->hit_set_period);
4996 break;
4997 case HIT_SET_COUNT:
4998 f->dump_int("hit_set_count", p->hit_set_count);
4999 break;
5000 case HIT_SET_TYPE:
5001 f->dump_string("hit_set_type",
5002 HitSet::get_type_name(p->hit_set_params.get_type()));
5003 break;
5004 case HIT_SET_FPP:
5005 {
5006 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
5007 BloomHitSet::Params *bloomp =
5008 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
5009 f->dump_float("hit_set_fpp", bloomp->get_fpp());
5010 } else if(var != "all") {
5011 f->close_section();
5012 ss << "hit set is not of type Bloom; " <<
5013 "invalid to get a false positive rate!";
5014 r = -EINVAL;
5015 goto reply;
5016 }
5017 }
5018 break;
5019 case USE_GMT_HITSET:
5020 f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
5021 break;
5022 case TARGET_MAX_OBJECTS:
5023 f->dump_unsigned("target_max_objects", p->target_max_objects);
5024 break;
5025 case TARGET_MAX_BYTES:
5026 f->dump_unsigned("target_max_bytes", p->target_max_bytes);
5027 break;
5028 case CACHE_TARGET_DIRTY_RATIO:
5029 f->dump_unsigned("cache_target_dirty_ratio_micro",
5030 p->cache_target_dirty_ratio_micro);
5031 f->dump_float("cache_target_dirty_ratio",
5032 ((float)p->cache_target_dirty_ratio_micro/1000000));
5033 break;
5034 case CACHE_TARGET_DIRTY_HIGH_RATIO:
5035 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
5036 p->cache_target_dirty_high_ratio_micro);
5037 f->dump_float("cache_target_dirty_high_ratio",
5038 ((float)p->cache_target_dirty_high_ratio_micro/1000000));
5039 break;
5040 case CACHE_TARGET_FULL_RATIO:
5041 f->dump_unsigned("cache_target_full_ratio_micro",
5042 p->cache_target_full_ratio_micro);
5043 f->dump_float("cache_target_full_ratio",
5044 ((float)p->cache_target_full_ratio_micro/1000000));
5045 break;
5046 case CACHE_MIN_FLUSH_AGE:
5047 f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
5048 break;
5049 case CACHE_MIN_EVICT_AGE:
5050 f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
5051 break;
5052 case ERASURE_CODE_PROFILE:
5053 f->dump_string("erasure_code_profile", p->erasure_code_profile);
5054 break;
5055 case MIN_READ_RECENCY_FOR_PROMOTE:
5056 f->dump_int("min_read_recency_for_promote",
5057 p->min_read_recency_for_promote);
5058 break;
5059 case MIN_WRITE_RECENCY_FOR_PROMOTE:
5060 f->dump_int("min_write_recency_for_promote",
5061 p->min_write_recency_for_promote);
5062 break;
5063 case FAST_READ:
5064 f->dump_int("fast_read", p->fast_read);
5065 break;
5066 case HIT_SET_GRADE_DECAY_RATE:
5067 f->dump_int("hit_set_grade_decay_rate",
5068 p->hit_set_grade_decay_rate);
5069 break;
5070 case HIT_SET_SEARCH_LAST_N:
5071 f->dump_int("hit_set_search_last_n",
5072 p->hit_set_search_last_n);
5073 break;
5074 case SCRUB_MIN_INTERVAL:
5075 case SCRUB_MAX_INTERVAL:
5076 case DEEP_SCRUB_INTERVAL:
5077 case RECOVERY_PRIORITY:
5078 case RECOVERY_OP_PRIORITY:
5079 case SCRUB_PRIORITY:
5080 case COMPRESSION_MODE:
5081 case COMPRESSION_ALGORITHM:
5082 case COMPRESSION_REQUIRED_RATIO:
5083 case COMPRESSION_MAX_BLOB_SIZE:
5084 case COMPRESSION_MIN_BLOB_SIZE:
5085 case CSUM_TYPE:
5086 case CSUM_MAX_BLOCK:
5087 case CSUM_MIN_BLOCK:
c07f9fc5
FG
5088 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
5089 if (p->opts.is_set(key)) {
c07f9fc5
FG
5090 if(*it == CSUM_TYPE) {
5091 int val;
5092 p->opts.get(pool_opts_t::CSUM_TYPE, &val);
5093 f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
5094 } else {
5095 p->opts.dump(i->first, f.get());
5096 }
94b18763 5097 }
7c673cae
FG
5098 break;
5099 }
7c673cae 5100 }
94b18763
FG
5101 f->close_section();
5102 f->flush(rdata);
7c673cae
FG
5103 } else /* !f */ {
5104 for(choices_set_t::const_iterator it = selected_choices.begin();
5105 it != selected_choices.end(); ++it) {
5106 choices_map_t::const_iterator i;
5107 switch(*it) {
5108 case PG_NUM:
5109 ss << "pg_num: " << p->get_pg_num() << "\n";
5110 break;
5111 case PGP_NUM:
5112 ss << "pgp_num: " << p->get_pgp_num() << "\n";
5113 break;
5114 case AUID:
5115 ss << "auid: " << p->get_auid() << "\n";
5116 break;
5117 case SIZE:
5118 ss << "size: " << p->get_size() << "\n";
5119 break;
5120 case MIN_SIZE:
5121 ss << "min_size: " << p->get_min_size() << "\n";
5122 break;
5123 case CRASH_REPLAY_INTERVAL:
5124 ss << "crash_replay_interval: " <<
5125 p->get_crash_replay_interval() << "\n";
5126 break;
5127 case CRUSH_RULE:
31f18b77 5128 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
7c673cae 5129 ss << "crush_rule: " << osdmap.crush->get_rule_name(
31f18b77 5130 p->get_crush_rule()) << "\n";
7c673cae 5131 } else {
31f18b77 5132 ss << "crush_rule: " << p->get_crush_rule() << "\n";
7c673cae
FG
5133 }
5134 break;
7c673cae
FG
5135 case HIT_SET_PERIOD:
5136 ss << "hit_set_period: " << p->hit_set_period << "\n";
5137 break;
5138 case HIT_SET_COUNT:
5139 ss << "hit_set_count: " << p->hit_set_count << "\n";
5140 break;
5141 case HIT_SET_TYPE:
5142 ss << "hit_set_type: " <<
5143 HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
5144 break;
5145 case HIT_SET_FPP:
5146 {
5147 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
5148 BloomHitSet::Params *bloomp =
5149 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
5150 ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
5151 } else if(var != "all") {
5152 ss << "hit set is not of type Bloom; " <<
5153 "invalid to get a false positive rate!";
5154 r = -EINVAL;
5155 goto reply;
5156 }
5157 }
5158 break;
5159 case USE_GMT_HITSET:
5160 ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
5161 break;
5162 case TARGET_MAX_OBJECTS:
5163 ss << "target_max_objects: " << p->target_max_objects << "\n";
5164 break;
5165 case TARGET_MAX_BYTES:
5166 ss << "target_max_bytes: " << p->target_max_bytes << "\n";
5167 break;
5168 case CACHE_TARGET_DIRTY_RATIO:
5169 ss << "cache_target_dirty_ratio: "
5170 << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
5171 break;
5172 case CACHE_TARGET_DIRTY_HIGH_RATIO:
5173 ss << "cache_target_dirty_high_ratio: "
5174 << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
5175 break;
5176 case CACHE_TARGET_FULL_RATIO:
5177 ss << "cache_target_full_ratio: "
5178 << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
5179 break;
5180 case CACHE_MIN_FLUSH_AGE:
5181 ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
5182 break;
5183 case CACHE_MIN_EVICT_AGE:
5184 ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
5185 break;
5186 case ERASURE_CODE_PROFILE:
5187 ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
5188 break;
5189 case MIN_READ_RECENCY_FOR_PROMOTE:
5190 ss << "min_read_recency_for_promote: " <<
5191 p->min_read_recency_for_promote << "\n";
5192 break;
5193 case HIT_SET_GRADE_DECAY_RATE:
5194 ss << "hit_set_grade_decay_rate: " <<
5195 p->hit_set_grade_decay_rate << "\n";
5196 break;
5197 case HIT_SET_SEARCH_LAST_N:
5198 ss << "hit_set_search_last_n: " <<
5199 p->hit_set_search_last_n << "\n";
5200 break;
28e407b8
AA
5201 case EC_OVERWRITES:
5202 ss << "allow_ec_overwrites: " <<
5203 (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
5204 "\n";
5205 break;
7c673cae
FG
5206 case HASHPSPOOL:
5207 case NODELETE:
5208 case NOPGCHANGE:
5209 case NOSIZECHANGE:
5210 case WRITE_FADVISE_DONTNEED:
5211 case NOSCRUB:
5212 case NODEEP_SCRUB:
5213 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
5214 if (i->second == *it)
5215 break;
5216 }
5217 assert(i != ALL_CHOICES.end());
5218 ss << i->first << ": " <<
5219 (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
5220 "true" : "false") << "\n";
5221 break;
5222 case MIN_WRITE_RECENCY_FOR_PROMOTE:
5223 ss << "min_write_recency_for_promote: " <<
5224 p->min_write_recency_for_promote << "\n";
5225 break;
5226 case FAST_READ:
5227 ss << "fast_read: " << p->fast_read << "\n";
5228 break;
5229 case SCRUB_MIN_INTERVAL:
5230 case SCRUB_MAX_INTERVAL:
5231 case DEEP_SCRUB_INTERVAL:
5232 case RECOVERY_PRIORITY:
5233 case RECOVERY_OP_PRIORITY:
5234 case SCRUB_PRIORITY:
5235 case COMPRESSION_MODE:
5236 case COMPRESSION_ALGORITHM:
5237 case COMPRESSION_REQUIRED_RATIO:
5238 case COMPRESSION_MAX_BLOB_SIZE:
5239 case COMPRESSION_MIN_BLOB_SIZE:
5240 case CSUM_TYPE:
5241 case CSUM_MAX_BLOCK:
5242 case CSUM_MIN_BLOCK:
5243 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
5244 if (i->second == *it)
5245 break;
5246 }
5247 assert(i != ALL_CHOICES.end());
5248 {
5249 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
5250 if (p->opts.is_set(key)) {
5251 if(key == pool_opts_t::CSUM_TYPE) {
5252 int val;
5253 p->opts.get(key, &val);
5254 ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
5255 } else {
5256 ss << i->first << ": " << p->opts.get(key) << "\n";
5257 }
5258 }
5259 }
5260 break;
5261 }
5262 rdata.append(ss.str());
5263 ss.str("");
5264 }
5265 }
5266 r = 0;
5267 } else if (prefix == "osd pool stats") {
31f18b77
FG
5268 r = mon->pgservice->process_pg_command(prefix, cmdmap,
5269 osdmap, f.get(), &ss, &rdata);
7c673cae
FG
5270 } else if (prefix == "osd pool get-quota") {
5271 string pool_name;
f64942e4 5272 cmd_getval_throws(g_ceph_context, cmdmap, "pool", pool_name);
7c673cae
FG
5273
5274 int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
5275 if (poolid < 0) {
5276 assert(poolid == -ENOENT);
5277 ss << "unrecognized pool '" << pool_name << "'";
5278 r = -ENOENT;
5279 goto reply;
5280 }
5281 const pg_pool_t *p = osdmap.get_pg_pool(poolid);
5282
5283 if (f) {
5284 f->open_object_section("pool_quotas");
5285 f->dump_string("pool_name", pool_name);
5286 f->dump_unsigned("pool_id", poolid);
5287 f->dump_unsigned("quota_max_objects", p->quota_max_objects);
5288 f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
5289 f->close_section();
5290 f->flush(rdata);
5291 } else {
5292 stringstream rs;
5293 rs << "quotas for pool '" << pool_name << "':\n"
5294 << " max objects: ";
5295 if (p->quota_max_objects == 0)
5296 rs << "N/A";
5297 else
1adf2230 5298 rs << si_u_t(p->quota_max_objects) << " objects";
7c673cae
FG
5299 rs << "\n"
5300 << " max bytes : ";
5301 if (p->quota_max_bytes == 0)
5302 rs << "N/A";
5303 else
1adf2230 5304 rs << byte_u_t(p->quota_max_bytes);
7c673cae
FG
5305 rdata.append(rs.str());
5306 }
5307 rdata.append("\n");
5308 r = 0;
5309 } else if (prefix == "osd crush rule list" ||
5310 prefix == "osd crush rule ls") {
c07f9fc5
FG
5311 if (f) {
5312 f->open_array_section("rules");
5313 osdmap.crush->list_rules(f.get());
5314 f->close_section();
5315 f->flush(rdata);
5316 } else {
5317 ostringstream ss;
5318 osdmap.crush->list_rules(&ss);
5319 rdata.append(ss.str());
5320 }
b5b8bbf5
FG
5321 } else if (prefix == "osd crush rule ls-by-class") {
5322 string class_name;
f64942e4 5323 cmd_getval_throws(g_ceph_context, cmdmap, "class", class_name);
b5b8bbf5
FG
5324 if (class_name.empty()) {
5325 ss << "no class specified";
5326 r = -EINVAL;
5327 goto reply;
5328 }
5329 set<int> rules;
5330 r = osdmap.crush->get_rules_by_class(class_name, &rules);
5331 if (r < 0) {
5332 ss << "failed to get rules by class '" << class_name << "'";
5333 goto reply;
5334 }
5335 if (f) {
5336 f->open_array_section("rules");
5337 for (auto &rule: rules) {
5338 f->dump_string("name", osdmap.crush->get_rule_name(rule));
5339 }
5340 f->close_section();
5341 f->flush(rdata);
5342 } else {
5343 ostringstream rs;
5344 for (auto &rule: rules) {
5345 rs << osdmap.crush->get_rule_name(rule) << "\n";
5346 }
5347 rdata.append(rs.str());
5348 }
7c673cae
FG
5349 } else if (prefix == "osd crush rule dump") {
5350 string name;
f64942e4 5351 cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
7c673cae 5352 string format;
f64942e4 5353 cmd_getval_throws(g_ceph_context, cmdmap, "format", format);
7c673cae
FG
5354 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5355 if (name == "") {
5356 f->open_array_section("rules");
5357 osdmap.crush->dump_rules(f.get());
5358 f->close_section();
5359 } else {
5360 int ruleno = osdmap.crush->get_rule_id(name);
5361 if (ruleno < 0) {
31f18b77 5362 ss << "unknown crush rule '" << name << "'";
7c673cae
FG
5363 r = ruleno;
5364 goto reply;
5365 }
5366 osdmap.crush->dump_rule(ruleno, f.get());
5367 }
5368 ostringstream rs;
5369 f->flush(rs);
5370 rs << "\n";
5371 rdata.append(rs.str());
5372 } else if (prefix == "osd crush dump") {
5373 string format;
f64942e4 5374 cmd_getval_throws(g_ceph_context, cmdmap, "format", format);
7c673cae
FG
5375 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5376 f->open_object_section("crush_map");
5377 osdmap.crush->dump(f.get());
5378 f->close_section();
5379 ostringstream rs;
5380 f->flush(rs);
5381 rs << "\n";
5382 rdata.append(rs.str());
5383 } else if (prefix == "osd crush show-tunables") {
5384 string format;
f64942e4 5385 cmd_getval_throws(g_ceph_context, cmdmap, "format", format);
7c673cae
FG
5386 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5387 f->open_object_section("crush_map_tunables");
5388 osdmap.crush->dump_tunables(f.get());
5389 f->close_section();
5390 ostringstream rs;
5391 f->flush(rs);
5392 rs << "\n";
5393 rdata.append(rs.str());
5394 } else if (prefix == "osd crush tree") {
c07f9fc5 5395 string shadow;
f64942e4 5396 cmd_getval_throws(g_ceph_context, cmdmap, "shadow", shadow);
c07f9fc5
FG
5397 bool show_shadow = shadow == "--show-shadow";
5398 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5399 if (f) {
91327a77 5400 f->open_object_section("crush_tree");
c07f9fc5
FG
5401 osdmap.crush->dump_tree(nullptr,
5402 f.get(),
5403 osdmap.get_pool_names(),
5404 show_shadow);
91327a77 5405 f->close_section();
c07f9fc5
FG
5406 f->flush(rdata);
5407 } else {
5408 ostringstream ss;
5409 osdmap.crush->dump_tree(&ss,
5410 nullptr,
5411 osdmap.get_pool_names(),
5412 show_shadow);
5413 rdata.append(ss.str());
5414 }
d2e6a577
FG
5415 } else if (prefix == "osd crush ls") {
5416 string name;
f64942e4 5417 if (!cmd_getval_throws(g_ceph_context, cmdmap, "node", name)) {
d2e6a577
FG
5418 ss << "no node specified";
5419 r = -EINVAL;
5420 goto reply;
5421 }
5422 if (!osdmap.crush->name_exists(name)) {
5423 ss << "node '" << name << "' does not exist";
5424 r = -ENOENT;
5425 goto reply;
5426 }
5427 int id = osdmap.crush->get_item_id(name);
5428 list<int> result;
5429 if (id >= 0) {
5430 result.push_back(id);
5431 } else {
5432 int num = osdmap.crush->get_bucket_size(id);
5433 for (int i = 0; i < num; ++i) {
5434 result.push_back(osdmap.crush->get_bucket_item(id, i));
5435 }
5436 }
5437 if (f) {
5438 f->open_array_section("items");
5439 for (auto i : result) {
5440 f->dump_string("item", osdmap.crush->get_item_name(i));
5441 }
5442 f->close_section();
5443 f->flush(rdata);
5444 } else {
5445 ostringstream ss;
5446 for (auto i : result) {
5447 ss << osdmap.crush->get_item_name(i) << "\n";
5448 }
5449 rdata.append(ss.str());
5450 }
5451 r = 0;
7c673cae
FG
5452 } else if (prefix == "osd crush class ls") {
5453 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5454 f->open_array_section("crush_classes");
5455 for (auto i : osdmap.crush->class_name)
5456 f->dump_string("class", i.second);
5457 f->close_section();
5458 f->flush(rdata);
224ce89b
WB
5459 } else if (prefix == "osd crush class ls-osd") {
5460 string name;
f64942e4 5461 cmd_getval_throws(g_ceph_context, cmdmap, "class", name);
224ce89b
WB
5462 set<int> osds;
5463 osdmap.crush->get_devices_by_class(name, &osds);
b5b8bbf5
FG
5464 if (f) {
5465 f->open_array_section("osds");
5466 for (auto &osd: osds)
5467 f->dump_int("osd", osd);
5468 f->close_section();
5469 f->flush(rdata);
5470 } else {
5471 bool first = true;
5472 for (auto &osd : osds) {
5473 if (!first)
5474 ds << "\n";
5475 first = false;
5476 ds << osd;
5477 }
5478 rdata.append(ds);
5479 }
7c673cae
FG
5480 } else if (prefix == "osd erasure-code-profile ls") {
5481 const auto &profiles = osdmap.get_erasure_code_profiles();
5482 if (f)
5483 f->open_array_section("erasure-code-profiles");
5484 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
5485 if (f)
5486 f->dump_string("profile", i->first.c_str());
5487 else
5488 rdata.append(i->first + "\n");
5489 }
5490 if (f) {
5491 f->close_section();
5492 ostringstream rs;
5493 f->flush(rs);
5494 rs << "\n";
5495 rdata.append(rs.str());
5496 }
c07f9fc5
FG
5497 } else if (prefix == "osd crush weight-set ls") {
5498 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5499 if (f) {
5500 f->open_array_section("weight_sets");
5501 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
5502 f->dump_string("pool", "(compat)");
5503 }
5504 for (auto& i : osdmap.crush->choose_args) {
5505 if (i.first >= 0) {
5506 f->dump_string("pool", osdmap.get_pool_name(i.first));
5507 }
5508 }
5509 f->close_section();
5510 f->flush(rdata);
5511 } else {
5512 ostringstream rs;
5513 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
5514 rs << "(compat)\n";
5515 }
5516 for (auto& i : osdmap.crush->choose_args) {
5517 if (i.first >= 0) {
5518 rs << osdmap.get_pool_name(i.first) << "\n";
5519 }
5520 }
5521 rdata.append(rs.str());
5522 }
5523 } else if (prefix == "osd crush weight-set dump") {
5524 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
5525 "json-pretty"));
5526 osdmap.crush->dump_choose_args(f.get());
5527 f->flush(rdata);
7c673cae
FG
5528 } else if (prefix == "osd erasure-code-profile get") {
5529 string name;
f64942e4 5530 cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
7c673cae
FG
5531 if (!osdmap.has_erasure_code_profile(name)) {
5532 ss << "unknown erasure code profile '" << name << "'";
5533 r = -ENOENT;
5534 goto reply;
5535 }
5536 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
5537 if (f)
5538 f->open_object_section("profile");
5539 for (map<string,string>::const_iterator i = profile.begin();
5540 i != profile.end();
5541 ++i) {
5542 if (f)
5543 f->dump_string(i->first.c_str(), i->second.c_str());
5544 else
5545 rdata.append(i->first + "=" + i->second + "\n");
5546 }
5547 if (f) {
5548 f->close_section();
5549 ostringstream rs;
5550 f->flush(rs);
5551 rs << "\n";
5552 rdata.append(rs.str());
5553 }
181888fb
FG
5554 } else if (prefix == "osd pool application get") {
5555 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
5556 "json-pretty"));
5557 string pool_name;
f64942e4 5558 cmd_getval_throws(g_ceph_context, cmdmap, "pool", pool_name);
181888fb 5559 string app;
f64942e4 5560 cmd_getval_throws(g_ceph_context, cmdmap, "app", app);
181888fb 5561 string key;
f64942e4 5562 cmd_getval_throws(g_ceph_context, cmdmap, "key", key);
181888fb
FG
5563
5564 if (pool_name.empty()) {
5565 // all
5566 f->open_object_section("pools");
5567 for (const auto &pool : osdmap.pools) {
5568 std::string name("<unknown>");
5569 const auto &pni = osdmap.pool_name.find(pool.first);
5570 if (pni != osdmap.pool_name.end())
5571 name = pni->second;
5572 f->open_object_section(name.c_str());
5573 for (auto &app_pair : pool.second.application_metadata) {
5574 f->open_object_section(app_pair.first.c_str());
5575 for (auto &kv_pair : app_pair.second) {
5576 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
5577 }
5578 f->close_section();
5579 }
5580 f->close_section(); // name
5581 }
5582 f->close_section(); // pools
5583 f->flush(rdata);
5584 } else {
5585 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
5586 if (pool < 0) {
5587 ss << "unrecognized pool '" << pool_name << "'";
5588 r = -ENOENT;
5589 goto reply;
5590 }
5591 auto p = osdmap.get_pg_pool(pool);
5592 // filter by pool
5593 if (app.empty()) {
5594 f->open_object_section(pool_name.c_str());
5595 for (auto &app_pair : p->application_metadata) {
5596 f->open_object_section(app_pair.first.c_str());
5597 for (auto &kv_pair : app_pair.second) {
5598 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
5599 }
5600 f->close_section(); // application
5601 }
5602 f->close_section(); // pool_name
5603 f->flush(rdata);
5604 goto reply;
5605 }
5606
5607 auto app_it = p->application_metadata.find(app);
5608 if (app_it == p->application_metadata.end()) {
5609 ss << "pool '" << pool_name << "' has no application '" << app << "'";
5610 r = -ENOENT;
5611 goto reply;
5612 }
5613 // filter by pool + app
5614 if (key.empty()) {
5615 f->open_object_section(app_it->first.c_str());
5616 for (auto &kv_pair : app_it->second) {
5617 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
5618 }
5619 f->close_section(); // application
5620 f->flush(rdata);
5621 goto reply;
5622 }
5623 // filter by pool + app + key
5624 auto key_it = app_it->second.find(key);
5625 if (key_it == app_it->second.end()) {
5626 ss << "application '" << app << "' on pool '" << pool_name
5627 << "' does not have key '" << key << "'";
5628 r = -ENOENT;
5629 goto reply;
5630 }
5631 ss << key_it->second << "\n";
5632 rdata.append(ss.str());
5633 ss.str("");
5634 }
7c673cae
FG
5635 } else {
5636 // try prepare update
5637 return false;
5638 }
5639
5640 reply:
5641 string rs;
5642 getline(ss, rs);
5643 mon->reply_command(op, r, rs, rdata, get_last_committed());
5644 return true;
5645}
5646
3efd9988
FG
5647void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
5648{
5649 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
5650 osdmap.get_pg_pool(pool_id));
5651 assert(pool);
5652 pool->set_flag(flags);
5653}
5654
5655void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
7c673cae 5656{
3efd9988
FG
5657 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
5658 osdmap.get_pg_pool(pool_id));
5659 assert(pool);
5660 pool->unset_flag(flags);
7c673cae
FG
5661}
5662
5663bool OSDMonitor::update_pools_status()
5664{
31f18b77 5665 if (!mon->pgservice->is_readable())
7c673cae
FG
5666 return false;
5667
5668 bool ret = false;
5669
5670 auto& pools = osdmap.get_pools();
5671 for (auto it = pools.begin(); it != pools.end(); ++it) {
31f18b77
FG
5672 const pool_stat_t *pstat = mon->pgservice->get_pool_stat(it->first);
5673 if (!pstat)
7c673cae 5674 continue;
31f18b77 5675 const object_stat_sum_t& sum = pstat->stats.sum;
7c673cae
FG
5676 const pg_pool_t &pool = it->second;
5677 const string& pool_name = osdmap.get_pool_name(it->first);
5678
5679 bool pool_is_full =
5680 (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
5681 (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);
5682
3efd9988 5683 if (pool.has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
7c673cae
FG
5684 if (pool_is_full)
5685 continue;
5686
5687 mon->clog->info() << "pool '" << pool_name
3efd9988
FG
5688 << "' no longer out of quota; removing NO_QUOTA flag";
5689 // below we cancel FLAG_FULL too, we'll set it again in
5690 // OSDMonitor::encode_pending if it still fails the osd-full checking.
5691 clear_pool_flags(it->first,
5692 pg_pool_t::FLAG_FULL_NO_QUOTA | pg_pool_t::FLAG_FULL);
7c673cae
FG
5693 ret = true;
5694 } else {
5695 if (!pool_is_full)
5696 continue;
5697
5698 if (pool.quota_max_bytes > 0 &&
5699 (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
5700 mon->clog->warn() << "pool '" << pool_name << "' is full"
5701 << " (reached quota's max_bytes: "
1adf2230 5702 << byte_u_t(pool.quota_max_bytes) << ")";
7c673cae
FG
5703 }
5704 if (pool.quota_max_objects > 0 &&
5705 (uint64_t)sum.num_objects >= pool.quota_max_objects) {
5706 mon->clog->warn() << "pool '" << pool_name << "' is full"
5707 << " (reached quota's max_objects: "
5708 << pool.quota_max_objects << ")";
5709 }
3efd9988
FG
5710 // set both FLAG_FULL_NO_QUOTA and FLAG_FULL
5711 // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
5712 // since FLAG_FULL should always take precedence
5713 set_pool_flags(it->first,
5714 pg_pool_t::FLAG_FULL_NO_QUOTA | pg_pool_t::FLAG_FULL);
5715 clear_pool_flags(it->first,
5716 pg_pool_t::FLAG_NEARFULL |
5717 pg_pool_t::FLAG_BACKFILLFULL);
7c673cae
FG
5718 ret = true;
5719 }
5720 }
5721 return ret;
5722}
5723
7c673cae
FG
5724int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
5725{
5726 op->mark_osdmon_event(__func__);
5727 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
5728 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
5729 MonSession *session = m->get_session();
5730 if (!session)
5731 return -EPERM;
5732 string erasure_code_profile;
5733 stringstream ss;
31f18b77 5734 string rule_name;
94b18763 5735 int ret = 0;
7c673cae 5736 if (m->auid)
94b18763 5737 ret = prepare_new_pool(m->name, m->auid, m->crush_rule, rule_name,
7c673cae
FG
5738 0, 0,
5739 erasure_code_profile,
5740 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, &ss);
5741 else
94b18763 5742 ret = prepare_new_pool(m->name, session->auid, m->crush_rule, rule_name,
7c673cae
FG
5743 0, 0,
5744 erasure_code_profile,
5745 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, &ss);
94b18763
FG
5746
5747 if (ret < 0) {
5748 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
5749 }
5750 return ret;
7c673cae
FG
5751}
5752
5753int OSDMonitor::crush_rename_bucket(const string& srcname,
5754 const string& dstname,
5755 ostream *ss)
5756{
5757 int ret;
5758 //
5759 // Avoid creating a pending crush if it does not already exists and
5760 // the rename would fail.
5761 //
5762 if (!_have_pending_crush()) {
5763 ret = _get_stable_crush().can_rename_bucket(srcname,
5764 dstname,
5765 ss);
5766 if (ret)
5767 return ret;
5768 }
5769
5770 CrushWrapper newcrush;
5771 _get_pending_crush(newcrush);
5772
5773 ret = newcrush.rename_bucket(srcname,
5774 dstname,
5775 ss);
5776 if (ret)
5777 return ret;
5778
5779 pending_inc.crush.clear();
5780 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
5781 *ss << "renamed bucket " << srcname << " into " << dstname;
5782 return 0;
5783}
5784
5785void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
5786{
5787 string replacement = "";
5788
5789 if (plugin == "jerasure_generic" ||
5790 plugin == "jerasure_sse3" ||
5791 plugin == "jerasure_sse4" ||
5792 plugin == "jerasure_neon") {
5793 replacement = "jerasure";
5794 } else if (plugin == "shec_generic" ||
5795 plugin == "shec_sse3" ||
5796 plugin == "shec_sse4" ||
5797 plugin == "shec_neon") {
5798 replacement = "shec";
5799 }
5800
5801 if (replacement != "") {
5802 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
5803 << plugin << " that has been deprecated. Please use "
5804 << replacement << " instead." << dendl;
5805 }
5806}
5807
5808int OSDMonitor::normalize_profile(const string& profilename,
5809 ErasureCodeProfile &profile,
5810 bool force,
5811 ostream *ss)
5812{
5813 ErasureCodeInterfaceRef erasure_code;
5814 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
5815 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
5816 check_legacy_ec_plugin(plugin->second, profilename);
5817 int err = instance.factory(plugin->second,
5818 g_conf->get_val<std::string>("erasure_code_dir"),
5819 profile, &erasure_code, ss);
5820 if (err) {
5821 return err;
5822 }
5823
5824 err = erasure_code->init(profile, ss);
5825 if (err) {
5826 return err;
5827 }
5828
5829 auto it = profile.find("stripe_unit");
5830 if (it != profile.end()) {
5831 string err_str;
1adf2230 5832 uint32_t stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
7c673cae
FG
5833 if (!err_str.empty()) {
5834 *ss << "could not parse stripe_unit '" << it->second
5835 << "': " << err_str << std::endl;
5836 return -EINVAL;
5837 }
5838 uint32_t data_chunks = erasure_code->get_data_chunk_count();
5839 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
5840 if (chunk_size != stripe_unit) {
5841 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
5842 << "alignment. Would be padded to " << chunk_size
5843 << std::endl;
5844 return -EINVAL;
5845 }
5846 if ((stripe_unit % 4096) != 0 && !force) {
5847 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
5848 << "use --force to override this check" << std::endl;
5849 return -EINVAL;
5850 }
5851 }
5852 return 0;
5853}
5854
31f18b77 5855int OSDMonitor::crush_rule_create_erasure(const string &name,
7c673cae 5856 const string &profile,
31f18b77 5857 int *rule,
7c673cae
FG
5858 ostream *ss)
5859{
5860 int ruleid = osdmap.crush->get_rule_id(name);
5861 if (ruleid != -ENOENT) {
31f18b77 5862 *rule = osdmap.crush->get_rule_mask_ruleset(ruleid);
7c673cae
FG
5863 return -EEXIST;
5864 }
5865
5866 CrushWrapper newcrush;
5867 _get_pending_crush(newcrush);
5868
5869 ruleid = newcrush.get_rule_id(name);
5870 if (ruleid != -ENOENT) {
31f18b77 5871 *rule = newcrush.get_rule_mask_ruleset(ruleid);
7c673cae
FG
5872 return -EALREADY;
5873 } else {
5874 ErasureCodeInterfaceRef erasure_code;
5875 int err = get_erasure_code(profile, &erasure_code, ss);
5876 if (err) {
5877 *ss << "failed to load plugin using profile " << profile << std::endl;
5878 return err;
5879 }
5880
224ce89b 5881 err = erasure_code->create_rule(name, newcrush, ss);
7c673cae
FG
5882 erasure_code.reset();
5883 if (err < 0)
5884 return err;
31f18b77 5885 *rule = err;
7c673cae
FG
5886 pending_inc.crush.clear();
5887 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
5888 return 0;
5889 }
5890}
5891
5892int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
5893 ErasureCodeInterfaceRef *erasure_code,
5894 ostream *ss) const
5895{
5896 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
5897 return -EAGAIN;
5898 ErasureCodeProfile profile =
5899 osdmap.get_erasure_code_profile(erasure_code_profile);
5900 ErasureCodeProfile::const_iterator plugin =
5901 profile.find("plugin");
5902 if (plugin == profile.end()) {
5903 *ss << "cannot determine the erasure code plugin"
5904 << " because there is no 'plugin' entry in the erasure_code_profile "
5905 << profile << std::endl;
5906 return -EINVAL;
5907 }
5908 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
5909 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
5910 return instance.factory(plugin->second,
5911 g_conf->get_val<std::string>("erasure_code_dir"),
5912 profile, erasure_code, ss);
5913}
5914
5915int OSDMonitor::check_cluster_features(uint64_t features,
5916 stringstream &ss)
5917{
5918 stringstream unsupported_ss;
5919 int unsupported_count = 0;
5920 if ((mon->get_quorum_con_features() & features) != features) {
5921 unsupported_ss << "the monitor cluster";
5922 ++unsupported_count;
5923 }
5924
5925 set<int32_t> up_osds;
5926 osdmap.get_up_osds(up_osds);
5927 for (set<int32_t>::iterator it = up_osds.begin();
5928 it != up_osds.end(); ++it) {
5929 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
5930 if ((xi.features & features) != features) {
5931 if (unsupported_count > 0)
5932 unsupported_ss << ", ";
5933 unsupported_ss << "osd." << *it;
5934 unsupported_count ++;
5935 }
5936 }
5937
5938 if (unsupported_count > 0) {
5939 ss << "features " << features << " unsupported by: "
5940 << unsupported_ss.str();
5941 return -ENOTSUP;
5942 }
5943
5944 // check pending osd state, too!
5945 for (map<int32_t,osd_xinfo_t>::const_iterator p =
5946 pending_inc.new_xinfo.begin();
5947 p != pending_inc.new_xinfo.end(); ++p) {
5948 const osd_xinfo_t &xi = p->second;
5949 if ((xi.features & features) != features) {
5950 dout(10) << __func__ << " pending osd." << p->first
5951 << " features are insufficient; retry" << dendl;
5952 return -EAGAIN;
5953 }
5954 }
5955
5956 return 0;
5957}
5958
5959bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
5960 stringstream& ss)
5961{
5962 OSDMap::Incremental new_pending = pending_inc;
5963 ::encode(*newcrush, new_pending.crush, mon->get_quorum_con_features());
5964 OSDMap newmap;
5965 newmap.deepish_copy_from(osdmap);
5966 newmap.apply_incremental(new_pending);
5967
5968 // client compat
31f18b77 5969 if (newmap.require_min_compat_client > 0) {
7c673cae 5970 auto mv = newmap.get_min_compat_client();
31f18b77
FG
5971 if (mv > newmap.require_min_compat_client) {
5972 ss << "new crush map requires client version " << ceph_release_name(mv)
7c673cae 5973 << " but require_min_compat_client is "
31f18b77 5974 << ceph_release_name(newmap.require_min_compat_client);
7c673cae
FG
5975 return false;
5976 }
5977 }
5978
5979 // osd compat
5980 uint64_t features =
5981 newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
5982 newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
5983 stringstream features_ss;
5984 int r = check_cluster_features(features, features_ss);
5985 if (r) {
5986 ss << "Could not change CRUSH: " << features_ss.str();
5987 return false;
5988 }
5989
5990 return true;
5991}
5992
5993bool OSDMonitor::erasure_code_profile_in_use(
5994 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
5995 const string &profile,
5996 ostream *ss)
5997{
5998 bool found = false;
5999 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
6000 p != pools.end();
6001 ++p) {
6002 if (p->second.erasure_code_profile == profile) {
6003 *ss << osdmap.pool_name[p->first] << " ";
6004 found = true;
6005 }
6006 }
6007 if (found) {
6008 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
6009 }
6010 return found;
6011}
6012
6013int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
6014 map<string,string> *erasure_code_profile_map,
6015 ostream *ss)
6016{
6017 int r = get_json_str_map(g_conf->osd_pool_default_erasure_code_profile,
6018 *ss,
6019 erasure_code_profile_map);
6020 if (r)
6021 return r;
6022 assert((*erasure_code_profile_map).count("plugin"));
6023 string default_plugin = (*erasure_code_profile_map)["plugin"];
6024 map<string,string> user_map;
6025 for (vector<string>::const_iterator i = erasure_code_profile.begin();
6026 i != erasure_code_profile.end();
6027 ++i) {
6028 size_t equal = i->find('=');
6029 if (equal == string::npos) {
6030 user_map[*i] = string();
6031 (*erasure_code_profile_map)[*i] = string();
6032 } else {
3efd9988 6033 string key = i->substr(0, equal);
7c673cae
FG
6034 equal++;
6035 const string value = i->substr(equal);
b32b8144
FG
6036 if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
6037 key.find("ruleset-") == 0) {
6038 if (g_conf->get_val<bool>("mon_fixup_legacy_erasure_code_profiles")) {
3efd9988
FG
6039 mon->clog->warn() << "erasure code profile property '" << key
6040 << "' is no longer supported; try "
6041 << "'crush-" << key.substr(8) << "' instead";
6042 key = string("crush-") + key.substr(8);
6043 } else {
6044 *ss << "property '" << key << "' is no longer supported; try "
6045 << "'crush-" << key.substr(8) << "' instead";
6046 return -EINVAL;
6047 }
6048 }
7c673cae
FG
6049 user_map[key] = value;
6050 (*erasure_code_profile_map)[key] = value;
6051 }
6052 }
6053
6054 if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
6055 (*erasure_code_profile_map) = user_map;
6056
6057 return 0;
6058}
6059
6060int OSDMonitor::prepare_pool_size(const unsigned pool_type,
6061 const string &erasure_code_profile,
6062 unsigned *size, unsigned *min_size,
6063 ostream *ss)
6064{
6065 int err = 0;
6066 switch (pool_type) {
6067 case pg_pool_t::TYPE_REPLICATED:
6068 *size = g_conf->osd_pool_default_size;
6069 *min_size = g_conf->get_osd_pool_default_min_size();
6070 break;
6071 case pg_pool_t::TYPE_ERASURE:
6072 {
6073 ErasureCodeInterfaceRef erasure_code;
6074 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
6075 if (err == 0) {
6076 *size = erasure_code->get_chunk_count();
6077 *min_size = MIN(erasure_code->get_data_chunk_count() + 1, *size);
6078 }
6079 }
6080 break;
6081 default:
6082 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
6083 err = -EINVAL;
6084 break;
6085 }
6086 return err;
6087}
6088
6089int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
6090 const string &erasure_code_profile,
6091 uint32_t *stripe_width,
6092 ostream *ss)
6093{
6094 int err = 0;
6095 switch (pool_type) {
6096 case pg_pool_t::TYPE_REPLICATED:
6097 // ignored
6098 break;
6099 case pg_pool_t::TYPE_ERASURE:
6100 {
6101 ErasureCodeProfile profile =
6102 osdmap.get_erasure_code_profile(erasure_code_profile);
6103 ErasureCodeInterfaceRef erasure_code;
6104 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
6105 if (err)
6106 break;
6107 uint32_t data_chunks = erasure_code->get_data_chunk_count();
6108 uint32_t stripe_unit = g_conf->osd_pool_erasure_code_stripe_unit;
6109 auto it = profile.find("stripe_unit");
6110 if (it != profile.end()) {
6111 string err_str;
1adf2230 6112 stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
7c673cae
FG
6113 assert(err_str.empty());
6114 }
6115 *stripe_width = data_chunks *
6116 erasure_code->get_chunk_size(stripe_unit * data_chunks);
6117 }
6118 break;
6119 default:
6120 *ss << "prepare_pool_stripe_width: "
6121 << pool_type << " is not a known pool type";
6122 err = -EINVAL;
6123 break;
6124 }
6125 return err;
6126}
6127
31f18b77 6128int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
224ce89b
WB
6129 const string &erasure_code_profile,
6130 const string &rule_name,
6131 int *crush_rule,
6132 ostream *ss)
7c673cae
FG
6133{
6134
31f18b77 6135 if (*crush_rule < 0) {
7c673cae
FG
6136 switch (pool_type) {
6137 case pg_pool_t::TYPE_REPLICATED:
6138 {
31f18b77 6139 if (rule_name == "") {
224ce89b 6140 // Use default rule
31f18b77
FG
6141 *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(g_ceph_context);
6142 if (*crush_rule < 0) {
6143 // Errors may happen e.g. if no valid rule is available
6144 *ss << "No suitable CRUSH rule exists, check "
7c673cae
FG
6145 << "'osd pool default crush *' config options";
6146 return -ENOENT;
6147 }
6148 } else {
31f18b77 6149 return get_crush_rule(rule_name, crush_rule, ss);
7c673cae
FG
6150 }
6151 }
6152 break;
6153 case pg_pool_t::TYPE_ERASURE:
6154 {
31f18b77 6155 int err = crush_rule_create_erasure(rule_name,
7c673cae 6156 erasure_code_profile,
31f18b77 6157 crush_rule, ss);
7c673cae
FG
6158 switch (err) {
6159 case -EALREADY:
31f18b77
FG
6160 dout(20) << "prepare_pool_crush_rule: rule "
6161 << rule_name << " try again" << dendl;
7c673cae
FG
6162 // fall through
6163 case 0:
6164 // need to wait for the crush rule to be proposed before proceeding
6165 err = -EAGAIN;
6166 break;
6167 case -EEXIST:
6168 err = 0;
6169 break;
6170 }
6171 return err;
6172 }
6173 break;
6174 default:
31f18b77 6175 *ss << "prepare_pool_crush_rule: " << pool_type
7c673cae
FG
6176 << " is not a known pool type";
6177 return -EINVAL;
6178 break;
6179 }
6180 } else {
31f18b77
FG
6181 if (!osdmap.crush->ruleset_exists(*crush_rule)) {
6182 *ss << "CRUSH rule " << *crush_rule << " not found";
7c673cae
FG
6183 return -ENOENT;
6184 }
6185 }
6186
6187 return 0;
6188}
6189
31f18b77 6190int OSDMonitor::get_crush_rule(const string &rule_name,
224ce89b
WB
6191 int *crush_rule,
6192 ostream *ss)
7c673cae
FG
6193{
6194 int ret;
31f18b77 6195 ret = osdmap.crush->get_rule_id(rule_name);
7c673cae
FG
6196 if (ret != -ENOENT) {
6197 // found it, use it
31f18b77 6198 *crush_rule = ret;
7c673cae
FG
6199 } else {
6200 CrushWrapper newcrush;
6201 _get_pending_crush(newcrush);
6202
31f18b77 6203 ret = newcrush.get_rule_id(rule_name);
7c673cae
FG
6204 if (ret != -ENOENT) {
6205 // found it, wait for it to be proposed
31f18b77 6206 dout(20) << __func__ << ": rule " << rule_name
7c673cae
FG
6207 << " try again" << dendl;
6208 return -EAGAIN;
6209 } else {
224ce89b 6210 // Cannot find it , return error
31f18b77 6211 *ss << "specified rule " << rule_name << " doesn't exist";
7c673cae
FG
6212 return ret;
6213 }
6214 }
6215 return 0;
6216}
6217
3efd9988
FG
6218int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, ostream *ss)
6219{
6220 auto max_pgs_per_osd = g_conf->get_val<uint64_t>("mon_max_pg_per_osd");
6221 auto num_osds = std::max(osdmap.get_num_in_osds(), 3u); // assume min cluster size 3
6222 auto max_pgs = max_pgs_per_osd * num_osds;
6223 uint64_t projected = 0;
6224 if (pool < 0) {
6225 projected += pg_num * size;
6226 }
6227 for (const auto& i : osdmap.get_pools()) {
6228 if (i.first == pool) {
6229 projected += pg_num * size;
6230 } else {
6231 projected += i.second.get_pg_num() * i.second.get_size();
6232 }
6233 }
6234 if (projected > max_pgs) {
6235 if (pool >= 0) {
6236 *ss << "pool id " << pool;
6237 }
6238 *ss << " pg_num " << pg_num << " size " << size
6239 << " would mean " << projected
6240 << " total pgs, which exceeds max " << max_pgs
6241 << " (mon_max_pg_per_osd " << max_pgs_per_osd
6242 << " * num_in_osds " << num_osds << ")";
6243 return -ERANGE;
6244 }
6245 return 0;
6246}
6247
7c673cae
FG
6248/**
6249 * @param name The name of the new pool
6250 * @param auid The auid of the pool owner. Can be -1
31f18b77
FG
6251 * @param crush_rule The crush rule to use. If <0, will use the system default
6252 * @param crush_rule_name The crush rule to use, if crush_rulset <0
7c673cae
FG
6253 * @param pg_num The pg_num to use. If set to 0, will use the system default
6254 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
6255 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
6256 * @param pool_type TYPE_ERASURE, or TYPE_REP
6257 * @param expected_num_objects expected number of objects on the pool
6258 * @param fast_read fast read type.
6259 * @param ss human readable error message, if any.
6260 *
6261 * @return 0 on success, negative errno on failure.
6262 */
6263int OSDMonitor::prepare_new_pool(string& name, uint64_t auid,
31f18b77
FG
6264 int crush_rule,
6265 const string &crush_rule_name,
7c673cae
FG
6266 unsigned pg_num, unsigned pgp_num,
6267 const string &erasure_code_profile,
6268 const unsigned pool_type,
6269 const uint64_t expected_num_objects,
6270 FastReadType fast_read,
6271 ostream *ss)
6272{
6273 if (name.length() == 0)
6274 return -EINVAL;
6275 if (pg_num == 0)
6276 pg_num = g_conf->osd_pool_default_pg_num;
6277 if (pgp_num == 0)
6278 pgp_num = g_conf->osd_pool_default_pgp_num;
6279 if (pg_num > (unsigned)g_conf->mon_max_pool_pg_num) {
6280 *ss << "'pg_num' must be greater than 0 and less than or equal to "
6281 << g_conf->mon_max_pool_pg_num
6282 << " (you may adjust 'mon max pool pg num' for higher values)";
6283 return -ERANGE;
6284 }
6285 if (pgp_num > pg_num) {
6286 *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
6287 << ", which in this case is " << pg_num;
6288 return -ERANGE;
6289 }
6290 if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
6291 *ss << "'fast_read' can only apply to erasure coding pool";
6292 return -EINVAL;
6293 }
6294 int r;
31f18b77
FG
6295 r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
6296 crush_rule_name, &crush_rule, ss);
7c673cae 6297 if (r) {
94b18763 6298 dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
7c673cae
FG
6299 return r;
6300 }
224ce89b
WB
6301 if (g_conf->mon_osd_crush_smoke_test) {
6302 CrushWrapper newcrush;
6303 _get_pending_crush(newcrush);
6304 ostringstream err;
6305 CrushTester tester(newcrush, err);
b5b8bbf5 6306 tester.set_min_x(0);
224ce89b
WB
6307 tester.set_max_x(50);
6308 tester.set_rule(crush_rule);
b5b8bbf5 6309 auto start = ceph::coarse_mono_clock::now();
224ce89b 6310 r = tester.test_with_fork(g_conf->mon_lease);
b5b8bbf5 6311 auto duration = ceph::coarse_mono_clock::now() - start;
224ce89b 6312 if (r < 0) {
94b18763 6313 dout(10) << "tester.test_with_fork returns " << r
224ce89b
WB
6314 << ": " << err.str() << dendl;
6315 *ss << "crush test failed with " << r << ": " << err.str();
6316 return r;
6317 }
181888fb 6318 dout(10) << __func__ << " crush smoke test duration: "
b5b8bbf5 6319 << duration << dendl;
7c673cae
FG
6320 }
6321 unsigned size, min_size;
6322 r = prepare_pool_size(pool_type, erasure_code_profile, &size, &min_size, ss);
6323 if (r) {
94b18763 6324 dout(10) << "prepare_pool_size returns " << r << dendl;
7c673cae
FG
6325 return r;
6326 }
3efd9988
FG
6327 r = check_pg_num(-1, pg_num, size, ss);
6328 if (r) {
94b18763 6329 dout(10) << "check_pg_num returns " << r << dendl;
3efd9988
FG
6330 return r;
6331 }
7c673cae 6332
31f18b77 6333 if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) {
7c673cae
FG
6334 return -EINVAL;
6335 }
6336
6337 uint32_t stripe_width = 0;
6338 r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
6339 if (r) {
94b18763 6340 dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
7c673cae
FG
6341 return r;
6342 }
6343
6344 bool fread = false;
6345 if (pool_type == pg_pool_t::TYPE_ERASURE) {
6346 switch (fast_read) {
6347 case FAST_READ_OFF:
6348 fread = false;
6349 break;
6350 case FAST_READ_ON:
6351 fread = true;
6352 break;
6353 case FAST_READ_DEFAULT:
6354 fread = g_conf->mon_osd_pool_ec_fast_read;
6355 break;
6356 default:
6357 *ss << "invalid fast_read setting: " << fast_read;
6358 return -EINVAL;
6359 }
6360 }
6361
6362 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
6363 p != pending_inc.new_pool_names.end();
6364 ++p) {
6365 if (p->second == name)
6366 return 0;
6367 }
6368
6369 if (-1 == pending_inc.new_pool_max)
6370 pending_inc.new_pool_max = osdmap.pool_max;
6371 int64_t pool = ++pending_inc.new_pool_max;
6372 pg_pool_t empty;
6373 pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
6374 pi->type = pool_type;
6375 pi->fast_read = fread;
6376 pi->flags = g_conf->osd_pool_default_flags;
6377 if (g_conf->osd_pool_default_flag_hashpspool)
6378 pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
6379 if (g_conf->osd_pool_default_flag_nodelete)
6380 pi->set_flag(pg_pool_t::FLAG_NODELETE);
6381 if (g_conf->osd_pool_default_flag_nopgchange)
6382 pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
6383 if (g_conf->osd_pool_default_flag_nosizechange)
6384 pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
6385 if (g_conf->osd_pool_use_gmt_hitset &&
6386 (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT))
6387 pi->use_gmt_hitset = true;
6388 else
6389 pi->use_gmt_hitset = false;
6390
6391 pi->size = size;
6392 pi->min_size = min_size;
31f18b77 6393 pi->crush_rule = crush_rule;
7c673cae
FG
6394 pi->expected_num_objects = expected_num_objects;
6395 pi->object_hash = CEPH_STR_HASH_RJENKINS;
6396 pi->set_pg_num(pg_num);
6397 pi->set_pgp_num(pgp_num);
6398 pi->last_change = pending_inc.epoch;
6399 pi->auid = auid;
6400 pi->erasure_code_profile = erasure_code_profile;
6401 pi->stripe_width = stripe_width;
6402 pi->cache_target_dirty_ratio_micro =
6403 g_conf->osd_pool_default_cache_target_dirty_ratio * 1000000;
6404 pi->cache_target_dirty_high_ratio_micro =
6405 g_conf->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
6406 pi->cache_target_full_ratio_micro =
6407 g_conf->osd_pool_default_cache_target_full_ratio * 1000000;
6408 pi->cache_min_flush_age = g_conf->osd_pool_default_cache_min_flush_age;
6409 pi->cache_min_evict_age = g_conf->osd_pool_default_cache_min_evict_age;
6410 pending_inc.new_pool_names[pool] = name;
6411 return 0;
6412}
6413
6414bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
6415{
6416 op->mark_osdmon_event(__func__);
6417 ostringstream ss;
6418 if (pending_inc.new_flags < 0)
6419 pending_inc.new_flags = osdmap.get_flags();
6420 pending_inc.new_flags |= flag;
6421 ss << OSDMap::get_flag_string(flag) << " is set";
6422 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
6423 get_last_committed() + 1));
6424 return true;
6425}
6426
6427bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
6428{
6429 op->mark_osdmon_event(__func__);
6430 ostringstream ss;
6431 if (pending_inc.new_flags < 0)
6432 pending_inc.new_flags = osdmap.get_flags();
6433 pending_inc.new_flags &= ~flag;
6434 ss << OSDMap::get_flag_string(flag) << " is unset";
6435 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
6436 get_last_committed() + 1));
6437 return true;
6438}
6439
7c673cae
FG
6440int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
6441 stringstream& ss)
6442{
6443 string poolstr;
f64942e4 6444 cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
7c673cae
FG
6445 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
6446 if (pool < 0) {
6447 ss << "unrecognized pool '" << poolstr << "'";
6448 return -ENOENT;
6449 }
6450 string var;
f64942e4 6451 cmd_getval_throws(g_ceph_context, cmdmap, "var", var);
7c673cae
FG
6452
6453 pg_pool_t p = *osdmap.get_pg_pool(pool);
6454 if (pending_inc.new_pools.count(pool))
6455 p = pending_inc.new_pools[pool];
6456
6457 // accept val as a json string in the normal case (current
6458 // generation monitor). parse out int or float values from the
6459 // string as needed. however, if it is not a string, try to pull
6460 // out an int, in case an older monitor with an older json schema is
6461 // forwarding a request.
6462 string val;
6463 string interr, floaterr;
6464 int64_t n = 0;
6465 double f = 0;
6466 int64_t uf = 0; // micro-f
f64942e4
AA
6467 cmd_getval(g_ceph_context, cmdmap, "val", val);
6468
6469 // parse string as both int and float; different fields use different types.
6470 n = strict_strtoll(val.c_str(), 10, &interr);
6471 f = strict_strtod(val.c_str(), &floaterr);
6472 uf = llrintl(f * (double)1000000.0);
7c673cae
FG
6473
6474 if (!p.is_tier() &&
6475 (var == "hit_set_type" || var == "hit_set_period" ||
6476 var == "hit_set_count" || var == "hit_set_fpp" ||
6477 var == "target_max_objects" || var == "target_max_bytes" ||
6478 var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
6479 var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
6480 var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
6481 var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
6482 var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
6483 return -EACCES;
6484 }
6485
6486 if (var == "size") {
6487 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
6488 ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
6489 return -EPERM;
6490 }
6491 if (p.type == pg_pool_t::TYPE_ERASURE) {
6492 ss << "can not change the size of an erasure-coded pool";
6493 return -ENOTSUP;
6494 }
6495 if (interr.length()) {
6496 ss << "error parsing integer value '" << val << "': " << interr;
6497 return -EINVAL;
6498 }
6499 if (n <= 0 || n > 10) {
6500 ss << "pool size must be between 1 and 10";
6501 return -EINVAL;
6502 }
3efd9988
FG
6503 int r = check_pg_num(pool, p.get_pg_num(), n, &ss);
6504 if (r < 0) {
6505 return r;
6506 }
7c673cae
FG
6507 p.size = n;
6508 if (n < p.min_size)
6509 p.min_size = n;
6510 } else if (var == "min_size") {
6511 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
6512 ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
6513 return -EPERM;
6514 }
6515 if (interr.length()) {
6516 ss << "error parsing integer value '" << val << "': " << interr;
6517 return -EINVAL;
6518 }
6519
6520 if (p.type != pg_pool_t::TYPE_ERASURE) {
6521 if (n < 1 || n > p.size) {
6522 ss << "pool min_size must be between 1 and " << (int)p.size;
6523 return -EINVAL;
6524 }
6525 } else {
6526 ErasureCodeInterfaceRef erasure_code;
6527 int k;
6528 stringstream tmp;
6529 int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
6530 if (err == 0) {
6531 k = erasure_code->get_data_chunk_count();
6532 } else {
b32b8144 6533 ss << __func__ << " get_erasure_code failed: " << tmp.str();
7c673cae
FG
6534 return err;
6535 }
6536
6537 if (n < k || n > p.size) {
6538 ss << "pool min_size must be between " << k << " and " << (int)p.size;
6539 return -EINVAL;
6540 }
6541 }
6542 p.min_size = n;
6543 } else if (var == "auid") {
6544 if (interr.length()) {
6545 ss << "error parsing integer value '" << val << "': " << interr;
6546 return -EINVAL;
6547 }
6548 p.auid = n;
6549 } else if (var == "crash_replay_interval") {
6550 if (interr.length()) {
6551 ss << "error parsing integer value '" << val << "': " << interr;
6552 return -EINVAL;
6553 }
6554 p.crash_replay_interval = n;
6555 } else if (var == "pg_num") {
6556 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
6557 ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
6558 return -EPERM;
6559 }
6560 if (interr.length()) {
6561 ss << "error parsing integer value '" << val << "': " << interr;
6562 return -EINVAL;
6563 }
6564 if (n <= (int)p.get_pg_num()) {
6565 ss << "specified pg_num " << n << " <= current " << p.get_pg_num();
6566 if (n < (int)p.get_pg_num())
6567 return -EEXIST;
6568 return 0;
6569 }
c07f9fc5
FG
6570 if (n > (unsigned)g_conf->mon_max_pool_pg_num) {
6571 ss << "'pg_num' must be greater than 0 and less than or equal to "
6572 << g_conf->mon_max_pool_pg_num
6573 << " (you may adjust 'mon max pool pg num' for higher values)";
6574 return -ERANGE;
6575 }
3efd9988
FG
6576 int r = check_pg_num(pool, n, p.get_size(), &ss);
6577 if (r) {
6578 return r;
6579 }
7c673cae 6580 string force;
f64942e4 6581 cmd_getval_throws(g_ceph_context,cmdmap, "force", force);
7c673cae
FG
6582 if (p.cache_mode != pg_pool_t::CACHEMODE_NONE &&
6583 force != "--yes-i-really-mean-it") {
6584 ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
6585 return -EPERM;
6586 }
6587 int expected_osds = MIN(p.get_pg_num(), osdmap.get_num_osds());
6588 int64_t new_pgs = n - p.get_pg_num();
6589 if (new_pgs > g_conf->mon_osd_max_split_count * expected_osds) {
6590 ss << "specified pg_num " << n << " is too large (creating "
6591 << new_pgs << " new PGs on ~" << expected_osds
6592 << " OSDs exceeds per-OSD max of " << g_conf->mon_osd_max_split_count
6593 << ')';
6594 return -E2BIG;
6595 }
6596 p.set_pg_num(n);
6597 // force pre-luminous clients to resend their ops, since they
6598 // don't understand that split PGs now form a new interval.
6599 p.last_force_op_resend_preluminous = pending_inc.epoch;
6600 } else if (var == "pgp_num") {
6601 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
6602 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
6603 return -EPERM;
6604 }
6605 if (interr.length()) {
6606 ss << "error parsing integer value '" << val << "': " << interr;
6607 return -EINVAL;
6608 }
6609 if (n <= 0) {
6610 ss << "specified pgp_num must > 0, but you set to " << n;
6611 return -EINVAL;
6612 }
6613 if (n > (int)p.get_pg_num()) {
6614 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
6615 return -EINVAL;
6616 }
6617 p.set_pgp_num(n);
6618 } else if (var == "crush_rule") {
6619 int id = osdmap.crush->get_rule_id(val);
6620 if (id == -ENOENT) {
6621 ss << "crush rule " << val << " does not exist";
6622 return -ENOENT;
6623 }
6624 if (id < 0) {
6625 ss << cpp_strerror(id);
6626 return -ENOENT;
6627 }
6628 if (!osdmap.crush->check_crush_rule(id, p.get_type(), p.get_size(), ss)) {
6629 return -EINVAL;
6630 }
31f18b77 6631 p.crush_rule = id;
7c673cae
FG
6632 } else if (var == "nodelete" || var == "nopgchange" ||
6633 var == "nosizechange" || var == "write_fadvise_dontneed" ||
6634 var == "noscrub" || var == "nodeep-scrub") {
6635 uint64_t flag = pg_pool_t::get_flag_by_name(var);
6636 // make sure we only compare against 'n' if we didn't receive a string
6637 if (val == "true" || (interr.empty() && n == 1)) {
6638 p.set_flag(flag);
6639 } else if (val == "false" || (interr.empty() && n == 0)) {
6640 p.unset_flag(flag);
6641 } else {
6642 ss << "expecting value 'true', 'false', '0', or '1'";
6643 return -EINVAL;
6644 }
6645 } else if (var == "hashpspool") {
6646 uint64_t flag = pg_pool_t::get_flag_by_name(var);
6647 string force;
f64942e4 6648 cmd_getval_throws(g_ceph_context, cmdmap, "force", force);
7c673cae
FG
6649 if (force != "--yes-i-really-mean-it") {
6650 ss << "are you SURE? this will remap all placement groups in this pool,"
6651 " this triggers large data movement,"
6652 " pass --yes-i-really-mean-it if you really do.";
6653 return -EPERM;
6654 }
6655 // make sure we only compare against 'n' if we didn't receive a string
6656 if (val == "true" || (interr.empty() && n == 1)) {
6657 p.set_flag(flag);
6658 } else if (val == "false" || (interr.empty() && n == 0)) {
6659 p.unset_flag(flag);
6660 } else {
6661 ss << "expecting value 'true', 'false', '0', or '1'";
6662 return -EINVAL;
6663 }
6664 } else if (var == "hit_set_type") {
6665 if (val == "none")
6666 p.hit_set_params = HitSet::Params();
6667 else {
6668 int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
6669 if (err)
6670 return err;
6671 if (val == "bloom") {
6672 BloomHitSet::Params *bsp = new BloomHitSet::Params;
6673 bsp->set_fpp(g_conf->osd_pool_default_hit_set_bloom_fpp);
6674 p.hit_set_params = HitSet::Params(bsp);
6675 } else if (val == "explicit_hash")
6676 p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
6677 else if (val == "explicit_object")
6678 p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
6679 else {
6680 ss << "unrecognized hit_set type '" << val << "'";
6681 return -EINVAL;
6682 }
6683 }
6684 } else if (var == "hit_set_period") {
6685 if (interr.length()) {
6686 ss << "error parsing integer value '" << val << "': " << interr;
6687 return -EINVAL;
6688 }
6689 p.hit_set_period = n;
6690 } else if (var == "hit_set_count") {
6691 if (interr.length()) {
6692 ss << "error parsing integer value '" << val << "': " << interr;
6693 return -EINVAL;
6694 }
6695 p.hit_set_count = n;
6696 } else if (var == "hit_set_fpp") {
6697 if (floaterr.length()) {
6698 ss << "error parsing floating point value '" << val << "': " << floaterr;
6699 return -EINVAL;
6700 }
6701 if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
6702 ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
6703 return -EINVAL;
6704 }
6705 BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
6706 bloomp->set_fpp(f);
6707 } else if (var == "use_gmt_hitset") {
6708 if (val == "true" || (interr.empty() && n == 1)) {
3efd9988
FG
6709 string force;
6710 cmd_getval(g_ceph_context, cmdmap, "force", force);
6711 if (!osdmap.get_num_up_osds() && force != "--yes-i-really-mean-it") {
6712 ss << "Not advisable to continue since no OSDs are up. Pass "
6713 << "--yes-i-really-mean-it if you really wish to continue.";
6714 return -EPERM;
6715 }
6716 if (!(osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT)
6717 && force != "--yes-i-really-mean-it") {
7c673cae
FG
6718 ss << "not all OSDs support GMT hit set.";
6719 return -EINVAL;
6720 }
6721 p.use_gmt_hitset = true;
6722 } else {
6723 ss << "expecting value 'true' or '1'";
6724 return -EINVAL;
6725 }
6726 } else if (var == "allow_ec_overwrites") {
6727 if (!p.is_erasure()) {
6728 ss << "ec overwrites can only be enabled for an erasure coded pool";
6729 return -EINVAL;
6730 }
224ce89b
WB
6731 stringstream err;
6732 if (!g_conf->mon_debug_no_require_bluestore_for_ec_overwrites &&
6733 !is_pool_currently_all_bluestore(pool, p, &err)) {
6734 ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
6735 return -EINVAL;
6736 }
7c673cae
FG
6737 if (val == "true" || (interr.empty() && n == 1)) {
6738 p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
6739 } else if (val == "false" || (interr.empty() && n == 0)) {
6740 ss << "ec overwrites cannot be disabled once enabled";
6741 return -EINVAL;
6742 } else {
6743 ss << "expecting value 'true', 'false', '0', or '1'";
6744 return -EINVAL;
6745 }
7c673cae
FG
6746 } else if (var == "target_max_objects") {
6747 if (interr.length()) {
6748 ss << "error parsing int '" << val << "': " << interr;
6749 return -EINVAL;
6750 }
6751 p.target_max_objects = n;
6752 } else if (var == "target_max_bytes") {
6753 if (interr.length()) {
6754 ss << "error parsing int '" << val << "': " << interr;
6755 return -EINVAL;
6756 }
6757 p.target_max_bytes = n;
6758 } else if (var == "cache_target_dirty_ratio") {
6759 if (floaterr.length()) {
6760 ss << "error parsing float '" << val << "': " << floaterr;
6761 return -EINVAL;
6762 }
6763 if (f < 0 || f > 1.0) {
6764 ss << "value must be in the range 0..1";
6765 return -ERANGE;
6766 }
6767 p.cache_target_dirty_ratio_micro = uf;
6768 } else if (var == "cache_target_dirty_high_ratio") {
6769 if (floaterr.length()) {
6770 ss << "error parsing float '" << val << "': " << floaterr;
6771 return -EINVAL;
6772 }
6773 if (f < 0 || f > 1.0) {
6774 ss << "value must be in the range 0..1";
6775 return -ERANGE;
6776 }
6777 p.cache_target_dirty_high_ratio_micro = uf;
6778 } else if (var == "cache_target_full_ratio") {
6779 if (floaterr.length()) {
6780 ss << "error parsing float '" << val << "': " << floaterr;
6781 return -EINVAL;
6782 }
6783 if (f < 0 || f > 1.0) {
6784 ss << "value must be in the range 0..1";
6785 return -ERANGE;
6786 }
6787 p.cache_target_full_ratio_micro = uf;
6788 } else if (var == "cache_min_flush_age") {
6789 if (interr.length()) {
6790 ss << "error parsing int '" << val << "': " << interr;
6791 return -EINVAL;
6792 }
6793 p.cache_min_flush_age = n;
6794 } else if (var == "cache_min_evict_age") {
6795 if (interr.length()) {
6796 ss << "error parsing int '" << val << "': " << interr;
6797 return -EINVAL;
6798 }
6799 p.cache_min_evict_age = n;
6800 } else if (var == "min_read_recency_for_promote") {
6801 if (interr.length()) {
6802 ss << "error parsing integer value '" << val << "': " << interr;
6803 return -EINVAL;
6804 }
6805 p.min_read_recency_for_promote = n;
6806 } else if (var == "hit_set_grade_decay_rate") {
6807 if (interr.length()) {
6808 ss << "error parsing integer value '" << val << "': " << interr;
6809 return -EINVAL;
6810 }
6811 if (n > 100 || n < 0) {
6812 ss << "value out of range,valid range is 0 - 100";
6813 return -EINVAL;
6814 }
6815 p.hit_set_grade_decay_rate = n;
6816 } else if (var == "hit_set_search_last_n") {
6817 if (interr.length()) {
6818 ss << "error parsing integer value '" << val << "': " << interr;
6819 return -EINVAL;
6820 }
6821 if (n > p.hit_set_count || n < 0) {
6822 ss << "value out of range,valid range is 0 - hit_set_count";
6823 return -EINVAL;
6824 }
6825 p.hit_set_search_last_n = n;
6826 } else if (var == "min_write_recency_for_promote") {
6827 if (interr.length()) {
6828 ss << "error parsing integer value '" << val << "': " << interr;
6829 return -EINVAL;
6830 }
6831 p.min_write_recency_for_promote = n;
6832 } else if (var == "fast_read") {
6833 if (p.is_replicated()) {
6834 ss << "fast read is not supported in replication pool";
6835 return -EINVAL;
6836 }
6837 if (val == "true" || (interr.empty() && n == 1)) {
6838 p.fast_read = true;
6839 } else if (val == "false" || (interr.empty() && n == 0)) {
6840 p.fast_read = false;
6841 } else {
6842 ss << "expecting value 'true', 'false', '0', or '1'";
6843 return -EINVAL;
6844 }
6845 } else if (pool_opts_t::is_opt_name(var)) {
224ce89b 6846 bool unset = val == "unset";
7c673cae 6847 if (var == "compression_mode") {
224ce89b
WB
6848 if (!unset) {
6849 auto cmode = Compressor::get_comp_mode_type(val);
6850 if (!cmode) {
6851 ss << "unrecognized compression mode '" << val << "'";
6852 return -EINVAL;
6853 }
7c673cae
FG
6854 }
6855 } else if (var == "compression_algorithm") {
224ce89b
WB
6856 if (!unset) {
6857 auto alg = Compressor::get_comp_alg_type(val);
6858 if (!alg) {
6859 ss << "unrecognized compression_algorithm '" << val << "'";
6860 return -EINVAL;
6861 }
7c673cae
FG
6862 }
6863 } else if (var == "compression_required_ratio") {
6864 if (floaterr.length()) {
6865 ss << "error parsing float value '" << val << "': " << floaterr;
6866 return -EINVAL;
6867 }
224ce89b 6868 if (f < 0 || f > 1) {
7c673cae 6869 ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
224ce89b 6870 return -EINVAL;
7c673cae
FG
6871 }
6872 } else if (var == "csum_type") {
224ce89b 6873 auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
7c673cae
FG
6874 if (t < 0 ) {
6875 ss << "unrecognized csum_type '" << val << "'";
224ce89b 6876 return -EINVAL;
7c673cae
FG
6877 }
6878 //preserve csum_type numeric value
6879 n = t;
6880 interr.clear();
6881 } else if (var == "compression_max_blob_size" ||
6882 var == "compression_min_blob_size" ||
6883 var == "csum_max_block" ||
6884 var == "csum_min_block") {
6885 if (interr.length()) {
6886 ss << "error parsing int value '" << val << "': " << interr;
6887 return -EINVAL;
6888 }
6889 }
6890
6891 pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
6892 switch (desc.type) {
6893 case pool_opts_t::STR:
224ce89b 6894 if (unset) {
7c673cae
FG
6895 p.opts.unset(desc.key);
6896 } else {
6897 p.opts.set(desc.key, static_cast<std::string>(val));
6898 }
6899 break;
6900 case pool_opts_t::INT:
6901 if (interr.length()) {
6902 ss << "error parsing integer value '" << val << "': " << interr;
6903 return -EINVAL;
6904 }
6905 if (n == 0) {
6906 p.opts.unset(desc.key);
6907 } else {
6908 p.opts.set(desc.key, static_cast<int>(n));
6909 }
6910 break;
6911 case pool_opts_t::DOUBLE:
6912 if (floaterr.length()) {
6913 ss << "error parsing floating point value '" << val << "': " << floaterr;
6914 return -EINVAL;
6915 }
6916 if (f == 0) {
6917 p.opts.unset(desc.key);
6918 } else {
6919 p.opts.set(desc.key, static_cast<double>(f));
6920 }
6921 break;
6922 default:
6923 assert(!"unknown type");
6924 }
6925 } else {
6926 ss << "unrecognized variable '" << var << "'";
6927 return -EINVAL;
6928 }
224ce89b
WB
6929 if (val != "unset") {
6930 ss << "set pool " << pool << " " << var << " to " << val;
6931 } else {
6932 ss << "unset pool " << pool << " " << var;
6933 }
7c673cae
FG
6934 p.last_change = pending_inc.epoch;
6935 pending_inc.new_pools[pool] = p;
6936 return 0;
6937}
6938
c07f9fc5
FG
6939int OSDMonitor::prepare_command_pool_application(const string &prefix,
6940 map<string,cmd_vartype> &cmdmap,
6941 stringstream& ss)
6942{
6943 string pool_name;
f64942e4 6944 cmd_getval_throws(g_ceph_context, cmdmap, "pool", pool_name);
c07f9fc5
FG
6945 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6946 if (pool < 0) {
6947 ss << "unrecognized pool '" << pool_name << "'";
6948 return -ENOENT;
6949 }
6950
6951 pg_pool_t p = *osdmap.get_pg_pool(pool);
6952 if (pending_inc.new_pools.count(pool)) {
6953 p = pending_inc.new_pools[pool];
6954 }
6955
6956 string app;
f64942e4 6957 cmd_getval_throws(g_ceph_context, cmdmap, "app", app);
c07f9fc5
FG
6958 bool app_exists = (p.application_metadata.count(app) > 0);
6959
6960 if (boost::algorithm::ends_with(prefix, "enable")) {
6961 if (app.empty()) {
6962 ss << "application name must be provided";
6963 return -EINVAL;
6964 }
6965
6966 if (p.is_tier()) {
6967 ss << "application must be enabled on base tier";
6968 return -EINVAL;
6969 }
6970
6971 string force;
f64942e4 6972 cmd_getval_throws(g_ceph_context, cmdmap, "force", force);
c07f9fc5
FG
6973
6974 if (!app_exists && !p.application_metadata.empty() &&
6975 force != "--yes-i-really-mean-it") {
6976 ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
6977 << "application; pass --yes-i-really-mean-it to proceed anyway";
6978 return -EPERM;
6979 }
6980
6981 if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
6982 ss << "too many enabled applications on pool '" << pool_name << "'; "
6983 << "max " << MAX_POOL_APPLICATIONS;
6984 return -EINVAL;
6985 }
6986
6987 if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
6988 ss << "application name '" << app << "' too long; max length "
6989 << MAX_POOL_APPLICATION_LENGTH;
6990 return -EINVAL;
6991 }
6992
6993 if (!app_exists) {
6994 p.application_metadata[app] = {};
6995 }
6996 ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
6997
6998 } else if (boost::algorithm::ends_with(prefix, "disable")) {
6999 string force;
f64942e4 7000 cmd_getval_throws(g_ceph_context, cmdmap, "force", force);
c07f9fc5
FG
7001
7002 if (force != "--yes-i-really-mean-it") {
7003 ss << "Are you SURE? Disabling an application within a pool might result "
7004 << "in loss of application functionality; pass "
7005 << "--yes-i-really-mean-it to proceed anyway";
7006 return -EPERM;
7007 }
7008
7009 if (!app_exists) {
7010 ss << "application '" << app << "' is not enabled on pool '" << pool_name
7011 << "'";
7012 return 0; // idempotent
7013 }
7014
7015 p.application_metadata.erase(app);
7016 ss << "disable application '" << app << "' on pool '" << pool_name << "'";
7017
7018 } else if (boost::algorithm::ends_with(prefix, "set")) {
7019 if (p.is_tier()) {
7020 ss << "application metadata must be set on base tier";
7021 return -EINVAL;
7022 }
7023
7024 if (!app_exists) {
7025 ss << "application '" << app << "' is not enabled on pool '" << pool_name
7026 << "'";
7027 return -ENOENT;
7028 }
7029
7030 string key;
f64942e4 7031 cmd_getval_throws(g_ceph_context, cmdmap, "key", key);
c07f9fc5
FG
7032
7033 if (key.empty()) {
7034 ss << "key must be provided";
7035 return -EINVAL;
7036 }
7037
7038 auto &app_keys = p.application_metadata[app];
7039 if (app_keys.count(key) == 0 &&
7040 app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
7041 ss << "too many keys set for application '" << app << "' on pool '"
7042 << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
7043 return -EINVAL;
7044 }
7045
7046 if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
7047 ss << "key '" << app << "' too long; max length "
7048 << MAX_POOL_APPLICATION_LENGTH;
7049 return -EINVAL;
7050 }
7051
7052 string value;
f64942e4 7053 cmd_getval_throws(g_ceph_context, cmdmap, "value", value);
c07f9fc5
FG
7054 if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
7055 ss << "value '" << value << "' too long; max length "
7056 << MAX_POOL_APPLICATION_LENGTH;
7057 return -EINVAL;
7058 }
7059
7060 p.application_metadata[app][key] = value;
7061 ss << "set application '" << app << "' key '" << key << "' to '"
7062 << value << "' on pool '" << pool_name << "'";
7063 } else if (boost::algorithm::ends_with(prefix, "rm")) {
7064 if (!app_exists) {
7065 ss << "application '" << app << "' is not enabled on pool '" << pool_name
7066 << "'";
7067 return -ENOENT;
7068 }
7069
7070 string key;
f64942e4 7071 cmd_getval_throws(g_ceph_context, cmdmap, "key", key);
c07f9fc5
FG
7072 auto it = p.application_metadata[app].find(key);
7073 if (it == p.application_metadata[app].end()) {
7074 ss << "application '" << app << "' on pool '" << pool_name
7075 << "' does not have key '" << key << "'";
7076 return 0; // idempotent
7077 }
7078
7079 p.application_metadata[app].erase(it);
7080 ss << "removed application '" << app << "' key '" << key << "' on pool '"
7081 << pool_name << "'";
7082 } else {
7083 assert(false);
7084 }
7085
7086 p.last_change = pending_inc.epoch;
7087 pending_inc.new_pools[pool] = p;
7088 return 0;
7089}
7090
31f18b77
FG
7091int OSDMonitor::_prepare_command_osd_crush_remove(
7092 CrushWrapper &newcrush,
7093 int32_t id,
7094 int32_t ancestor,
7095 bool has_ancestor,
7096 bool unlink_only)
7097{
7098 int err = 0;
7099
7100 if (has_ancestor) {
7101 err = newcrush.remove_item_under(g_ceph_context, id, ancestor,
7102 unlink_only);
7103 } else {
7104 err = newcrush.remove_item(g_ceph_context, id, unlink_only);
7105 }
7106 return err;
7107}
7108
// Stage the (already modified) crush map into the pending incremental,
// replacing any crush blob previously staged there.
void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
{
  pending_inc.crush.clear();
  newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
}
7114
7115int OSDMonitor::prepare_command_osd_crush_remove(
7116 CrushWrapper &newcrush,
7117 int32_t id,
7118 int32_t ancestor,
7119 bool has_ancestor,
7120 bool unlink_only)
7121{
7122 int err = _prepare_command_osd_crush_remove(
7123 newcrush, id, ancestor,
7124 has_ancestor, unlink_only);
7125
7126 if (err < 0)
7127 return err;
7128
7129 assert(err == 0);
7130 do_osd_crush_remove(newcrush);
7131
7132 return 0;
7133}
7134
7135int OSDMonitor::prepare_command_osd_remove(int32_t id)
7136{
7137 if (osdmap.is_up(id)) {
7138 return -EBUSY;
7139 }
7140
7141 pending_inc.new_state[id] = osdmap.get_state(id);
7142 pending_inc.new_uuid[id] = uuid_d();
7143 pending_metadata_rm.insert(id);
7144 pending_metadata.erase(id);
7145
7146 return 0;
7147}
7148
7149int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
7150{
7151 assert(existing_id);
7152 *existing_id = -1;
7153
7154 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
7155 if (!osdmap.exists(i) &&
7156 pending_inc.new_up_client.count(i) == 0 &&
7157 (pending_inc.new_state.count(i) == 0 ||
7158 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
7159 *existing_id = i;
7160 return -1;
7161 }
7162 }
7163
7164 if (pending_inc.new_max_osd < 0) {
7165 return osdmap.get_max_osd();
7166 }
7167 return pending_inc.new_max_osd;
7168}
7169
// Record the creation of an osd in pending_inc.
//
// `id`           - requested osd id, or < 0 to have one found/allocated.
// `uuid`         - osd uuid; zero means no uuid-based matching is done.
// `device_class` - if non-empty, also stage a crush device-class update.
// `new_id`       - out: the id that will be used (required).
//
// Validation is assumed to have happened already (see
// validate_osd_create); invariant violations here are asserts.
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const string& device_class,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  assert(new_id);

  // We presume validation has been performed prior to calling this
  // function. We assert with prejudice.

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      // uuid already maps to an osd; reuse that id.
      assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
           << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    // a free (nonexistent) slot below max_osd was found; reuse it and
    // mark the osd out.
    assert(existing_id < osdmap.get_max_osd());
    assert(allocated_id < 0);
    pending_inc.new_weight[existing_id] = CEPH_OSD_OUT;
    *new_id = existing_id;
  } else if (allocated_id >= 0) {
    assert(existing_id < 0);
    // raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    assert(*new_id == allocated_id);
  } else {
    assert(0 == "unexpected condition");
  }

out:
  if (device_class.size()) {
    // Stage a crush update that records the requested device class for
    // the new osd under the conventional "osd.N" name.
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (newcrush.get_max_devices() < *new_id + 1) {
      newcrush.set_max_devices(*new_id + 1);
    }
    string name = string("osd.") + stringify(*new_id);
    if (!newcrush.item_exists(*new_id)) {
      newcrush.set_item_name(*new_id, name);
    }
    ostringstream ss;
    int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
    if (r < 0) {
      derr << __func__ << " failed to set " << name << " device_class "
           << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
           << dendl;
      // non-fatal... this might be a replay and we want to be idempotent.
    } else {
      dout(20) << __func__ << " set " << name << " device_class " << device_class
               << dendl;
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
  } else {
    dout(20) << __func__ << " no device_class" << dendl;
  }

  dout(10) << __func__ << " using id " << *new_id << dendl;
  // make sure max_osd covers the chosen id
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  pending_inc.new_state[*new_id] |= CEPH_OSD_EXISTS | CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
7258
7259int OSDMonitor::validate_osd_create(
7260 const int32_t id,
7261 const uuid_d& uuid,
7262 const bool check_osd_exists,
7263 int32_t* existing_id,
7264 stringstream& ss)
7265{
7266
7267 dout(10) << __func__ << " id " << id << " uuid " << uuid
7268 << " check_osd_exists " << check_osd_exists << dendl;
7269
7270 assert(existing_id);
7271
7272 if (id < 0 && uuid.is_zero()) {
7273 // we have nothing to validate
7274 *existing_id = -1;
7275 return 0;
7276 } else if (uuid.is_zero()) {
7277 // we have an id but we will ignore it - because that's what
7278 // `osd create` does.
7279 return 0;
7280 }
7281
7282 /*
7283 * This function will be used to validate whether we are able to
7284 * create a new osd when the `uuid` is specified.
7285 *
7286 * It will be used by both `osd create` and `osd new`, as the checks
7287 * are basically the same when it pertains to osd id and uuid validation.
7288 * However, `osd create` presumes an `uuid` is optional, for legacy
7289 * reasons, while `osd new` requires the `uuid` to be provided. This
7290 * means that `osd create` will not be idempotent if an `uuid` is not
7291 * provided, but we will always guarantee the idempotency of `osd new`.
7292 */
7293
7294 assert(!uuid.is_zero());
7295 if (pending_inc.identify_osd(uuid) >= 0) {
7296 // osd is about to exist
7297 return -EAGAIN;
7298 }
7299
7300 int32_t i = osdmap.identify_osd(uuid);
7301 if (i >= 0) {
7302 // osd already exists
7303 if (id >= 0 && i != id) {
7304 ss << "uuid " << uuid << " already in use for different id " << i;
7305 return -EEXIST;
7306 }
7307 // return a positive errno to distinguish between a blocking error
7308 // and an error we consider to not be a problem (i.e., this would be
7309 // an idempotent operation).
7310 *existing_id = i;
7311 return EEXIST;
7312 }
7313 // i < 0
7314 if (id >= 0) {
7315 if (pending_inc.new_state.count(id)) {
7316 // osd is about to exist
7317 return -EAGAIN;
7318 }
7319 // we may not care if an osd exists if we are recreating a previously
7320 // destroyed osd.
7321 if (check_osd_exists && osdmap.exists(id)) {
7322 ss << "id " << id << " already in use and does not match uuid "
7323 << uuid;
7324 return -EINVAL;
7325 }
7326 }
7327 return 0;
7328}
7329
7330int OSDMonitor::prepare_command_osd_create(
7331 const int32_t id,
7332 const uuid_d& uuid,
7333 int32_t* existing_id,
7334 stringstream& ss)
7335{
7336 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
7337 assert(existing_id);
b5b8bbf5
FG
7338 if (osdmap.is_destroyed(id)) {
7339 ss << "ceph osd create has been deprecated. Please use ceph osd new "
7340 "instead.";
7341 return -EINVAL;
7342 }
31f18b77
FG
7343
7344 if (uuid.is_zero()) {
7345 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
7346 }
7347
7348 return validate_osd_create(id, uuid, true, existing_id, ss);
7349}
7350
7351int OSDMonitor::prepare_command_osd_new(
7352 MonOpRequestRef op,
7353 const map<string,cmd_vartype>& cmdmap,
3a9019d9 7354 const map<string,string>& params,
31f18b77
FG
7355 stringstream &ss,
7356 Formatter *f)
7357{
7358 uuid_d uuid;
7359 string uuidstr;
7360 int64_t id = -1;
7361
7362 assert(paxos->is_plugged());
7363
7364 dout(10) << __func__ << " " << op << dendl;
7365
7366 /* validate command. abort now if something's wrong. */
7367
7368 /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
7369 *
7370 * If `id` is not specified, we will identify any existing osd based
7371 * on `uuid`. Operation will be idempotent iff secrets match.
7372 *
7373 * If `id` is specified, we will identify any existing osd based on
7374 * `uuid` and match against `id`. If they match, operation will be
7375 * idempotent iff secrets match.
7376 *
7377 * `-i secrets.json` will be optional. If supplied, will be used
7378 * to check for idempotency when `id` and `uuid` match.
7379 *
7380 * If `id` is not specified, and `uuid` does not exist, an id will
7381 * be found or allocated for the osd.
7382 *
7383 * If `id` is specified, and the osd has been previously marked
7384 * as destroyed, then the `id` will be reused.
7385 */
f64942e4 7386 if (!cmd_getval_throws(g_ceph_context, cmdmap, "uuid", uuidstr)) {
31f18b77
FG
7387 ss << "requires the OSD's UUID to be specified.";
7388 return -EINVAL;
7389 } else if (!uuid.parse(uuidstr.c_str())) {
7390 ss << "invalid UUID value '" << uuidstr << "'.";
7391 return -EINVAL;
7392 }
7393
f64942e4 7394 if (cmd_getval_throws(g_ceph_context, cmdmap, "id", id) &&
31f18b77
FG
7395 (id < 0)) {
7396 ss << "invalid OSD id; must be greater or equal than zero.";
7397 return -EINVAL;
7398 }
7399
7400 // are we running an `osd create`-like command, or recreating
7401 // a previously destroyed osd?
7402
7403 bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));
7404
7405 // we will care about `id` to assess whether osd is `destroyed`, or
7406 // to create a new osd.
7407 // we will need an `id` by the time we reach auth.
7408
7409 int32_t existing_id = -1;
7410 int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
7411 &existing_id, ss);
7412
7413 bool may_be_idempotent = false;
7414 if (err == EEXIST) {
7415 // this is idempotent from the osdmon's point-of-view
7416 may_be_idempotent = true;
7417 assert(existing_id >= 0);
7418 id = existing_id;
7419 } else if (err < 0) {
7420 return err;
7421 }
7422
7423 if (!may_be_idempotent) {
7424 // idempotency is out of the window. We are either creating a new
7425 // osd or recreating a destroyed osd.
7426 //
7427 // We now need to figure out if we have an `id` (and if it's valid),
7428 // of find an `id` if we don't have one.
7429
7430 // NOTE: we need to consider the case where the `id` is specified for
7431 // `osd create`, and we must honor it. So this means checking if
7432 // the `id` is destroyed, and if so assume the destroy; otherwise,
7433 // check if it `exists` - in which case we complain about not being
7434 // `destroyed`. In the end, if nothing fails, we must allow the
7435 // creation, so that we are compatible with `create`.
7436 if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
7437 dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
7438 ss << "OSD " << id << " has not yet been destroyed";
7439 return -EINVAL;
7440 } else if (id < 0) {
7441 // find an `id`
7442 id = _allocate_osd_id(&existing_id);
7443 if (id < 0) {
7444 assert(existing_id >= 0);
7445 id = existing_id;
7446 }
7447 dout(10) << __func__ << " found id " << id << " to use" << dendl;
7448 } else if (id >= 0 && osdmap.is_destroyed(id)) {
7449 dout(10) << __func__ << " recreating osd." << id << dendl;
7450 } else {
7451 dout(10) << __func__ << " creating new osd." << id << dendl;
7452 }
7453 } else {
7454 assert(id >= 0);
7455 assert(osdmap.exists(id));
7456 }
7457
7458 // we are now able to either create a brand new osd or reuse an existing
7459 // osd that has been previously destroyed.
7460
7461 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
7462
3a9019d9 7463 if (may_be_idempotent && params.empty()) {
31f18b77 7464 // nothing to do, really.
3a9019d9 7465 dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
31f18b77
FG
7466 assert(id >= 0);
7467 if (f) {
7468 f->open_object_section("created_osd");
7469 f->dump_int("osdid", id);
7470 f->close_section();
7471 } else {
7472 ss << id;
7473 }
7474 return EEXIST;
7475 }
7476
3a9019d9
FG
7477 string device_class;
7478 auto p = params.find("crush_device_class");
7479 if (p != params.end()) {
7480 device_class = p->second;
7481 dout(20) << __func__ << " device_class will be " << device_class << dendl;
7482 }
31f18b77
FG
7483 string cephx_secret, lockbox_secret, dmcrypt_key;
7484 bool has_lockbox = false;
3a9019d9
FG
7485 bool has_secrets = params.count("cephx_secret")
7486 || params.count("cephx_lockbox_secret")
7487 || params.count("dmcrypt_key");
31f18b77
FG
7488
7489 ConfigKeyService *svc = nullptr;
7490 AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;
7491
7492 if (has_secrets) {
3a9019d9 7493 if (params.count("cephx_secret") == 0) {
31f18b77
FG
7494 ss << "requires a cephx secret.";
7495 return -EINVAL;
7496 }
3a9019d9 7497 cephx_secret = params.at("cephx_secret");
31f18b77 7498
3a9019d9
FG
7499 bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
7500 bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);
31f18b77
FG
7501
7502 dout(10) << __func__ << " has lockbox " << has_lockbox_secret
7503 << " dmcrypt " << has_dmcrypt_key << dendl;
7504
7505 if (has_lockbox_secret && has_dmcrypt_key) {
7506 has_lockbox = true;
3a9019d9
FG
7507 lockbox_secret = params.at("cephx_lockbox_secret");
7508 dmcrypt_key = params.at("dmcrypt_key");
31f18b77
FG
7509 } else if (!has_lockbox_secret != !has_dmcrypt_key) {
7510 ss << "requires both a cephx lockbox secret and a dm-crypt key.";
7511 return -EINVAL;
7512 }
7513
7514 dout(10) << __func__ << " validate secrets using osd id " << id << dendl;
7515
7516 err = mon->authmon()->validate_osd_new(id, uuid,
7517 cephx_secret,
7518 lockbox_secret,
7519 cephx_entity,
7520 lockbox_entity,
7521 ss);
7522 if (err < 0) {
7523 return err;
7524 } else if (may_be_idempotent && err != EEXIST) {
7525 // for this to be idempotent, `id` should already be >= 0; no need
7526 // to use validate_id.
7527 assert(id >= 0);
7528 ss << "osd." << id << " exists but secrets do not match";
7529 return -EEXIST;
7530 }
7531
7532 if (has_lockbox) {
7533 svc = (ConfigKeyService*)mon->config_key_service;
7534 err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
7535 if (err < 0) {
7536 return err;
7537 } else if (may_be_idempotent && err != EEXIST) {
7538 assert(id >= 0);
7539 ss << "osd." << id << " exists but dm-crypt key does not match.";
7540 return -EEXIST;
7541 }
7542 }
7543 }
7544 assert(!has_secrets || !cephx_secret.empty());
7545 assert(!has_lockbox || !lockbox_secret.empty());
7546
7547 if (may_be_idempotent) {
7548 // we have nothing to do for either the osdmon or the authmon,
7549 // and we have no lockbox - so the config key service will not be
7550 // touched. This is therefore an idempotent operation, and we can
7551 // just return right away.
7552 dout(10) << __func__ << " idempotent -- no op." << dendl;
7553 assert(id >= 0);
7554 if (f) {
7555 f->open_object_section("created_osd");
7556 f->dump_int("osdid", id);
7557 f->close_section();
7558 } else {
7559 ss << id;
7560 }
7561 return EEXIST;
7562 }
7563 assert(!may_be_idempotent);
7564
7565 // perform updates.
7566 if (has_secrets) {
7567 assert(!cephx_secret.empty());
7568 assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
7569 (!lockbox_secret.empty() && !dmcrypt_key.empty()));
7570
7571 err = mon->authmon()->do_osd_new(cephx_entity,
7572 lockbox_entity,
7573 has_lockbox);
7574 assert(0 == err);
7575
7576 if (has_lockbox) {
7577 assert(nullptr != svc);
7578 svc->do_osd_new(uuid, dmcrypt_key);
7579 }
7580 }
7581
7582 if (is_recreate_destroyed) {
7583 assert(id >= 0);
7584 assert(osdmap.is_destroyed(id));
7585 pending_inc.new_weight[id] = CEPH_OSD_OUT;
7586 pending_inc.new_state[id] |= CEPH_OSD_DESTROYED | CEPH_OSD_NEW;
c07f9fc5
FG
7587 if (osdmap.get_state(id) & CEPH_OSD_UP) {
7588 // due to http://tracker.ceph.com/issues/20751 some clusters may
7589 // have UP set for non-existent OSDs; make sure it is cleared
7590 // for a newly created osd.
7591 pending_inc.new_state[id] |= CEPH_OSD_UP;
7592 }
31f18b77
FG
7593 pending_inc.new_uuid[id] = uuid;
7594 } else {
7595 assert(id >= 0);
7596 int32_t new_id = -1;
3a9019d9 7597 do_osd_create(id, uuid, device_class, &new_id);
31f18b77
FG
7598 assert(new_id >= 0);
7599 assert(id == new_id);
7600 }
7601
7602 if (f) {
7603 f->open_object_section("created_osd");
7604 f->dump_int("osdid", id);
7605 f->close_section();
7606 } else {
7607 ss << id;
7608 }
7609
7610 return 0;
7611}
7612
7c673cae
FG
7613bool OSDMonitor::prepare_command(MonOpRequestRef op)
7614{
7615 op->mark_osdmon_event(__func__);
7616 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
7617 stringstream ss;
7618 map<string, cmd_vartype> cmdmap;
7619 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
7620 string rs = ss.str();
7621 mon->reply_command(op, -EINVAL, rs, get_last_committed());
7622 return true;
7623 }
7624
7625 MonSession *session = m->get_session();
7626 if (!session) {
7627 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
7628 return true;
7629 }
7630
7631 return prepare_command_impl(op, cmdmap);
7632}
7633
7634static int parse_reweights(CephContext *cct,
7635 const map<string,cmd_vartype> &cmdmap,
7636 const OSDMap& osdmap,
7637 map<int32_t, uint32_t>* weights)
7638{
7639 string weights_str;
f64942e4 7640 if (!cmd_getval_throws(g_ceph_context, cmdmap, "weights", weights_str)) {
7c673cae
FG
7641 return -EINVAL;
7642 }
7643 std::replace(begin(weights_str), end(weights_str), '\'', '"');
7644 json_spirit::mValue json_value;
7645 if (!json_spirit::read(weights_str, json_value)) {
7646 return -EINVAL;
7647 }
7648 if (json_value.type() != json_spirit::obj_type) {
7649 return -EINVAL;
7650 }
7651 const auto obj = json_value.get_obj();
7652 try {
7653 for (auto& osd_weight : obj) {
7654 auto osd_id = std::stoi(osd_weight.first);
7655 if (!osdmap.exists(osd_id)) {
7656 return -ENOENT;
7657 }
7658 if (osd_weight.second.type() != json_spirit::str_type) {
7659 return -EINVAL;
7660 }
7661 auto weight = std::stoul(osd_weight.second.get_str());
7662 weights->insert({osd_id, weight});
7663 }
7664 } catch (const std::logic_error& e) {
7665 return -EINVAL;
7666 }
7667 return 0;
7668}
7669
31f18b77
FG
// Mark osd.`id` as DESTROYED in the pending osdmap and remove its cephx
// entities and dm-crypt config keys.  May be called directly (for
// `osd destroy`) or from prepare_command_osd_purge().
//
// Preconditions: paxos must be plugged by the caller, who is also
// responsible for proposing the pending map afterwards (see the note at
// the bottom).
//
// Returns 0 on success (including the already-destroyed idempotent
// case), -ENOENT if the osd does not exist, or a negative error from
// auth validation.
int OSDMonitor::prepare_command_osd_destroy(
    int32_t id,
    stringstream& ss)
{
  assert(paxos->is_plugged());

  // we check if the osd exists for the benefit of `osd purge`, which may
  // have previously removed the osd. If the osd does not exist, return
  // -ENOENT to convey this, and let the caller deal with it.
  //
  // we presume that all auth secrets and config keys were removed prior
  // to this command being called. if they exist by now, we also assume
  // they must have been created by some other command and do not pertain
  // to this non-existent osd.
  if (!osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
    return -ENOENT;
  }

  uuid_d uuid = osdmap.get_uuid(id);
  dout(10) << __func__ << " destroying osd." << id
           << " uuid " << uuid << dendl;

  // if it has been destroyed, we assume our work here is done.
  if (osdmap.is_destroyed(id)) {
    ss << "destroyed osd." << id;
    return 0;
  }

  EntityName cephx_entity, lockbox_entity;
  // If the auth entities / config keys are already gone, the respective
  // removal step below becomes a no-op instead of an error.
  bool idempotent_auth = false, idempotent_cks = false;

  int err = mon->authmon()->validate_osd_destroy(id, uuid,
                                                 cephx_entity,
                                                 lockbox_entity,
                                                 ss);
  if (err < 0) {
    if (err == -ENOENT) {
      // auth entities already removed; skip do_osd_destroy() on authmon.
      idempotent_auth = true;
    } else {
      return err;
    }
  }

  ConfigKeyService *svc = (ConfigKeyService*)mon->config_key_service;
  err = svc->validate_osd_destroy(id, uuid);
  if (err < 0) {
    // -ENOENT is the only failure validate_osd_destroy reports here;
    // it means the keys are already gone, so skip the removal step.
    assert(err == -ENOENT);
    err = 0;
    idempotent_cks = true;
  }

  if (!idempotent_auth) {
    // past this point updates must succeed; a failure would leave the
    // cluster partially updated.
    err = mon->authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
    assert(0 == err);
  }

  if (!idempotent_cks) {
    svc->do_osd_destroy(id, uuid);
  }

  // stage the osdmap change: flag the osd destroyed and clear its uuid
  // so the id can later be reused by `osd new`.
  pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
  pending_inc.new_uuid[id] = uuid_d();

  // we can only propose_pending() once per service, otherwise we'll be
  // defying PaxosService and all laws of nature. Therefore, as we may
  // be used during 'osd purge', let's keep the caller responsible for
  // proposing.
  assert(err == 0);
  return 0;
}
7741
// Completely remove osd.`id` from the cluster: take it out of the crush
// map, destroy it (auth entities, config keys, DESTROYED flag via
// prepare_command_osd_destroy()), and finally drop it from the osdmap.
//
// Preconditions: paxos must be plugged by the caller, and the osd must
// not be up.  The caller is responsible for proposing the pending map.
//
// Returns 0 on success, -ENOENT when the osd was already fully purged
// (idempotent replay), or a negative error before any update was made.
int OSDMonitor::prepare_command_osd_purge(
    int32_t id,
    stringstream& ss)
{
  assert(paxos->is_plugged());
  dout(10) << __func__ << " purging osd." << id << dendl;

  assert(!osdmap.is_up(id));

  /*
   * This may look a bit weird, but this is what's going to happen:
   *
   *  1. we make sure that removing from crush works
   *  2. we call `prepare_command_osd_destroy()`. If it returns an
   *     error, then we abort the whole operation, as no updates
   *     have been made. However, on success this function will have
   *     side-effects, thus we need to make sure that all operations
   *     performed henceforth will *always* succeed.
   *  3. we call `prepare_command_osd_remove()`. Although this
   *     function can return an error, it currently only checks if the
   *     osd is up - and we have made sure that it is not so, so there
   *     is no conflict, and it is effectively an update.
   *  4. finally, we call `do_osd_crush_remove()`, which will perform
   *     the crush update we delayed from before.
   */

  CrushWrapper newcrush;
  _get_pending_crush(newcrush);

  // tracks whether every step so far was a no-op; if so, and the osd is
  // also gone from the osdmap, the whole purge is a replay.
  bool may_be_idempotent = false;

  // step 1: dry-run the crush removal (the actual crush update is
  // applied last, see do_osd_crush_remove() below).
  int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
  if (err == -ENOENT) {
    err = 0;
    may_be_idempotent = true;
  } else if (err < 0) {
    ss << "error removing osd." << id << " from crush";
    return err;
  }

  // no point destroying the osd again if it has already been marked destroyed
  if (!osdmap.is_destroyed(id)) {
    // step 2: destroy (auth + config keys + DESTROYED flag).
    err = prepare_command_osd_destroy(id, ss);
    if (err < 0) {
      if (err == -ENOENT) {
        err = 0;
      } else {
        return err;
      }
    } else {
      // destroy actually did something, so this run is not a replay.
      may_be_idempotent = false;
    }
  }
  assert(0 == err);

  if (may_be_idempotent && !osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist and "
             << "we are idempotent." << dendl;
    return -ENOENT;
  }

  // step 3: drop the osd from the osdmap.
  err = prepare_command_osd_remove(id);
  // we should not be busy, as we should have made sure this id is not up.
  assert(0 == err);

  // step 4: commit the crush removal we validated in step 1.
  do_osd_crush_remove(newcrush);
  return 0;
}
7810
7c673cae
FG
7811bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
7812 map<string,cmd_vartype> &cmdmap)
7813{
7814 op->mark_osdmon_event(__func__);
7815 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
7816 bool ret = false;
7817 stringstream ss;
7818 string rs;
7819 bufferlist rdata;
7820 int err = 0;
7821
7822 string format;
f64942e4 7823 cmd_getval_throws(g_ceph_context, cmdmap, "format", format, string("plain"));
7c673cae
FG
7824 boost::scoped_ptr<Formatter> f(Formatter::create(format));
7825
7826 string prefix;
f64942e4 7827 cmd_getval_throws(g_ceph_context, cmdmap, "prefix", prefix);
7c673cae
FG
7828
7829 int64_t osdid;
7830 string name;
b32b8144
FG
7831 bool osdid_present = false;
7832 if (prefix != "osd pg-temp" &&
7833 prefix != "osd pg-upmap" &&
7834 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
f64942e4 7835 osdid_present = cmd_getval_throws(g_ceph_context, cmdmap, "id", osdid);
b32b8144 7836 }
7c673cae
FG
7837 if (osdid_present) {
7838 ostringstream oss;
7839 oss << "osd." << osdid;
7840 name = oss.str();
7841 }
7842
7843 // Even if there's a pending state with changes that could affect
7844 // a command, considering that said state isn't yet committed, we
7845 // just don't care about those changes if the command currently being
7846 // handled acts as a no-op against the current committed state.
7847 // In a nutshell, we assume this command happens *before*.
7848 //
7849 // Let me make this clearer:
7850 //
7851 // - If we have only one client, and that client issues some
7852 // operation that would conflict with this operation but is
7853 // still on the pending state, then we would be sure that said
7854 // operation wouldn't have returned yet, so the client wouldn't
7855 // issue this operation (unless the client didn't wait for the
7856 // operation to finish, and that would be the client's own fault).
7857 //
7858 // - If we have more than one client, each client will observe
7859 // whatever is the state at the moment of the commit. So, if we
7860 // have two clients, one issuing an unlink and another issuing a
7861 // link, and if the link happens while the unlink is still on the
7862 // pending state, from the link's point-of-view this is a no-op.
7863 // If different clients are issuing conflicting operations and
7864 // they care about that, then the clients should make sure they
7865 // enforce some kind of concurrency mechanism -- from our
7866 // perspective that's what Douglas Adams would call an SEP.
7867 //
7868 // This should be used as a general guideline for most commands handled
7869 // in this function. Adapt as you see fit, but please bear in mind that
7870 // this is the expected behavior.
7871
7872
7873 if (prefix == "osd setcrushmap" ||
7874 (prefix == "osd crush set" && !osdid_present)) {
31f18b77
FG
7875 if (pending_inc.crush.length()) {
7876 dout(10) << __func__ << " waiting for pending crush update " << dendl;
7877 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
7878 return true;
7879 }
7c673cae
FG
7880 dout(10) << "prepare_command setting new crush map" << dendl;
7881 bufferlist data(m->get_data());
7882 CrushWrapper crush;
7883 try {
7884 bufferlist::iterator bl(data.begin());
7885 crush.decode(bl);
7886 }
7887 catch (const std::exception &e) {
7888 err = -EINVAL;
7889 ss << "Failed to parse crushmap: " << e.what();
7890 goto reply;
7891 }
31f18b77
FG
7892
7893 int64_t prior_version = 0;
f64942e4 7894 if (cmd_getval_throws(g_ceph_context, cmdmap, "prior_version", prior_version)) {
31f18b77
FG
7895 if (prior_version == osdmap.get_crush_version() - 1) {
7896 // see if we are a resend of the last update. this is imperfect
7897 // (multiple racing updaters may not both get reliable success)
7898 // but we expect crush updaters (via this interface) to be rare-ish.
7899 bufferlist current, proposed;
7900 osdmap.crush->encode(current, mon->get_quorum_con_features());
7901 crush.encode(proposed, mon->get_quorum_con_features());
7902 if (current.contents_equal(proposed)) {
7903 dout(10) << __func__
7904 << " proposed matches current and version equals previous"
7905 << dendl;
7906 err = 0;
7907 ss << osdmap.get_crush_version();
7908 goto reply;
7909 }
7910 }
7911 if (prior_version != osdmap.get_crush_version()) {
7912 err = -EPERM;
7913 ss << "prior_version " << prior_version << " != crush version "
7914 << osdmap.get_crush_version();
7915 goto reply;
7916 }
7917 }
7c673cae 7918
3efd9988 7919 if (crush.has_legacy_rule_ids()) {
31f18b77
FG
7920 err = -EINVAL;
7921 ss << "crush maps with ruleset != ruleid are no longer allowed";
7922 goto reply;
7923 }
7c673cae
FG
7924 if (!validate_crush_against_features(&crush, ss)) {
7925 err = -EINVAL;
7926 goto reply;
7927 }
31f18b77 7928
3efd9988
FG
7929 err = osdmap.validate_crush_rules(&crush, &ss);
7930 if (err < 0) {
7931 goto reply;
7c673cae
FG
7932 }
7933
224ce89b
WB
7934 if (g_conf->mon_osd_crush_smoke_test) {
7935 // sanity check: test some inputs to make sure this map isn't
7936 // totally broken
7937 dout(10) << " testing map" << dendl;
7938 stringstream ess;
7939 CrushTester tester(crush, ess);
b5b8bbf5 7940 tester.set_min_x(0);
224ce89b 7941 tester.set_max_x(50);
b5b8bbf5 7942 auto start = ceph::coarse_mono_clock::now();
224ce89b 7943 int r = tester.test_with_fork(g_conf->mon_lease);
b5b8bbf5 7944 auto duration = ceph::coarse_mono_clock::now() - start;
224ce89b
WB
7945 if (r < 0) {
7946 dout(10) << " tester.test_with_fork returns " << r
7947 << ": " << ess.str() << dendl;
7948 ss << "crush smoke test failed with " << r << ": " << ess.str();
7949 err = r;
7950 goto reply;
7951 }
b5b8bbf5
FG
7952 dout(10) << __func__ << " crush somke test duration: "
7953 << duration << ", result: " << ess.str() << dendl;
7c673cae
FG
7954 }
7955
7c673cae 7956 pending_inc.crush = data;
31f18b77 7957 ss << osdmap.get_crush_version() + 1;
7c673cae
FG
7958 goto update;
7959
3efd9988
FG
7960 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
7961 CrushWrapper newcrush;
7962 _get_pending_crush(newcrush);
7963 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
7964 int bid = -1 - b;
7965 if (newcrush.bucket_exists(bid) &&
7966 newcrush.get_bucket_alg(bid)) {
7967 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
7968 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
7969 }
7970 }
7971 if (!validate_crush_against_features(&newcrush, ss)) {
7972 err = -EINVAL;
7973 goto reply;
7974 }
7975 pending_inc.crush.clear();
7976 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7977 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7978 get_last_committed() + 1));
7979 return true;
7c673cae 7980 } else if (prefix == "osd crush set-device-class") {
224ce89b
WB
7981 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
7982 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
7983 << "luminous' before using crush device classes";
7984 err = -EPERM;
7c673cae
FG
7985 goto reply;
7986 }
7987
7988 string device_class;
f64942e4 7989 if (!cmd_getval_throws(g_ceph_context, cmdmap, "class", device_class)) {
7c673cae
FG
7990 err = -EINVAL; // no value!
7991 goto reply;
7992 }
7993
224ce89b
WB
7994 bool stop = false;
7995 vector<string> idvec;
f64942e4 7996 cmd_getval_throws(g_ceph_context, cmdmap, "ids", idvec);
7c673cae
FG
7997 CrushWrapper newcrush;
7998 _get_pending_crush(newcrush);
224ce89b
WB
7999 set<int> updated;
8000 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
8001 set<int> osds;
8002 // wildcard?
8003 if (j == 0 &&
8004 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
8005 osdmap.get_all_osds(osds);
8006 stop = true;
8007 } else {
8008 // try traditional single osd way
8009 long osd = parse_osd_id(idvec[j].c_str(), &ss);
8010 if (osd < 0) {
8011 // ss has reason for failure
8012 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
8013 err = -EINVAL;
8014 continue;
8015 }
8016 osds.insert(osd);
8017 }
7c673cae 8018
224ce89b
WB
8019 for (auto &osd : osds) {
8020 if (!osdmap.exists(osd)) {
8021 ss << "osd." << osd << " does not exist. ";
8022 continue;
8023 }
7c673cae 8024
224ce89b
WB
8025 ostringstream oss;
8026 oss << "osd." << osd;
8027 string name = oss.str();
7c673cae 8028
3a9019d9
FG
8029 if (newcrush.get_max_devices() < osd + 1) {
8030 newcrush.set_max_devices(osd + 1);
8031 }
224ce89b
WB
8032 string action;
8033 if (newcrush.item_exists(osd)) {
8034 action = "updating";
8035 } else {
8036 action = "creating";
8037 newcrush.set_item_name(osd, name);
8038 }
7c673cae 8039
224ce89b
WB
8040 dout(5) << action << " crush item id " << osd << " name '" << name
8041 << "' device_class '" << device_class << "'"
8042 << dendl;
8043 err = newcrush.update_device_class(osd, device_class, name, &ss);
8044 if (err < 0) {
8045 goto reply;
8046 }
8047 if (err == 0 && !_have_pending_crush()) {
8048 if (!stop) {
8049 // for single osd only, wildcard makes too much noise
8050 ss << "set-device-class item id " << osd << " name '" << name
8051 << "' device_class '" << device_class << "': no change";
8052 }
8053 } else {
8054 updated.insert(osd);
8055 }
8056 }
7c673cae
FG
8057 }
8058
224ce89b
WB
8059 if (!updated.empty()) {
8060 pending_inc.crush.clear();
8061 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8062 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
8063 getline(ss, rs);
8064 wait_for_finished_proposal(op,
8065 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
8066 return true;
8067 }
7c673cae 8068
c07f9fc5
FG
8069 } else if (prefix == "osd crush rm-device-class") {
8070 bool stop = false;
8071 vector<string> idvec;
f64942e4 8072 cmd_getval_throws(g_ceph_context, cmdmap, "ids", idvec);
c07f9fc5
FG
8073 CrushWrapper newcrush;
8074 _get_pending_crush(newcrush);
8075 set<int> updated;
8076
8077 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
8078 set<int> osds;
8079
8080 // wildcard?
8081 if (j == 0 &&
8082 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
8083 osdmap.get_all_osds(osds);
8084 stop = true;
8085 } else {
8086 // try traditional single osd way
8087 long osd = parse_osd_id(idvec[j].c_str(), &ss);
8088 if (osd < 0) {
8089 // ss has reason for failure
8090 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
8091 err = -EINVAL;
8092 goto reply;
8093 }
8094 osds.insert(osd);
8095 }
8096
8097 for (auto &osd : osds) {
8098 if (!osdmap.exists(osd)) {
8099 ss << "osd." << osd << " does not exist. ";
8100 continue;
8101 }
8102
8103 auto class_name = newcrush.get_item_class(osd);
c07f9fc5
FG
8104 if (!class_name) {
8105 ss << "osd." << osd << " belongs to no class, ";
8106 continue;
8107 }
8108 // note that we do not verify if class_is_in_use here
8109 // in case the device is misclassified and user wants
8110 // to overridely reset...
8111
8112 err = newcrush.remove_device_class(g_ceph_context, osd, &ss);
8113 if (err < 0) {
8114 // ss has reason for failure
8115 goto reply;
8116 }
8117 updated.insert(osd);
8118 }
8119 }
8120
8121 if (!updated.empty()) {
8122 pending_inc.crush.clear();
8123 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8124 ss << "done removing class of osd(s): " << updated;
8125 getline(ss, rs);
8126 wait_for_finished_proposal(op,
8127 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
8128 return true;
8129 }
35e4c445
FG
8130 } else if (prefix == "osd crush class rename") {
8131 string srcname, dstname;
f64942e4 8132 if (!cmd_getval_throws(g_ceph_context, cmdmap, "srcname", srcname)) {
35e4c445
FG
8133 err = -EINVAL;
8134 goto reply;
8135 }
f64942e4 8136 if (!cmd_getval_throws(g_ceph_context, cmdmap, "dstname", dstname)) {
35e4c445
FG
8137 err = -EINVAL;
8138 goto reply;
8139 }
8140
8141 CrushWrapper newcrush;
8142 _get_pending_crush(newcrush);
181888fb
FG
8143 if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
8144 // suppose this is a replay and return success
8145 // so command is idempotent
8146 ss << "already renamed to '" << dstname << "'";
8147 err = 0;
35e4c445
FG
8148 goto reply;
8149 }
c07f9fc5 8150
35e4c445
FG
8151 err = newcrush.rename_class(srcname, dstname);
8152 if (err < 0) {
8153 ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
8154 << cpp_strerror(err);
8155 goto reply;
8156 }
8157
8158 pending_inc.crush.clear();
8159 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8160 ss << "rename class '" << srcname << "' to '" << dstname << "'";
8161 goto update;
7c673cae
FG
8162 } else if (prefix == "osd crush add-bucket") {
8163 // os crush add-bucket <name> <type>
8164 string name, typestr;
f64942e4
AA
8165 cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
8166 cmd_getval_throws(g_ceph_context, cmdmap, "type", typestr);
7c673cae
FG
8167
8168 if (!_have_pending_crush() &&
8169 _get_stable_crush().name_exists(name)) {
8170 ss << "bucket '" << name << "' already exists";
8171 goto reply;
8172 }
8173
8174 CrushWrapper newcrush;
8175 _get_pending_crush(newcrush);
8176
8177 if (newcrush.name_exists(name)) {
8178 ss << "bucket '" << name << "' already exists";
8179 goto update;
8180 }
8181 int type = newcrush.get_type_id(typestr);
8182 if (type < 0) {
8183 ss << "type '" << typestr << "' does not exist";
8184 err = -EINVAL;
8185 goto reply;
8186 }
8187 if (type == 0) {
8188 ss << "type '" << typestr << "' is for devices, not buckets";
8189 err = -EINVAL;
8190 goto reply;
8191 }
8192 int bucketno;
8193 err = newcrush.add_bucket(0, 0,
8194 CRUSH_HASH_DEFAULT, type, 0, NULL,
8195 NULL, &bucketno);
8196 if (err < 0) {
8197 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
8198 goto reply;
8199 }
8200 err = newcrush.set_item_name(bucketno, name);
8201 if (err < 0) {
8202 ss << "error setting bucket name to '" << name << "'";
8203 goto reply;
8204 }
8205
8206 pending_inc.crush.clear();
8207 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8208 ss << "added bucket " << name << " type " << typestr
8209 << " to crush map";
8210 goto update;
8211 } else if (prefix == "osd crush rename-bucket") {
8212 string srcname, dstname;
f64942e4
AA
8213 cmd_getval_throws(g_ceph_context, cmdmap, "srcname", srcname);
8214 cmd_getval_throws(g_ceph_context, cmdmap, "dstname", dstname);
7c673cae
FG
8215
8216 err = crush_rename_bucket(srcname, dstname, &ss);
8217 if (err == -EALREADY) // equivalent to success for idempotency
8218 err = 0;
8219 if (err)
8220 goto reply;
8221 else
8222 goto update;
c07f9fc5
FG
8223 } else if (prefix == "osd crush weight-set create" ||
8224 prefix == "osd crush weight-set create-compat") {
8225 CrushWrapper newcrush;
8226 _get_pending_crush(newcrush);
8227 int64_t pool;
8228 int positions;
8229 if (newcrush.has_non_straw2_buckets()) {
8230 ss << "crush map contains one or more bucket(s) that are not straw2";
224ce89b
WB
8231 err = -EPERM;
8232 goto reply;
8233 }
c07f9fc5
FG
8234 if (prefix == "osd crush weight-set create") {
8235 if (osdmap.require_min_compat_client > 0 &&
8236 osdmap.require_min_compat_client < CEPH_RELEASE_LUMINOUS) {
8237 ss << "require_min_compat_client "
8238 << ceph_release_name(osdmap.require_min_compat_client)
8239 << " < luminous, which is required for per-pool weight-sets. "
8240 << "Try 'ceph osd set-require-min-compat-client luminous' "
8241 << "before using the new interface";
8242 err = -EPERM;
8243 goto reply;
8244 }
8245 string poolname, mode;
f64942e4 8246 cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolname);
c07f9fc5
FG
8247 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
8248 if (pool < 0) {
8249 ss << "pool '" << poolname << "' not found";
8250 err = -ENOENT;
8251 goto reply;
8252 }
f64942e4 8253 cmd_getval_throws(g_ceph_context, cmdmap, "mode", mode);
c07f9fc5
FG
8254 if (mode != "flat" && mode != "positional") {
8255 ss << "unrecognized weight-set mode '" << mode << "'";
8256 err = -EINVAL;
8257 goto reply;
8258 }
8259 positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
8260 } else {
8261 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
8262 positions = 1;
224ce89b 8263 }
c07f9fc5
FG
8264 newcrush.create_choose_args(pool, positions);
8265 pending_inc.crush.clear();
8266 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8267 goto update;
224ce89b 8268
c07f9fc5
FG
8269 } else if (prefix == "osd crush weight-set rm" ||
8270 prefix == "osd crush weight-set rm-compat") {
224ce89b
WB
8271 CrushWrapper newcrush;
8272 _get_pending_crush(newcrush);
c07f9fc5
FG
8273 int64_t pool;
8274 if (prefix == "osd crush weight-set rm") {
8275 string poolname;
f64942e4 8276 cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolname);
c07f9fc5
FG
8277 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
8278 if (pool < 0) {
8279 ss << "pool '" << poolname << "' not found";
8280 err = -ENOENT;
8281 goto reply;
8282 }
8283 } else {
8284 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
224ce89b 8285 }
c07f9fc5
FG
8286 newcrush.rm_choose_args(pool);
8287 pending_inc.crush.clear();
8288 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8289 goto update;
224ce89b 8290
c07f9fc5
FG
8291 } else if (prefix == "osd crush weight-set reweight" ||
8292 prefix == "osd crush weight-set reweight-compat") {
8293 string poolname, item;
8294 vector<double> weight;
f64942e4
AA
8295 cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolname);
8296 cmd_getval_throws(g_ceph_context, cmdmap, "item", item);
8297 cmd_getval_throws(g_ceph_context, cmdmap, "weight", weight);
c07f9fc5
FG
8298 CrushWrapper newcrush;
8299 _get_pending_crush(newcrush);
8300 int64_t pool;
8301 if (prefix == "osd crush weight-set reweight") {
8302 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
8303 if (pool < 0) {
8304 ss << "pool '" << poolname << "' not found";
8305 err = -ENOENT;
8306 goto reply;
8307 }
8308 if (!newcrush.have_choose_args(pool)) {
8309 ss << "no weight-set for pool '" << poolname << "'";
8310 err = -ENOENT;
8311 goto reply;
8312 }
8313 auto arg_map = newcrush.choose_args_get(pool);
8314 int positions = newcrush.get_choose_args_positions(arg_map);
8315 if (weight.size() != (size_t)positions) {
8316 ss << "must specify exact " << positions << " weight values";
8317 err = -EINVAL;
8318 goto reply;
8319 }
8320 } else {
8321 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
8322 if (!newcrush.have_choose_args(pool)) {
8323 ss << "no backward-compatible weight-set";
8324 err = -ENOENT;
8325 goto reply;
8326 }
224ce89b 8327 }
c07f9fc5
FG
8328 if (!newcrush.name_exists(item)) {
8329 ss << "item '" << item << "' does not exist";
8330 err = -ENOENT;
224ce89b
WB
8331 goto reply;
8332 }
c07f9fc5
FG
8333 err = newcrush.choose_args_adjust_item_weightf(
8334 g_ceph_context,
8335 newcrush.choose_args_get(pool),
8336 newcrush.get_item_id(item),
8337 weight,
8338 &ss);
224ce89b 8339 if (err < 0) {
224ce89b
WB
8340 goto reply;
8341 }
c07f9fc5 8342 err = 0;
224ce89b
WB
8343 pending_inc.crush.clear();
8344 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
224ce89b 8345 goto update;
7c673cae
FG
8346 } else if (osdid_present &&
8347 (prefix == "osd crush set" || prefix == "osd crush add")) {
8348 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
8349 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
8350 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
8351
8352 if (!osdmap.exists(osdid)) {
8353 err = -ENOENT;
c07f9fc5 8354 ss << name << " does not exist. Create it before updating the crush map";
7c673cae
FG
8355 goto reply;
8356 }
8357
8358 double weight;
f64942e4 8359 if (!cmd_getval_throws(g_ceph_context, cmdmap, "weight", weight)) {
7c673cae
FG
8360 ss << "unable to parse weight value '"
8361 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
8362 err = -EINVAL;
8363 goto reply;
8364 }
8365
8366 string args;
8367 vector<string> argvec;
f64942e4 8368 cmd_getval_throws(g_ceph_context, cmdmap, "args", argvec);
7c673cae
FG
8369 map<string,string> loc;
8370 CrushWrapper::parse_loc_map(argvec, &loc);
8371
8372 if (prefix == "osd crush set"
8373 && !_get_stable_crush().item_exists(osdid)) {
8374 err = -ENOENT;
8375 ss << "unable to set item id " << osdid << " name '" << name
8376 << "' weight " << weight << " at location " << loc
8377 << ": does not exist";
8378 goto reply;
8379 }
8380
8381 dout(5) << "adding/updating crush item id " << osdid << " name '"
8382 << name << "' weight " << weight << " at location "
8383 << loc << dendl;
8384 CrushWrapper newcrush;
8385 _get_pending_crush(newcrush);
8386
8387 string action;
8388 if (prefix == "osd crush set" ||
8389 newcrush.check_item_loc(g_ceph_context, osdid, loc, (int *)NULL)) {
8390 action = "set";
8391 err = newcrush.update_item(g_ceph_context, osdid, weight, name, loc);
8392 } else {
8393 action = "add";
8394 err = newcrush.insert_item(g_ceph_context, osdid, weight, name, loc);
8395 if (err == 0)
8396 err = 1;
8397 }
8398
8399 if (err < 0)
8400 goto reply;
8401
8402 if (err == 0 && !_have_pending_crush()) {
8403 ss << action << " item id " << osdid << " name '" << name << "' weight "
8404 << weight << " at location " << loc << ": no change";
8405 goto reply;
8406 }
8407
8408 pending_inc.crush.clear();
8409 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8410 ss << action << " item id " << osdid << " name '" << name << "' weight "
8411 << weight << " at location " << loc << " to crush map";
8412 getline(ss, rs);
8413 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8414 get_last_committed() + 1));
8415 return true;
8416
8417 } else if (prefix == "osd crush create-or-move") {
8418 do {
8419 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
8420 if (!osdmap.exists(osdid)) {
8421 err = -ENOENT;
8422 ss << name << " does not exist. create it before updating the crush map";
8423 goto reply;
8424 }
8425
8426 double weight;
f64942e4 8427 if (!cmd_getval_throws(g_ceph_context, cmdmap, "weight", weight)) {
7c673cae
FG
8428 ss << "unable to parse weight value '"
8429 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
8430 err = -EINVAL;
8431 goto reply;
8432 }
8433
8434 string args;
8435 vector<string> argvec;
f64942e4 8436 cmd_getval_throws(g_ceph_context, cmdmap, "args", argvec);
7c673cae
FG
8437 map<string,string> loc;
8438 CrushWrapper::parse_loc_map(argvec, &loc);
8439
8440 dout(0) << "create-or-move crush item name '" << name << "' initial_weight " << weight
8441 << " at location " << loc << dendl;
8442
8443 CrushWrapper newcrush;
8444 _get_pending_crush(newcrush);
8445
8446 err = newcrush.create_or_move_item(g_ceph_context, osdid, weight, name, loc);
8447 if (err == 0) {
8448 ss << "create-or-move updated item name '" << name << "' weight " << weight
8449 << " at location " << loc << " to crush map";
8450 break;
8451 }
8452 if (err > 0) {
8453 pending_inc.crush.clear();
8454 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8455 ss << "create-or-move updating item name '" << name << "' weight " << weight
8456 << " at location " << loc << " to crush map";
8457 getline(ss, rs);
8458 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8459 get_last_committed() + 1));
8460 return true;
8461 }
8462 } while (false);
8463
8464 } else if (prefix == "osd crush move") {
8465 do {
8466 // osd crush move <name> <loc1> [<loc2> ...]
8467
8468 string args;
8469 vector<string> argvec;
f64942e4
AA
8470 cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
8471 cmd_getval_throws(g_ceph_context, cmdmap, "args", argvec);
7c673cae
FG
8472 map<string,string> loc;
8473 CrushWrapper::parse_loc_map(argvec, &loc);
8474
8475 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
8476 CrushWrapper newcrush;
8477 _get_pending_crush(newcrush);
8478
8479 if (!newcrush.name_exists(name)) {
8480 err = -ENOENT;
8481 ss << "item " << name << " does not exist";
8482 break;
8483 }
8484 int id = newcrush.get_item_id(name);
8485
8486 if (!newcrush.check_item_loc(g_ceph_context, id, loc, (int *)NULL)) {
8487 if (id >= 0) {
8488 err = newcrush.create_or_move_item(g_ceph_context, id, 0, name, loc);
8489 } else {
8490 err = newcrush.move_bucket(g_ceph_context, id, loc);
8491 }
8492 if (err >= 0) {
8493 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
8494 pending_inc.crush.clear();
8495 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8496 getline(ss, rs);
8497 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8498 get_last_committed() + 1));
8499 return true;
8500 }
8501 } else {
8502 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
8503 err = 0;
8504 }
8505 } while (false);
31f18b77
FG
8506 } else if (prefix == "osd crush swap-bucket") {
8507 string source, dest, force;
f64942e4
AA
8508 cmd_getval_throws(g_ceph_context, cmdmap, "source", source);
8509 cmd_getval_throws(g_ceph_context, cmdmap, "dest", dest);
8510 cmd_getval_throws(g_ceph_context, cmdmap, "force", force);
31f18b77
FG
8511 CrushWrapper newcrush;
8512 _get_pending_crush(newcrush);
8513 if (!newcrush.name_exists(source)) {
8514 ss << "source item " << source << " does not exist";
8515 err = -ENOENT;
8516 goto reply;
8517 }
8518 if (!newcrush.name_exists(dest)) {
8519 ss << "dest item " << dest << " does not exist";
8520 err = -ENOENT;
8521 goto reply;
8522 }
8523 int sid = newcrush.get_item_id(source);
8524 int did = newcrush.get_item_id(dest);
8525 int sparent;
8526 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 &&
8527 force != "--yes-i-really-mean-it") {
8528 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
8529 err = -EPERM;
8530 goto reply;
8531 }
8532 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
8533 force != "--yes-i-really-mean-it") {
8534 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
8535 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
8536 << "; pass --yes-i-really-mean-it to proceed anyway";
8537 err = -EPERM;
8538 goto reply;
8539 }
8540 int r = newcrush.swap_bucket(g_ceph_context, sid, did);
8541 if (r < 0) {
8542 ss << "failed to swap bucket contents: " << cpp_strerror(r);
224ce89b 8543 err = r;
31f18b77
FG
8544 goto reply;
8545 }
8546 ss << "swapped bucket of " << source << " to " << dest;
8547 pending_inc.crush.clear();
8548 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8549 wait_for_finished_proposal(op,
8550 new Monitor::C_Command(mon, op, err, ss.str(),
8551 get_last_committed() + 1));
8552 return true;
8553 } else if (prefix == "osd crush link") {
8554 // osd crush link <name> <loc1> [<loc2> ...]
8555 string name;
f64942e4 8556 cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
31f18b77 8557 vector<string> argvec;
f64942e4 8558 cmd_getval_throws(g_ceph_context, cmdmap, "args", argvec);
31f18b77
FG
8559 map<string,string> loc;
8560 CrushWrapper::parse_loc_map(argvec, &loc);
8561
8562 // Need an explicit check for name_exists because get_item_id returns
8563 // 0 on unfound.
8564 int id = osdmap.crush->get_item_id(name);
7c673cae
FG
8565 if (!osdmap.crush->name_exists(name)) {
8566 err = -ENOENT;
8567 ss << "item " << name << " does not exist";
8568 goto reply;
8569 } else {
8570 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
8571 }
8572 if (osdmap.crush->check_item_loc(g_ceph_context, id, loc, (int*) NULL)) {
8573 ss << "no need to move item id " << id << " name '" << name
8574 << "' to location " << loc << " in crush map";
8575 err = 0;
8576 goto reply;
8577 }
8578
8579 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
8580 CrushWrapper newcrush;
8581 _get_pending_crush(newcrush);
8582
8583 if (!newcrush.name_exists(name)) {
8584 err = -ENOENT;
8585 ss << "item " << name << " does not exist";
8586 goto reply;
8587 } else {
8588 int id = newcrush.get_item_id(name);
8589 if (!newcrush.check_item_loc(g_ceph_context, id, loc, (int *)NULL)) {
8590 err = newcrush.link_bucket(g_ceph_context, id, loc);
8591 if (err >= 0) {
8592 ss << "linked item id " << id << " name '" << name
8593 << "' to location " << loc << " in crush map";
8594 pending_inc.crush.clear();
8595 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8596 } else {
8597 ss << "cannot link item id " << id << " name '" << name
8598 << "' to location " << loc;
8599 goto reply;
8600 }
8601 } else {
8602 ss << "no need to move item id " << id << " name '" << name
8603 << "' to location " << loc << " in crush map";
8604 err = 0;
8605 }
8606 }
8607 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
8608 get_last_committed() + 1));
8609 return true;
8610 } else if (prefix == "osd crush rm" ||
8611 prefix == "osd crush remove" ||
8612 prefix == "osd crush unlink") {
8613 do {
8614 // osd crush rm <id> [ancestor]
8615 CrushWrapper newcrush;
8616 _get_pending_crush(newcrush);
8617
8618 string name;
f64942e4 8619 cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
7c673cae
FG
8620
8621 if (!osdmap.crush->name_exists(name)) {
8622 err = 0;
8623 ss << "device '" << name << "' does not appear in the crush map";
8624 break;
8625 }
8626 if (!newcrush.name_exists(name)) {
8627 err = 0;
8628 ss << "device '" << name << "' does not appear in the crush map";
8629 getline(ss, rs);
8630 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8631 get_last_committed() + 1));
8632 return true;
8633 }
8634 int id = newcrush.get_item_id(name);
31f18b77
FG
8635 int ancestor = 0;
8636
7c673cae
FG
8637 bool unlink_only = prefix == "osd crush unlink";
8638 string ancestor_str;
f64942e4 8639 if (cmd_getval_throws(g_ceph_context, cmdmap, "ancestor", ancestor_str)) {
7c673cae
FG
8640 if (!newcrush.name_exists(ancestor_str)) {
8641 err = -ENOENT;
8642 ss << "ancestor item '" << ancestor_str
8643 << "' does not appear in the crush map";
8644 break;
8645 }
31f18b77 8646 ancestor = newcrush.get_item_id(ancestor_str);
7c673cae 8647 }
31f18b77
FG
8648
8649 err = prepare_command_osd_crush_remove(
8650 newcrush,
8651 id, ancestor,
8652 (ancestor < 0), unlink_only);
8653
7c673cae
FG
8654 if (err == -ENOENT) {
8655 ss << "item " << id << " does not appear in that position";
8656 err = 0;
8657 break;
8658 }
8659 if (err == 0) {
7c673cae
FG
8660 ss << "removed item id " << id << " name '" << name << "' from crush map";
8661 getline(ss, rs);
8662 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8663 get_last_committed() + 1));
8664 return true;
8665 }
8666 } while (false);
8667
8668 } else if (prefix == "osd crush reweight-all") {
7c673cae
FG
8669 CrushWrapper newcrush;
8670 _get_pending_crush(newcrush);
8671
8672 newcrush.reweight(g_ceph_context);
8673 pending_inc.crush.clear();
8674 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8675 ss << "reweighted crush hierarchy";
8676 getline(ss, rs);
8677 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8678 get_last_committed() + 1));
8679 return true;
8680 } else if (prefix == "osd crush reweight") {
8681 // osd crush reweight <name> <weight>
8682 CrushWrapper newcrush;
8683 _get_pending_crush(newcrush);
8684
8685 string name;
f64942e4 8686 cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
7c673cae
FG
8687 if (!newcrush.name_exists(name)) {
8688 err = -ENOENT;
8689 ss << "device '" << name << "' does not appear in the crush map";
8690 goto reply;
8691 }
8692
8693 int id = newcrush.get_item_id(name);
8694 if (id < 0) {
8695 ss << "device '" << name << "' is not a leaf in the crush map";
8696 err = -EINVAL;
8697 goto reply;
8698 }
8699 double w;
f64942e4 8700 if (!cmd_getval_throws(g_ceph_context, cmdmap, "weight", w)) {
7c673cae
FG
8701 ss << "unable to parse weight value '"
8702 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
8703 err = -EINVAL;
8704 goto reply;
8705 }
8706
8707 err = newcrush.adjust_item_weightf(g_ceph_context, id, w);
8708 if (err < 0)
8709 goto reply;
8710 pending_inc.crush.clear();
8711 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8712 ss << "reweighted item id " << id << " name '" << name << "' to " << w
8713 << " in crush map";
8714 getline(ss, rs);
8715 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8716 get_last_committed() + 1));
8717 return true;
8718 } else if (prefix == "osd crush reweight-subtree") {
8719 // osd crush reweight <name> <weight>
8720 CrushWrapper newcrush;
8721 _get_pending_crush(newcrush);
8722
8723 string name;
f64942e4 8724 cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
7c673cae
FG
8725 if (!newcrush.name_exists(name)) {
8726 err = -ENOENT;
8727 ss << "device '" << name << "' does not appear in the crush map";
8728 goto reply;
8729 }
8730
8731 int id = newcrush.get_item_id(name);
8732 if (id >= 0) {
8733 ss << "device '" << name << "' is not a subtree in the crush map";
8734 err = -EINVAL;
8735 goto reply;
8736 }
8737 double w;
f64942e4 8738 if (!cmd_getval_throws(g_ceph_context, cmdmap, "weight", w)) {
7c673cae
FG
8739 ss << "unable to parse weight value '"
8740 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
8741 err = -EINVAL;
8742 goto reply;
8743 }
8744
8745 err = newcrush.adjust_subtree_weightf(g_ceph_context, id, w);
8746 if (err < 0)
8747 goto reply;
8748 pending_inc.crush.clear();
8749 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8750 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
8751 << " in crush map";
8752 getline(ss, rs);
8753 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8754 get_last_committed() + 1));
8755 return true;
8756 } else if (prefix == "osd crush tunables") {
8757 CrushWrapper newcrush;
8758 _get_pending_crush(newcrush);
8759
8760 err = 0;
8761 string profile;
f64942e4 8762 cmd_getval_throws(g_ceph_context, cmdmap, "profile", profile);
7c673cae
FG
8763 if (profile == "legacy" || profile == "argonaut") {
8764 newcrush.set_tunables_legacy();
8765 } else if (profile == "bobtail") {
8766 newcrush.set_tunables_bobtail();
8767 } else if (profile == "firefly") {
8768 newcrush.set_tunables_firefly();
8769 } else if (profile == "hammer") {
8770 newcrush.set_tunables_hammer();
8771 } else if (profile == "jewel") {
8772 newcrush.set_tunables_jewel();
8773 } else if (profile == "optimal") {
8774 newcrush.set_tunables_optimal();
8775 } else if (profile == "default") {
8776 newcrush.set_tunables_default();
8777 } else {
8778 ss << "unrecognized profile '" << profile << "'";
8779 err = -EINVAL;
8780 goto reply;
8781 }
8782
8783 if (!validate_crush_against_features(&newcrush, ss)) {
8784 err = -EINVAL;
8785 goto reply;
8786 }
8787
8788 pending_inc.crush.clear();
8789 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8790 ss << "adjusted tunables profile to " << profile;
8791 getline(ss, rs);
8792 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8793 get_last_committed() + 1));
8794 return true;
8795 } else if (prefix == "osd crush set-tunable") {
8796 CrushWrapper newcrush;
8797 _get_pending_crush(newcrush);
8798
8799 err = 0;
8800 string tunable;
f64942e4 8801 cmd_getval_throws(g_ceph_context, cmdmap, "tunable", tunable);
7c673cae
FG
8802
8803 int64_t value = -1;
f64942e4 8804 if (!cmd_getval_throws(g_ceph_context, cmdmap, "value", value)) {
7c673cae
FG
8805 err = -EINVAL;
8806 ss << "failed to parse integer value " << cmd_vartype_stringify(cmdmap["value"]);
8807 goto reply;
8808 }
8809
8810 if (tunable == "straw_calc_version") {
224ce89b 8811 if (value != 0 && value != 1) {
7c673cae
FG
8812 ss << "value must be 0 or 1; got " << value;
8813 err = -EINVAL;
8814 goto reply;
8815 }
8816 newcrush.set_straw_calc_version(value);
8817 } else {
8818 ss << "unrecognized tunable '" << tunable << "'";
8819 err = -EINVAL;
8820 goto reply;
8821 }
8822
8823 if (!validate_crush_against_features(&newcrush, ss)) {
8824 err = -EINVAL;
8825 goto reply;
8826 }
8827
8828 pending_inc.crush.clear();
8829 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8830 ss << "adjusted tunable " << tunable << " to " << value;
8831 getline(ss, rs);
8832 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8833 get_last_committed() + 1));
8834 return true;
8835
8836 } else if (prefix == "osd crush rule create-simple") {
8837 string name, root, type, mode;
f64942e4
AA
8838 cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
8839 cmd_getval_throws(g_ceph_context, cmdmap, "root", root);
8840 cmd_getval_throws(g_ceph_context, cmdmap, "type", type);
8841 cmd_getval_throws(g_ceph_context, cmdmap, "mode", mode);
7c673cae
FG
8842 if (mode == "")
8843 mode = "firstn";
8844
8845 if (osdmap.crush->rule_exists(name)) {
31f18b77
FG
8846 // The name is uniquely associated to a ruleid and the rule it contains
8847 // From the user point of view, the rule is more meaningfull.
8848 ss << "rule " << name << " already exists";
7c673cae
FG
8849 err = 0;
8850 goto reply;
8851 }
8852
8853 CrushWrapper newcrush;
8854 _get_pending_crush(newcrush);
8855
8856 if (newcrush.rule_exists(name)) {
31f18b77
FG
8857 // The name is uniquely associated to a ruleid and the rule it contains
8858 // From the user point of view, the rule is more meaningfull.
8859 ss << "rule " << name << " already exists";
7c673cae
FG
8860 err = 0;
8861 } else {
224ce89b 8862 int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
7c673cae
FG
8863 pg_pool_t::TYPE_REPLICATED, &ss);
8864 if (ruleno < 0) {
8865 err = ruleno;
8866 goto reply;
8867 }
8868
8869 pending_inc.crush.clear();
8870 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8871 }
8872 getline(ss, rs);
8873 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8874 get_last_committed() + 1));
8875 return true;
8876
224ce89b
WB
8877 } else if (prefix == "osd crush rule create-replicated") {
8878 string name, root, type, device_class;
f64942e4
AA
8879 cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
8880 cmd_getval_throws(g_ceph_context, cmdmap, "root", root);
8881 cmd_getval_throws(g_ceph_context, cmdmap, "type", type);
8882 cmd_getval_throws(g_ceph_context, cmdmap, "class", device_class);
224ce89b
WB
8883
8884 if (!device_class.empty()) {
8885 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
8886 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
8887 << "luminous' before using crush device classes";
8888 err = -EPERM;
8889 goto reply;
8890 }
8891 }
8892
8893 if (osdmap.crush->rule_exists(name)) {
8894 // The name is uniquely associated to a ruleid and the rule it contains
8895 // From the user point of view, the rule is more meaningfull.
8896 ss << "rule " << name << " already exists";
8897 err = 0;
8898 goto reply;
8899 }
8900
8901 CrushWrapper newcrush;
8902 _get_pending_crush(newcrush);
8903
8904 if (newcrush.rule_exists(name)) {
8905 // The name is uniquely associated to a ruleid and the rule it contains
8906 // From the user point of view, the rule is more meaningfull.
8907 ss << "rule " << name << " already exists";
8908 err = 0;
8909 } else {
8910 int ruleno = newcrush.add_simple_rule(
8911 name, root, type, device_class,
8912 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
8913 if (ruleno < 0) {
8914 err = ruleno;
8915 goto reply;
8916 }
8917
8918 pending_inc.crush.clear();
8919 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8920 }
8921 getline(ss, rs);
8922 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8923 get_last_committed() + 1));
8924 return true;
8925
7c673cae
FG
8926 } else if (prefix == "osd erasure-code-profile rm") {
8927 string name;
f64942e4 8928 cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
7c673cae
FG
8929
8930 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
8931 goto wait;
8932
8933 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
8934 err = -EBUSY;
8935 goto reply;
8936 }
8937
8938 if (osdmap.has_erasure_code_profile(name) ||
8939 pending_inc.new_erasure_code_profiles.count(name)) {
8940 if (osdmap.has_erasure_code_profile(name)) {
8941 pending_inc.old_erasure_code_profiles.push_back(name);
8942 } else {
8943 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
8944 pending_inc.new_erasure_code_profiles.erase(name);
8945 }
8946
8947 getline(ss, rs);
8948 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8949 get_last_committed() + 1));
8950 return true;
8951 } else {
8952 ss << "erasure-code-profile " << name << " does not exist";
8953 err = 0;
8954 goto reply;
8955 }
8956
8957 } else if (prefix == "osd erasure-code-profile set") {
8958 string name;
f64942e4 8959 cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
7c673cae 8960 vector<string> profile;
f64942e4 8961 cmd_getval_throws(g_ceph_context, cmdmap, "profile", profile);
7c673cae
FG
8962 bool force;
8963 if (profile.size() > 0 && profile.back() == "--force") {
8964 profile.pop_back();
8965 force = true;
8966 } else {
8967 force = false;
8968 }
8969 map<string,string> profile_map;
8970 err = parse_erasure_code_profile(profile, &profile_map, &ss);
8971 if (err)
8972 goto reply;
8973 if (profile_map.find("plugin") == profile_map.end()) {
8974 ss << "erasure-code-profile " << profile_map
8975 << " must contain a plugin entry" << std::endl;
8976 err = -EINVAL;
8977 goto reply;
8978 }
8979 string plugin = profile_map["plugin"];
8980
8981 if (pending_inc.has_erasure_code_profile(name)) {
8982 dout(20) << "erasure code profile " << name << " try again" << dendl;
8983 goto wait;
8984 } else {
8985 if (plugin == "isa" || plugin == "lrc") {
8986 err = check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2, ss);
8987 if (err == -EAGAIN)
8988 goto wait;
8989 if (err)
8990 goto reply;
8991 } else if (plugin == "shec") {
8992 err = check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3, ss);
8993 if (err == -EAGAIN)
8994 goto wait;
8995 if (err)
8996 goto reply;
8997 }
8998 err = normalize_profile(name, profile_map, force, &ss);
8999 if (err)
9000 goto reply;
9001
9002 if (osdmap.has_erasure_code_profile(name)) {
9003 ErasureCodeProfile existing_profile_map =
9004 osdmap.get_erasure_code_profile(name);
9005 err = normalize_profile(name, existing_profile_map, force, &ss);
9006 if (err)
9007 goto reply;
9008
9009 if (existing_profile_map == profile_map) {
9010 err = 0;
9011 goto reply;
9012 }
9013 if (!force) {
9014 err = -EPERM;
9015 ss << "will not override erasure code profile " << name
9016 << " because the existing profile "
9017 << existing_profile_map
9018 << " is different from the proposed profile "
9019 << profile_map;
9020 goto reply;
9021 }
9022 }
9023
9024 dout(20) << "erasure code profile set " << name << "="
9025 << profile_map << dendl;
9026 pending_inc.set_erasure_code_profile(name, profile_map);
9027 }
9028
9029 getline(ss, rs);
9030 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9031 get_last_committed() + 1));
9032 return true;
9033
9034 } else if (prefix == "osd crush rule create-erasure") {
9035 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
9036 if (err == -EAGAIN)
9037 goto wait;
9038 if (err)
9039 goto reply;
9040 string name, poolstr;
f64942e4 9041 cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
7c673cae 9042 string profile;
f64942e4 9043 cmd_getval_throws(g_ceph_context, cmdmap, "profile", profile);
7c673cae
FG
9044 if (profile == "")
9045 profile = "default";
9046 if (profile == "default") {
9047 if (!osdmap.has_erasure_code_profile(profile)) {
9048 if (pending_inc.has_erasure_code_profile(profile)) {
9049 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
9050 goto wait;
9051 }
9052
9053 map<string,string> profile_map;
9054 err = osdmap.get_erasure_code_profile_default(g_ceph_context,
9055 profile_map,
9056 &ss);
9057 if (err)
9058 goto reply;
9059 err = normalize_profile(name, profile_map, true, &ss);
9060 if (err)
9061 goto reply;
9062 dout(20) << "erasure code profile set " << profile << "="
9063 << profile_map << dendl;
9064 pending_inc.set_erasure_code_profile(profile, profile_map);
9065 goto wait;
9066 }
9067 }
9068
31f18b77
FG
9069 int rule;
9070 err = crush_rule_create_erasure(name, profile, &rule, &ss);
7c673cae
FG
9071 if (err < 0) {
9072 switch(err) {
9073 case -EEXIST: // return immediately
9074 ss << "rule " << name << " already exists";
9075 err = 0;
9076 goto reply;
9077 break;
9078 case -EALREADY: // wait for pending to be proposed
9079 ss << "rule " << name << " already exists";
9080 err = 0;
9081 break;
9082 default: // non recoverable error
9083 goto reply;
9084 break;
9085 }
9086 } else {
31f18b77 9087 ss << "created rule " << name << " at " << rule;
7c673cae
FG
9088 }
9089
9090 getline(ss, rs);
9091 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9092 get_last_committed() + 1));
9093 return true;
9094
9095 } else if (prefix == "osd crush rule rm") {
9096 string name;
f64942e4 9097 cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
7c673cae
FG
9098
9099 if (!osdmap.crush->rule_exists(name)) {
9100 ss << "rule " << name << " does not exist";
9101 err = 0;
9102 goto reply;
9103 }
9104
9105 CrushWrapper newcrush;
9106 _get_pending_crush(newcrush);
9107
9108 if (!newcrush.rule_exists(name)) {
9109 ss << "rule " << name << " does not exist";
9110 err = 0;
9111 } else {
9112 int ruleno = newcrush.get_rule_id(name);
9113 assert(ruleno >= 0);
9114
9115 // make sure it is not in use.
9116 // FIXME: this is ok in some situations, but let's not bother with that
9117 // complexity now.
9118 int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
3efd9988 9119 if (osdmap.crush_rule_in_use(ruleset)) {
7c673cae
FG
9120 ss << "crush ruleset " << name << " " << ruleset << " is in use";
9121 err = -EBUSY;
9122 goto reply;
9123 }
9124
9125 err = newcrush.remove_rule(ruleno);
9126 if (err < 0) {
9127 goto reply;
9128 }
9129
9130 pending_inc.crush.clear();
9131 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9132 }
9133 getline(ss, rs);
9134 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9135 get_last_committed() + 1));
9136 return true;
9137
b5b8bbf5
FG
9138 } else if (prefix == "osd crush rule rename") {
9139 string srcname;
9140 string dstname;
f64942e4
AA
9141 cmd_getval_throws(g_ceph_context, cmdmap, "srcname", srcname);
9142 cmd_getval_throws(g_ceph_context, cmdmap, "dstname", dstname);
b5b8bbf5
FG
9143 if (srcname.empty() || dstname.empty()) {
9144 ss << "must specify both source rule name and destination rule name";
9145 err = -EINVAL;
9146 goto reply;
9147 }
9148 if (srcname == dstname) {
9149 ss << "destination rule name is equal to source rule name";
9150 err = 0;
9151 goto reply;
9152 }
9153
9154 CrushWrapper newcrush;
9155 _get_pending_crush(newcrush);
181888fb
FG
9156 if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
9157 // srcname does not exist and dstname already exists
9158 // suppose this is a replay and return success
9159 // (so this command is idempotent)
9160 ss << "already renamed to '" << dstname << "'";
9161 err = 0;
9162 goto reply;
9163 }
9164
b5b8bbf5
FG
9165 err = newcrush.rename_rule(srcname, dstname, &ss);
9166 if (err < 0) {
9167 // ss has reason for failure
9168 goto reply;
9169 }
9170 pending_inc.crush.clear();
9171 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9172 getline(ss, rs);
9173 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9174 get_last_committed() + 1));
9175 return true;
9176
7c673cae
FG
9177 } else if (prefix == "osd setmaxosd") {
9178 int64_t newmax;
f64942e4 9179 if (!cmd_getval_throws(g_ceph_context, cmdmap, "newmax", newmax)) {
7c673cae
FG
9180 ss << "unable to parse 'newmax' value '"
9181 << cmd_vartype_stringify(cmdmap["newmax"]) << "'";
9182 err = -EINVAL;
9183 goto reply;
9184 }
9185
9186 if (newmax > g_conf->mon_max_osd) {
9187 err = -ERANGE;
9188 ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
9189 << g_conf->mon_max_osd << ")";
9190 goto reply;
9191 }
9192
9193 // Don't allow shrinking OSD number as this will cause data loss
9194 // and may cause kernel crashes.
9195 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
9196 if (newmax < osdmap.get_max_osd()) {
9197 // Check if the OSDs exist between current max and new value.
9198 // If there are any OSDs exist, then don't allow shrinking number
9199 // of OSDs.
9200 for (int i = newmax; i < osdmap.get_max_osd(); i++) {
9201 if (osdmap.exists(i)) {
9202 err = -EBUSY;
9203 ss << "cannot shrink max_osd to " << newmax
9204 << " because osd." << i << " (and possibly others) still in use";
9205 goto reply;
9206 }
9207 }
9208 }
9209
9210 pending_inc.new_max_osd = newmax;
9211 ss << "set new max_osd = " << pending_inc.new_max_osd;
9212 getline(ss, rs);
9213 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9214 get_last_committed() + 1));
9215 return true;
9216
9217 } else if (prefix == "osd set-full-ratio" ||
9218 prefix == "osd set-backfillfull-ratio" ||
9219 prefix == "osd set-nearfull-ratio") {
31f18b77 9220 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
224ce89b
WB
9221 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
9222 << "luminous' before using the new interface";
7c673cae
FG
9223 err = -EPERM;
9224 goto reply;
9225 }
9226 double n;
f64942e4 9227 if (!cmd_getval_throws(g_ceph_context, cmdmap, "ratio", n)) {
7c673cae 9228 ss << "unable to parse 'ratio' value '"
224ce89b 9229 << cmd_vartype_stringify(cmdmap["ratio"]) << "'";
7c673cae
FG
9230 err = -EINVAL;
9231 goto reply;
9232 }
9233 if (prefix == "osd set-full-ratio")
9234 pending_inc.new_full_ratio = n;
9235 else if (prefix == "osd set-backfillfull-ratio")
9236 pending_inc.new_backfillfull_ratio = n;
9237 else if (prefix == "osd set-nearfull-ratio")
9238 pending_inc.new_nearfull_ratio = n;
9239 ss << prefix << " " << n;
9240 getline(ss, rs);
9241 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9242 get_last_committed() + 1));
9243 return true;
9244 } else if (prefix == "osd set-require-min-compat-client") {
31f18b77 9245 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
224ce89b
WB
9246 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
9247 << "luminous' before using the new interface";
7c673cae
FG
9248 err = -EPERM;
9249 goto reply;
9250 }
9251 string v;
f64942e4 9252 cmd_getval_throws(g_ceph_context, cmdmap, "version", v);
31f18b77
FG
9253 int vno = ceph_release_from_name(v.c_str());
9254 if (vno <= 0) {
7c673cae
FG
9255 ss << "version " << v << " is not recognized";
9256 err = -EINVAL;
9257 goto reply;
9258 }
9259 OSDMap newmap;
9260 newmap.deepish_copy_from(osdmap);
9261 newmap.apply_incremental(pending_inc);
31f18b77
FG
9262 newmap.require_min_compat_client = vno;
9263 auto mvno = newmap.get_min_compat_client();
9264 if (vno < mvno) {
9265 ss << "osdmap current utilizes features that require "
9266 << ceph_release_name(mvno)
9267 << "; cannot set require_min_compat_client below that to "
9268 << ceph_release_name(vno);
7c673cae
FG
9269 err = -EPERM;
9270 goto reply;
9271 }
31f18b77 9272 string sure;
f64942e4 9273 cmd_getval_throws(g_ceph_context, cmdmap, "sure", sure);
31f18b77
FG
9274 if (sure != "--yes-i-really-mean-it") {
9275 FeatureMap m;
9276 mon->get_combined_feature_map(&m);
9277 uint64_t features = ceph_release_features(vno);
9278 bool first = true;
9279 bool ok = true;
9280 for (int type : {
9281 CEPH_ENTITY_TYPE_CLIENT,
9282 CEPH_ENTITY_TYPE_MDS,
9283 CEPH_ENTITY_TYPE_MGR }) {
9284 auto p = m.m.find(type);
9285 if (p == m.m.end()) {
9286 continue;
9287 }
9288 for (auto& q : p->second) {
9289 uint64_t missing = ~q.first & features;
9290 if (missing) {
9291 if (first) {
9292 ss << "cannot set require_min_compat_client to " << v << ": ";
9293 } else {
9294 ss << "; ";
9295 }
9296 first = false;
9297 ss << q.second << " connected " << ceph_entity_type_name(type)
9298 << "(s) look like " << ceph_release_name(
9299 ceph_release_from_features(q.first))
9300 << " (missing 0x" << std::hex << missing << std::dec << ")";
9301 ok = false;
9302 }
9303 }
9304 }
9305 if (!ok) {
9306 ss << "; add --yes-i-really-mean-it to do it anyway";
9307 err = -EPERM;
9308 goto reply;
9309 }
9310 }
9311 ss << "set require_min_compat_client to " << ceph_release_name(vno);
9312 pending_inc.new_require_min_compat_client = vno;
7c673cae
FG
9313 getline(ss, rs);
9314 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9315 get_last_committed() + 1));
9316 return true;
9317 } else if (prefix == "osd pause") {
9318 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
9319
9320 } else if (prefix == "osd unpause") {
9321 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
9322
9323 } else if (prefix == "osd set") {
3efd9988 9324 string sure;
f64942e4 9325 cmd_getval_throws(g_ceph_context, cmdmap, "sure", sure);
7c673cae 9326 string key;
f64942e4 9327 cmd_getval_throws(g_ceph_context, cmdmap, "key", key);
7c673cae
FG
9328 if (key == "full")
9329 return prepare_set_flag(op, CEPH_OSDMAP_FULL);
9330 else if (key == "pause")
9331 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
9332 else if (key == "noup")
9333 return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
9334 else if (key == "nodown")
9335 return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
9336 else if (key == "noout")
9337 return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
9338 else if (key == "noin")
9339 return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
9340 else if (key == "nobackfill")
9341 return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
9342 else if (key == "norebalance")
9343 return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
9344 else if (key == "norecover")
9345 return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
9346 else if (key == "noscrub")
9347 return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
9348 else if (key == "nodeep-scrub")
9349 return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
9350 else if (key == "notieragent")
9351 return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
9352 else if (key == "sortbitwise") {
3efd9988
FG
9353 if (!osdmap.get_num_up_osds() && sure != "--yes-i-really-mean-it") {
9354 ss << "Not advisable to continue since no OSDs are up. Pass "
9355 << "--yes-i-really-mean-it if you really wish to continue.";
9356 err = -EPERM;
9357 goto reply;
9358 }
9359 if ((osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)
9360 || sure == "--yes-i-really-mean-it") {
7c673cae
FG
9361 return prepare_set_flag(op, CEPH_OSDMAP_SORTBITWISE);
9362 } else {
9363 ss << "not all up OSDs have OSD_BITWISE_HOBJ_SORT feature";
9364 err = -EPERM;
31f18b77 9365 goto reply;
7c673cae 9366 }
c07f9fc5 9367 } else if (key == "recovery_deletes") {
3efd9988
FG
9368 if (!osdmap.get_num_up_osds() && sure != "--yes-i-really-mean-it") {
9369 ss << "Not advisable to continue since no OSDs are up. Pass "
9370 << "--yes-i-really-mean-it if you really wish to continue.";
9371 err = -EPERM;
9372 goto reply;
9373 }
9374 if (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_RECOVERY_DELETES)
9375 || sure == "--yes-i-really-mean-it") {
c07f9fc5
FG
9376 return prepare_set_flag(op, CEPH_OSDMAP_RECOVERY_DELETES);
9377 } else {
9378 ss << "not all up OSDs have OSD_RECOVERY_DELETES feature";
9379 err = -EPERM;
9380 goto reply;
9381 }
f64942e4
AA
9382 } else if (key == "pglog_hardlimit") {
9383 if (!osdmap.get_num_up_osds() && sure != "--yes-i-really-mean-it") {
9384 ss << "Not advisable to continue since no OSDs are up. Pass "
9385 << "--yes-i-really-mean-it if you really wish to continue.";
9386 err = -EPERM;
9387 goto reply;
9388 }
9389 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
9390 // we are reusing a jewel feature bit that was retired in luminous.
9391 if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
9392 (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
9393 || sure == "--yes-i-really-mean-it")) {
9394 return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
9395 } else {
9396 ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
9397 err = -EPERM;
9398 goto reply;
9399 }
7c673cae 9400 } else if (key == "require_jewel_osds") {
3efd9988
FG
9401 if (!osdmap.get_num_up_osds() && sure != "--yes-i-really-mean-it") {
9402 ss << "Not advisable to continue since no OSDs are up. Pass "
9403 << "--yes-i-really-mean-it if you really wish to continue.";
9404 err = -EPERM;
9405 goto reply;
9406 }
7c673cae
FG
9407 if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
9408 ss << "the sortbitwise flag must be set before require_jewel_osds";
9409 err = -EPERM;
31f18b77
FG
9410 goto reply;
9411 } else if (osdmap.require_osd_release >= CEPH_RELEASE_JEWEL) {
9412 ss << "require_osd_release is already >= jewel";
9413 err = 0;
9414 goto reply;
3efd9988
FG
9415 } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_JEWEL)
9416 || sure == "--yes-i-really-mean-it") {
7c673cae
FG
9417 return prepare_set_flag(op, CEPH_OSDMAP_REQUIRE_JEWEL);
9418 } else {
9419 ss << "not all up OSDs have CEPH_FEATURE_SERVER_JEWEL feature";
9420 err = -EPERM;
9421 }
9422 } else if (key == "require_kraken_osds") {
3efd9988
FG
9423 if (!osdmap.get_num_up_osds() && sure != "--yes-i-really-mean-it") {
9424 ss << "Not advisable to continue since no OSDs are up. Pass "
9425 << "--yes-i-really-mean-it if you really wish to continue.";
9426 err = -EPERM;
9427 goto reply;
9428 }
7c673cae
FG
9429 if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
9430 ss << "the sortbitwise flag must be set before require_kraken_osds";
9431 err = -EPERM;
31f18b77
FG
9432 goto reply;
9433 } else if (osdmap.require_osd_release >= CEPH_RELEASE_KRAKEN) {
9434 ss << "require_osd_release is already >= kraken";
9435 err = 0;
9436 goto reply;
3efd9988
FG
9437 } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_KRAKEN)
9438 || sure == "--yes-i-really-mean-it") {
7c673cae
FG
9439 bool r = prepare_set_flag(op, CEPH_OSDMAP_REQUIRE_KRAKEN);
9440 // ensure JEWEL is also set
9441 pending_inc.new_flags |= CEPH_OSDMAP_REQUIRE_JEWEL;
9442 return r;
9443 } else {
9444 ss << "not all up OSDs have CEPH_FEATURE_SERVER_KRAKEN feature";
9445 err = -EPERM;
9446 }
7c673cae
FG
9447 } else {
9448 ss << "unrecognized flag '" << key << "'";
9449 err = -EINVAL;
9450 }
9451
9452 } else if (prefix == "osd unset") {
9453 string key;
f64942e4 9454 cmd_getval_throws(g_ceph_context, cmdmap, "key", key);
7c673cae
FG
9455 if (key == "full")
9456 return prepare_unset_flag(op, CEPH_OSDMAP_FULL);
9457 else if (key == "pause")
9458 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
9459 else if (key == "noup")
9460 return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
9461 else if (key == "nodown")
9462 return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
9463 else if (key == "noout")
9464 return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
9465 else if (key == "noin")
9466 return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
9467 else if (key == "nobackfill")
9468 return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
9469 else if (key == "norebalance")
9470 return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
9471 else if (key == "norecover")
9472 return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
9473 else if (key == "noscrub")
9474 return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
9475 else if (key == "nodeep-scrub")
9476 return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
9477 else if (key == "notieragent")
9478 return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
224ce89b 9479 else {
7c673cae
FG
9480 ss << "unrecognized flag '" << key << "'";
9481 err = -EINVAL;
9482 }
9483
31f18b77
FG
9484 } else if (prefix == "osd require-osd-release") {
9485 string release;
f64942e4 9486 cmd_getval_throws(g_ceph_context, cmdmap, "release", release);
3efd9988 9487 string sure;
f64942e4 9488 cmd_getval_throws(g_ceph_context, cmdmap, "sure", sure);
31f18b77
FG
9489 if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
9490 ss << "the sortbitwise flag must be set first";
9491 err = -EPERM;
9492 goto reply;
9493 }
9494 int rel = ceph_release_from_name(release.c_str());
9495 if (rel <= 0) {
9496 ss << "unrecognized release " << release;
9497 err = -EINVAL;
9498 goto reply;
9499 }
9500 if (rel < CEPH_RELEASE_LUMINOUS) {
9501 ss << "use this command only for luminous and later";
9502 err = -EINVAL;
9503 goto reply;
9504 }
d2e6a577
FG
9505 if (rel == osdmap.require_osd_release) {
9506 // idempotent
9507 err = 0;
9508 goto reply;
9509 }
31f18b77
FG
9510 if (rel == CEPH_RELEASE_LUMINOUS) {
9511 if (!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_LUMINOUS)) {
9512 ss << "not all up OSDs have CEPH_FEATURE_SERVER_LUMINOUS feature";
9513 err = -EPERM;
9514 goto reply;
9515 }
9516 } else {
9517 ss << "not supported for this release yet";
9518 err = -EPERM;
9519 goto reply;
9520 }
9521 if (rel < osdmap.require_osd_release) {
9522 ss << "require_osd_release cannot be lowered once it has been set";
9523 err = -EPERM;
9524 goto reply;
9525 }
9526 pending_inc.new_require_osd_release = rel;
c07f9fc5
FG
9527 if (rel >= CEPH_RELEASE_LUMINOUS &&
9528 !osdmap.test_flag(CEPH_OSDMAP_RECOVERY_DELETES)) {
9529 return prepare_set_flag(op, CEPH_OSDMAP_RECOVERY_DELETES);
9530 }
31f18b77 9531 goto update;
7c673cae
FG
9532 } else if (prefix == "osd cluster_snap") {
9533 // ** DISABLE THIS FOR NOW **
9534 ss << "cluster snapshot currently disabled (broken implementation)";
9535 // ** DISABLE THIS FOR NOW **
9536
9537 } else if (prefix == "osd down" ||
9538 prefix == "osd out" ||
9539 prefix == "osd in" ||
9540 prefix == "osd rm") {
9541
9542 bool any = false;
31f18b77
FG
9543 bool stop = false;
9544 bool verbose = true;
7c673cae
FG
9545
9546 vector<string> idvec;
f64942e4 9547 cmd_getval_throws(g_ceph_context, cmdmap, "ids", idvec);
31f18b77
FG
9548 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9549 set<int> osds;
9550
9551 // wildcard?
9552 if (j == 0 &&
9553 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9554 if (prefix == "osd in") {
9555 // touch out osds only
9556 osdmap.get_out_osds(osds);
9557 } else {
9558 osdmap.get_all_osds(osds);
9559 }
9560 stop = true;
9561 verbose = false; // so the output is less noisy.
9562 } else {
9563 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9564 if (osd < 0) {
9565 ss << "invalid osd id" << osd;
9566 err = -EINVAL;
9567 continue;
9568 } else if (!osdmap.exists(osd)) {
9569 ss << "osd." << osd << " does not exist. ";
9570 continue;
9571 }
9572
9573 osds.insert(osd);
7c673cae 9574 }
31f18b77
FG
9575
9576 for (auto &osd : osds) {
9577 if (prefix == "osd down") {
9578 if (osdmap.is_down(osd)) {
9579 if (verbose)
9580 ss << "osd." << osd << " is already down. ";
9581 } else {
9582 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
9583 ss << "marked down osd." << osd << ". ";
9584 any = true;
9585 }
9586 } else if (prefix == "osd out") {
9587 if (osdmap.is_out(osd)) {
9588 if (verbose)
9589 ss << "osd." << osd << " is already out. ";
9590 } else {
9591 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
9592 if (osdmap.osd_weight[osd]) {
9593 if (pending_inc.new_xinfo.count(osd) == 0) {
9594 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
9595 }
9596 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
7c673cae 9597 }
31f18b77 9598 ss << "marked out osd." << osd << ". ";
224ce89b
WB
9599 std::ostringstream msg;
9600 msg << "Client " << op->get_session()->entity_name
9601 << " marked osd." << osd << " out";
9602 if (osdmap.is_up(osd)) {
9603 msg << ", while it was still marked up";
9604 } else {
3efd9988
FG
9605 auto period = ceph_clock_now() - down_pending_out[osd];
9606 msg << ", after it was down for " << int(period.sec())
224ce89b
WB
9607 << " seconds";
9608 }
9609
9610 mon->clog->info() << msg.str();
31f18b77 9611 any = true;
7c673cae 9612 }
31f18b77
FG
9613 } else if (prefix == "osd in") {
9614 if (osdmap.is_in(osd)) {
9615 if (verbose)
9616 ss << "osd." << osd << " is already in. ";
9617 } else {
9618 if (osdmap.osd_xinfo[osd].old_weight > 0) {
9619 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
9620 if (pending_inc.new_xinfo.count(osd) == 0) {
9621 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
9622 }
9623 pending_inc.new_xinfo[osd].old_weight = 0;
9624 } else {
9625 pending_inc.new_weight[osd] = CEPH_OSD_IN;
7c673cae 9626 }
31f18b77
FG
9627 ss << "marked in osd." << osd << ". ";
9628 any = true;
9629 }
9630 } else if (prefix == "osd rm") {
9631 err = prepare_command_osd_remove(osd);
9632
9633 if (err == -EBUSY) {
9634 if (any)
9635 ss << ", ";
9636 ss << "osd." << osd << " is still up; must be down before removal. ";
7c673cae 9637 } else {
31f18b77
FG
9638 assert(err == 0);
9639 if (any) {
9640 ss << ", osd." << osd;
9641 } else {
9642 ss << "removed osd." << osd;
9643 }
9644 any = true;
7c673cae 9645 }
31f18b77
FG
9646 }
9647 }
9648 }
9649 if (any) {
9650 getline(ss, rs);
9651 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
9652 get_last_committed() + 1));
9653 return true;
9654 }
9655 } else if (prefix == "osd add-noup" ||
9656 prefix == "osd add-nodown" ||
9657 prefix == "osd add-noin" ||
9658 prefix == "osd add-noout") {
9659
9660 enum {
9661 OP_NOUP,
9662 OP_NODOWN,
9663 OP_NOIN,
9664 OP_NOOUT,
9665 } option;
9666
9667 if (prefix == "osd add-noup") {
9668 option = OP_NOUP;
9669 } else if (prefix == "osd add-nodown") {
9670 option = OP_NODOWN;
9671 } else if (prefix == "osd add-noin") {
9672 option = OP_NOIN;
9673 } else {
9674 option = OP_NOOUT;
9675 }
9676
9677 bool any = false;
9678 bool stop = false;
9679
9680 vector<string> idvec;
f64942e4 9681 cmd_getval_throws(g_ceph_context, cmdmap, "ids", idvec);
31f18b77
FG
9682 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9683
9684 set<int> osds;
9685
9686 // wildcard?
9687 if (j == 0 &&
9688 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9689 osdmap.get_all_osds(osds);
9690 stop = true;
9691 } else {
9692 // try traditional single osd way
9693
9694 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9695 if (osd < 0) {
9696 // ss has reason for failure
9697 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9698 err = -EINVAL;
9699 continue;
9700 }
9701
9702 osds.insert(osd);
9703 }
9704
9705 for (auto &osd : osds) {
9706
9707 if (!osdmap.exists(osd)) {
9708 ss << "osd." << osd << " does not exist. ";
9709 continue;
9710 }
9711
9712 switch (option) {
9713 case OP_NOUP:
9714 if (osdmap.is_up(osd)) {
9715 ss << "osd." << osd << " is already up. ";
9716 continue;
9717 }
9718
9719 if (osdmap.is_noup(osd)) {
9720 if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP))
9721 any = true;
7c673cae 9722 } else {
31f18b77
FG
9723 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
9724 any = true;
7c673cae 9725 }
31f18b77
FG
9726
9727 break;
9728
9729 case OP_NODOWN:
9730 if (osdmap.is_down(osd)) {
9731 ss << "osd." << osd << " is already down. ";
9732 continue;
9733 }
9734
9735 if (osdmap.is_nodown(osd)) {
9736 if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN))
9737 any = true;
9738 } else {
9739 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
9740 any = true;
9741 }
9742
9743 break;
9744
9745 case OP_NOIN:
9746 if (osdmap.is_in(osd)) {
9747 ss << "osd." << osd << " is already in. ";
9748 continue;
9749 }
9750
9751 if (osdmap.is_noin(osd)) {
9752 if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN))
9753 any = true;
9754 } else {
9755 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
9756 any = true;
9757 }
9758
9759 break;
9760
9761 case OP_NOOUT:
9762 if (osdmap.is_out(osd)) {
9763 ss << "osd." << osd << " is already out. ";
9764 continue;
9765 }
9766
9767 if (osdmap.is_noout(osd)) {
9768 if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT))
9769 any = true;
9770 } else {
9771 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
9772 any = true;
9773 }
9774
9775 break;
9776
9777 default:
9778 assert(0 == "invalid option");
9779 }
7c673cae
FG
9780 }
9781 }
31f18b77 9782
7c673cae
FG
9783 if (any) {
9784 getline(ss, rs);
9785 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
31f18b77
FG
9786 get_last_committed() + 1));
9787 return true;
9788 }
9789 } else if (prefix == "osd rm-noup" ||
9790 prefix == "osd rm-nodown" ||
9791 prefix == "osd rm-noin" ||
9792 prefix == "osd rm-noout") {
9793
9794 enum {
9795 OP_NOUP,
9796 OP_NODOWN,
9797 OP_NOIN,
9798 OP_NOOUT,
9799 } option;
9800
9801 if (prefix == "osd rm-noup") {
9802 option = OP_NOUP;
9803 } else if (prefix == "osd rm-nodown") {
9804 option = OP_NODOWN;
9805 } else if (prefix == "osd rm-noin") {
9806 option = OP_NOIN;
9807 } else {
9808 option = OP_NOOUT;
9809 }
9810
9811 bool any = false;
9812 bool stop = false;
9813
9814 vector<string> idvec;
f64942e4 9815 cmd_getval_throws(g_ceph_context, cmdmap, "ids", idvec);
31f18b77
FG
9816
9817 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9818
9819 vector<int> osds;
9820
9821 // wildcard?
9822 if (j == 0 &&
9823 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9824
9825 // touch previous noup/nodown/noin/noout osds only
9826 switch (option) {
9827 case OP_NOUP:
9828 osdmap.get_noup_osds(&osds);
9829 break;
9830 case OP_NODOWN:
9831 osdmap.get_nodown_osds(&osds);
9832 break;
9833 case OP_NOIN:
9834 osdmap.get_noin_osds(&osds);
9835 break;
9836 case OP_NOOUT:
9837 osdmap.get_noout_osds(&osds);
9838 break;
9839 default:
9840 assert(0 == "invalid option");
9841 }
9842
9843 // cancel any pending noup/nodown/noin/noout requests too
9844 vector<int> pending_state_osds;
9845 (void) pending_inc.get_pending_state_osds(&pending_state_osds);
9846 for (auto &p : pending_state_osds) {
9847
9848 switch (option) {
9849 case OP_NOUP:
9850 if (!osdmap.is_noup(p) &&
9851 pending_inc.pending_osd_state_clear(p, CEPH_OSD_NOUP)) {
9852 any = true;
9853 }
9854 break;
9855
9856 case OP_NODOWN:
9857 if (!osdmap.is_nodown(p) &&
9858 pending_inc.pending_osd_state_clear(p, CEPH_OSD_NODOWN)) {
9859 any = true;
9860 }
9861 break;
9862
9863 case OP_NOIN:
9864 if (!osdmap.is_noin(p) &&
9865 pending_inc.pending_osd_state_clear(p, CEPH_OSD_NOIN)) {
9866 any = true;
9867 }
9868 break;
9869
9870 case OP_NOOUT:
9871 if (!osdmap.is_noout(p) &&
9872 pending_inc.pending_osd_state_clear(p, CEPH_OSD_NOOUT)) {
9873 any = true;
9874 }
9875 break;
9876
9877 default:
9878 assert(0 == "invalid option");
9879 }
9880 }
9881
9882 stop = true;
9883 } else {
9884 // try traditional single osd way
9885
9886 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9887 if (osd < 0) {
9888 // ss has reason for failure
9889 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9890 err = -EINVAL;
9891 continue;
9892 }
9893
9894 osds.push_back(osd);
9895 }
9896
9897 for (auto &osd : osds) {
9898
9899 if (!osdmap.exists(osd)) {
9900 ss << "osd." << osd << " does not exist. ";
9901 continue;
9902 }
9903
9904 switch (option) {
9905 case OP_NOUP:
9906 if (osdmap.is_noup(osd)) {
9907 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
9908 any = true;
9909 } else if (pending_inc.pending_osd_state_clear(
9910 osd, CEPH_OSD_NOUP)) {
9911 any = true;
9912 }
9913 break;
9914
9915 case OP_NODOWN:
9916 if (osdmap.is_nodown(osd)) {
9917 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
9918 any = true;
9919 } else if (pending_inc.pending_osd_state_clear(
9920 osd, CEPH_OSD_NODOWN)) {
9921 any = true;
9922 }
9923 break;
9924
9925 case OP_NOIN:
9926 if (osdmap.is_noin(osd)) {
9927 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
9928 any = true;
9929 } else if (pending_inc.pending_osd_state_clear(
9930 osd, CEPH_OSD_NOIN)) {
9931 any = true;
9932 }
9933 break;
9934
9935 case OP_NOOUT:
9936 if (osdmap.is_noout(osd)) {
9937 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
9938 any = true;
9939 } else if (pending_inc.pending_osd_state_clear(
9940 osd, CEPH_OSD_NOOUT)) {
9941 any = true;
9942 }
9943 break;
9944
9945 default:
9946 assert(0 == "invalid option");
9947 }
9948 }
9949 }
9950
9951 if (any) {
9952 getline(ss, rs);
9953 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
9954 get_last_committed() + 1));
7c673cae
FG
9955 return true;
9956 }
9957 } else if (prefix == "osd pg-temp") {
9958 string pgidstr;
f64942e4 9959 if (!cmd_getval_throws(g_ceph_context, cmdmap, "pgid", pgidstr)) {
7c673cae
FG
9960 ss << "unable to parse 'pgid' value '"
9961 << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
9962 err = -EINVAL;
9963 goto reply;
9964 }
9965 pg_t pgid;
9966 if (!pgid.parse(pgidstr.c_str())) {
9967 ss << "invalid pgid '" << pgidstr << "'";
9968 err = -EINVAL;
9969 goto reply;
9970 }
9971 if (!osdmap.pg_exists(pgid)) {
9972 ss << "pg " << pgid << " does not exist";
9973 err = -ENOENT;
9974 goto reply;
9975 }
9976 if (pending_inc.new_pg_temp.count(pgid)) {
9977 dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
9978 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9979 return true;
9980 }
9981
9982 vector<int64_t> id_vec;
9983 vector<int32_t> new_pg_temp;
f64942e4 9984 if (!cmd_getval_throws(g_ceph_context, cmdmap, "id", id_vec)) {
7c673cae
FG
9985 ss << "unable to parse 'id' value(s) '"
9986 << cmd_vartype_stringify(cmdmap["id"]) << "'";
9987 err = -EINVAL;
9988 goto reply;
9989 }
9990 for (auto osd : id_vec) {
9991 if (!osdmap.exists(osd)) {
9992 ss << "osd." << osd << " does not exist";
9993 err = -ENOENT;
9994 goto reply;
9995 }
9996 new_pg_temp.push_back(osd);
9997 }
9998
224ce89b
WB
9999 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
10000 if ((int)new_pg_temp.size() < pool_min_size) {
10001 ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
10002 << pool_min_size << ")";
10003 err = -EINVAL;
10004 goto reply;
10005 }
10006
10007 int pool_size = osdmap.get_pg_pool_size(pgid);
10008 if ((int)new_pg_temp.size() > pool_size) {
10009 ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
10010 << pool_size << ")";
10011 err = -EINVAL;
10012 goto reply;
10013 }
10014
7c673cae
FG
10015 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
10016 new_pg_temp.begin(), new_pg_temp.end());
10017 ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
10018 goto update;
10019 } else if (prefix == "osd primary-temp") {
10020 string pgidstr;
f64942e4 10021 if (!cmd_getval_throws(g_ceph_context, cmdmap, "pgid", pgidstr)) {
7c673cae
FG
10022 ss << "unable to parse 'pgid' value '"
10023 << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
10024 err = -EINVAL;
10025 goto reply;
10026 }
10027 pg_t pgid;
10028 if (!pgid.parse(pgidstr.c_str())) {
10029 ss << "invalid pgid '" << pgidstr << "'";
10030 err = -EINVAL;
10031 goto reply;
10032 }
10033 if (!osdmap.pg_exists(pgid)) {
10034 ss << "pg " << pgid << " does not exist";
10035 err = -ENOENT;
10036 goto reply;
10037 }
10038
10039 int64_t osd;
f64942e4 10040 if (!cmd_getval_throws(g_ceph_context, cmdmap, "id", osd)) {
7c673cae
FG
10041 ss << "unable to parse 'id' value '"
10042 << cmd_vartype_stringify(cmdmap["id"]) << "'";
10043 err = -EINVAL;
10044 goto reply;
10045 }
10046 if (osd != -1 && !osdmap.exists(osd)) {
10047 ss << "osd." << osd << " does not exist";
10048 err = -ENOENT;
10049 goto reply;
10050 }
10051
31f18b77
FG
10052 if (osdmap.require_min_compat_client > 0 &&
10053 osdmap.require_min_compat_client < CEPH_RELEASE_FIREFLY) {
10054 ss << "require_min_compat_client "
10055 << ceph_release_name(osdmap.require_min_compat_client)
7c673cae
FG
10056 << " < firefly, which is required for primary-temp";
10057 err = -EPERM;
10058 goto reply;
10059 } else if (!g_conf->mon_osd_allow_primary_temp) {
10060 ss << "you must enable 'mon osd allow primary temp = true' on the mons before you can set primary_temp mappings. note that this is for developers only: older clients/OSDs will break and there is no feature bit infrastructure in place.";
10061 err = -EPERM;
10062 goto reply;
10063 }
10064
10065 pending_inc.new_primary_temp[pgid] = osd;
10066 ss << "set " << pgid << " primary_temp mapping to " << osd;
10067 goto update;
224ce89b
WB
10068 } else if (prefix == "osd pg-upmap" ||
10069 prefix == "osd rm-pg-upmap" ||
10070 prefix == "osd pg-upmap-items" ||
10071 prefix == "osd rm-pg-upmap-items") {
31f18b77 10072 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
224ce89b
WB
10073 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
10074 << "luminous' before using the new interface";
7c673cae
FG
10075 err = -EPERM;
10076 goto reply;
10077 }
31f18b77
FG
10078 if (osdmap.require_min_compat_client < CEPH_RELEASE_LUMINOUS) {
10079 ss << "min_compat_client "
10080 << ceph_release_name(osdmap.require_min_compat_client)
224ce89b
WB
10081 << " < luminous, which is required for pg-upmap. "
10082 << "Try 'ceph osd set-require-min-compat-client luminous' "
10083 << "before using the new interface";
7c673cae
FG
10084 err = -EPERM;
10085 goto reply;
10086 }
10087 err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
10088 if (err == -EAGAIN)
10089 goto wait;
10090 if (err < 0)
10091 goto reply;
10092 string pgidstr;
f64942e4 10093 if (!cmd_getval_throws(g_ceph_context, cmdmap, "pgid", pgidstr)) {
7c673cae
FG
10094 ss << "unable to parse 'pgid' value '"
10095 << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
10096 err = -EINVAL;
10097 goto reply;
10098 }
10099 pg_t pgid;
10100 if (!pgid.parse(pgidstr.c_str())) {
10101 ss << "invalid pgid '" << pgidstr << "'";
10102 err = -EINVAL;
10103 goto reply;
10104 }
10105 if (!osdmap.pg_exists(pgid)) {
10106 ss << "pg " << pgid << " does not exist";
10107 err = -ENOENT;
10108 goto reply;
10109 }
94b18763
FG
10110 if (pending_inc.old_pools.count(pgid.pool())) {
10111 ss << "pool of " << pgid << " is pending removal";
10112 err = -ENOENT;
10113 getline(ss, rs);
10114 wait_for_finished_proposal(op,
10115 new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
10116 return true;
10117 }
224ce89b
WB
10118
10119 enum {
10120 OP_PG_UPMAP,
10121 OP_RM_PG_UPMAP,
10122 OP_PG_UPMAP_ITEMS,
10123 OP_RM_PG_UPMAP_ITEMS,
10124 } option;
10125
10126 if (prefix == "osd pg-upmap") {
10127 option = OP_PG_UPMAP;
10128 } else if (prefix == "osd rm-pg-upmap") {
10129 option = OP_RM_PG_UPMAP;
10130 } else if (prefix == "osd pg-upmap-items") {
10131 option = OP_PG_UPMAP_ITEMS;
10132 } else {
10133 option = OP_RM_PG_UPMAP_ITEMS;
7c673cae 10134 }
224ce89b
WB
10135
10136 // check pending upmap changes
10137 switch (option) {
10138 case OP_PG_UPMAP: // fall through
10139 case OP_RM_PG_UPMAP:
10140 if (pending_inc.new_pg_upmap.count(pgid) ||
10141 pending_inc.old_pg_upmap.count(pgid)) {
10142 dout(10) << __func__ << " waiting for pending update on "
10143 << pgid << dendl;
10144 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10145 return true;
7c673cae 10146 }
224ce89b 10147 break;
7c673cae 10148
224ce89b
WB
10149 case OP_PG_UPMAP_ITEMS: // fall through
10150 case OP_RM_PG_UPMAP_ITEMS:
10151 if (pending_inc.new_pg_upmap_items.count(pgid) ||
10152 pending_inc.old_pg_upmap_items.count(pgid)) {
10153 dout(10) << __func__ << " waiting for pending update on "
10154 << pgid << dendl;
10155 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10156 return true;
10157 }
10158 break;
7c673cae 10159
224ce89b
WB
10160 default:
10161 assert(0 == "invalid option");
7c673cae 10162 }
224ce89b
WB
10163
10164 switch (option) {
10165 case OP_PG_UPMAP:
10166 {
10167 vector<int64_t> id_vec;
f64942e4 10168 if (!cmd_getval_throws(g_ceph_context, cmdmap, "id", id_vec)) {
224ce89b
WB
10169 ss << "unable to parse 'id' value(s) '"
10170 << cmd_vartype_stringify(cmdmap["id"]) << "'";
10171 err = -EINVAL;
10172 goto reply;
10173 }
10174
10175 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
10176 if ((int)id_vec.size() < pool_min_size) {
10177 ss << "num of osds (" << id_vec.size() <<") < pool min size ("
10178 << pool_min_size << ")";
10179 err = -EINVAL;
10180 goto reply;
10181 }
10182
10183 int pool_size = osdmap.get_pg_pool_size(pgid);
10184 if ((int)id_vec.size() > pool_size) {
10185 ss << "num of osds (" << id_vec.size() <<") > pool size ("
10186 << pool_size << ")";
10187 err = -EINVAL;
10188 goto reply;
10189 }
10190
10191 vector<int32_t> new_pg_upmap;
10192 for (auto osd : id_vec) {
10193 if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
10194 ss << "osd." << osd << " does not exist";
10195 err = -ENOENT;
10196 goto reply;
10197 }
10198 auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
10199 if (it != new_pg_upmap.end()) {
10200 ss << "osd." << osd << " already exists, ";
10201 continue;
10202 }
10203 new_pg_upmap.push_back(osd);
10204 }
10205
10206 if (new_pg_upmap.empty()) {
10207 ss << "no valid upmap items(pairs) is specified";
10208 err = -EINVAL;
10209 goto reply;
10210 }
10211
10212 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
10213 new_pg_upmap.begin(), new_pg_upmap.end());
10214 ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
7c673cae 10215 }
224ce89b
WB
10216 break;
10217
10218 case OP_RM_PG_UPMAP:
10219 {
10220 pending_inc.old_pg_upmap.insert(pgid);
10221 ss << "clear " << pgid << " pg_upmap mapping";
7c673cae 10222 }
224ce89b 10223 break;
7c673cae 10224
224ce89b
WB
10225 case OP_PG_UPMAP_ITEMS:
10226 {
10227 vector<int64_t> id_vec;
f64942e4 10228 if (!cmd_getval_throws(g_ceph_context, cmdmap, "id", id_vec)) {
224ce89b
WB
10229 ss << "unable to parse 'id' value(s) '"
10230 << cmd_vartype_stringify(cmdmap["id"]) << "'";
10231 err = -EINVAL;
10232 goto reply;
10233 }
10234
10235 if (id_vec.size() % 2) {
10236 ss << "you must specify pairs of osd ids to be remapped";
10237 err = -EINVAL;
10238 goto reply;
10239 }
10240
10241 int pool_size = osdmap.get_pg_pool_size(pgid);
10242 if ((int)(id_vec.size() / 2) > pool_size) {
10243 ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
10244 << pool_size << ")";
10245 err = -EINVAL;
10246 goto reply;
10247 }
10248
10249 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
10250 ostringstream items;
10251 items << "[";
10252 for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
10253 int from = *p++;
10254 int to = *p;
10255 if (from == to) {
10256 ss << "from osd." << from << " == to osd." << to << ", ";
10257 continue;
10258 }
10259 if (!osdmap.exists(from)) {
10260 ss << "osd." << from << " does not exist";
10261 err = -ENOENT;
10262 goto reply;
10263 }
10264 if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
10265 ss << "osd." << to << " does not exist";
10266 err = -ENOENT;
10267 goto reply;
10268 }
c07f9fc5
FG
10269 pair<int32_t,int32_t> entry = make_pair(from, to);
10270 auto it = std::find(new_pg_upmap_items.begin(),
10271 new_pg_upmap_items.end(), entry);
10272 if (it != new_pg_upmap_items.end()) {
10273 ss << "osd." << from << " -> osd." << to << " already exists, ";
10274 continue;
10275 }
10276 new_pg_upmap_items.push_back(entry);
224ce89b
WB
10277 items << from << "->" << to << ",";
10278 }
10279 string out(items.str());
10280 out.resize(out.size() - 1); // drop last ','
10281 out += "]";
10282
10283 if (new_pg_upmap_items.empty()) {
10284 ss << "no valid upmap items(pairs) is specified";
10285 err = -EINVAL;
10286 goto reply;
10287 }
10288
10289 pending_inc.new_pg_upmap_items[pgid] =
10290 mempool::osdmap::vector<pair<int32_t,int32_t>>(
10291 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
10292 ss << "set " << pgid << " pg_upmap_items mapping to " << out;
10293 }
10294 break;
10295
10296 case OP_RM_PG_UPMAP_ITEMS:
10297 {
10298 pending_inc.old_pg_upmap_items.insert(pgid);
10299 ss << "clear " << pgid << " pg_upmap_items mapping";
10300 }
10301 break;
10302
10303 default:
10304 assert(0 == "invalid option");
7c673cae
FG
10305 }
10306
7c673cae
FG
10307 goto update;
10308 } else if (prefix == "osd primary-affinity") {
10309 int64_t id;
f64942e4 10310 if (!cmd_getval_throws(g_ceph_context, cmdmap, "id", id)) {
7c673cae
FG
10311 ss << "invalid osd id value '"
10312 << cmd_vartype_stringify(cmdmap["id"]) << "'";
10313 err = -EINVAL;
10314 goto reply;
10315 }
10316 double w;
f64942e4 10317 if (!cmd_getval_throws(g_ceph_context, cmdmap, "weight", w)) {
7c673cae
FG
10318 ss << "unable to parse 'weight' value '"
10319 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
10320 err = -EINVAL;
10321 goto reply;
10322 }
10323 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
10324 if (ww < 0L) {
10325 ss << "weight must be >= 0";
10326 err = -EINVAL;
10327 goto reply;
10328 }
31f18b77
FG
10329 if (osdmap.require_min_compat_client > 0 &&
10330 osdmap.require_min_compat_client < CEPH_RELEASE_FIREFLY) {
10331 ss << "require_min_compat_client "
10332 << ceph_release_name(osdmap.require_min_compat_client)
7c673cae
FG
10333 << " < firefly, which is required for primary-affinity";
10334 err = -EPERM;
10335 goto reply;
10336 } else if (!g_conf->mon_osd_allow_primary_affinity) {
10337 ss << "you must enable 'mon osd allow primary affinity = true' on the mons before you can adjust primary-affinity. note that older clients will no longer be able to communicate with the cluster.";
10338 err = -EPERM;
10339 goto reply;
10340 }
10341 err = check_cluster_features(CEPH_FEATURE_OSD_PRIMARY_AFFINITY, ss);
10342 if (err == -EAGAIN)
10343 goto wait;
10344 if (err < 0)
10345 goto reply;
10346 if (osdmap.exists(id)) {
10347 pending_inc.new_primary_affinity[id] = ww;
10348 ss << "set osd." << id << " primary-affinity to " << w << " (" << ios::hex << ww << ios::dec << ")";
10349 getline(ss, rs);
10350 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10351 get_last_committed() + 1));
10352 return true;
10353 } else {
10354 ss << "osd." << id << " does not exist";
10355 err = -ENOENT;
10356 goto reply;
10357 }
10358 } else if (prefix == "osd reweight") {
10359 int64_t id;
f64942e4 10360 if (!cmd_getval_throws(g_ceph_context, cmdmap, "id", id)) {
7c673cae
FG
10361 ss << "unable to parse osd id value '"
10362 << cmd_vartype_stringify(cmdmap["id"]) << "'";
10363 err = -EINVAL;
10364 goto reply;
10365 }
10366 double w;
f64942e4 10367 if (!cmd_getval_throws(g_ceph_context, cmdmap, "weight", w)) {
7c673cae
FG
10368 ss << "unable to parse weight value '"
10369 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
10370 err = -EINVAL;
10371 goto reply;
10372 }
10373 long ww = (int)((double)CEPH_OSD_IN*w);
10374 if (ww < 0L) {
10375 ss << "weight must be >= 0";
10376 err = -EINVAL;
10377 goto reply;
10378 }
10379 if (osdmap.exists(id)) {
10380 pending_inc.new_weight[id] = ww;
10381 ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
10382 getline(ss, rs);
10383 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10384 get_last_committed() + 1));
10385 return true;
10386 } else {
10387 ss << "osd." << id << " does not exist";
10388 err = -ENOENT;
10389 goto reply;
10390 }
10391 } else if (prefix == "osd reweightn") {
10392 map<int32_t, uint32_t> weights;
10393 err = parse_reweights(g_ceph_context, cmdmap, osdmap, &weights);
10394 if (err) {
10395 ss << "unable to parse 'weights' value '"
10396 << cmd_vartype_stringify(cmdmap["weights"]) << "'";
10397 goto reply;
10398 }
10399 pending_inc.new_weight.insert(weights.begin(), weights.end());
10400 wait_for_finished_proposal(
10401 op,
10402 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
224ce89b 10403 return true;
7c673cae
FG
10404 } else if (prefix == "osd lost") {
10405 int64_t id;
f64942e4 10406 if (!cmd_getval_throws(g_ceph_context, cmdmap, "id", id)) {
7c673cae
FG
10407 ss << "unable to parse osd id value '"
10408 << cmd_vartype_stringify(cmdmap["id"]) << "'";
10409 err = -EINVAL;
10410 goto reply;
10411 }
10412 string sure;
f64942e4 10413 if (!cmd_getval_throws(g_ceph_context, cmdmap, "sure", sure) || sure != "--yes-i-really-mean-it") {
7c673cae
FG
10414 ss << "are you SURE? this might mean real, permanent data loss. pass "
10415 "--yes-i-really-mean-it if you really do.";
10416 err = -EPERM;
10417 goto reply;
10418 } else if (!osdmap.exists(id)) {
10419 ss << "osd." << id << " does not exist";
10420 err = -ENOENT;
10421 goto reply;
10422 } else if (!osdmap.is_down(id)) {
10423 ss << "osd." << id << " is not down";
10424 err = -EBUSY;
10425 goto reply;
10426 } else {
10427 epoch_t e = osdmap.get_info(id).down_at;
10428 pending_inc.new_lost[id] = e;
10429 ss << "marked osd lost in epoch " << e;
10430 getline(ss, rs);
10431 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10432 get_last_committed() + 1));
10433 return true;
10434 }
10435
31f18b77
FG
10436 } else if (prefix == "osd destroy" || prefix == "osd purge") {
10437 /* Destroying an OSD means that we don't expect to further make use of
10438 * the OSDs data (which may even become unreadable after this operation),
10439 * and that we are okay with scrubbing all its cephx keys and config-key
10440 * data (which may include lockbox keys, thus rendering the osd's data
10441 * unreadable).
10442 *
10443 * The OSD will not be removed. Instead, we will mark it as destroyed,
10444 * such that a subsequent call to `create` will not reuse the osd id.
10445 * This will play into being able to recreate the OSD, at the same
10446 * crush location, with minimal data movement.
10447 */
10448
10449 // make sure authmon is writeable.
10450 if (!mon->authmon()->is_writeable()) {
10451 dout(10) << __func__ << " waiting for auth mon to be writeable for "
10452 << "osd destroy" << dendl;
10453 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
10454 return false;
10455 }
10456
10457 int64_t id;
f64942e4 10458 if (!cmd_getval_throws(g_ceph_context, cmdmap, "id", id)) {
31f18b77
FG
10459 ss << "unable to parse osd id value '"
10460 << cmd_vartype_stringify(cmdmap["id"]) << "";
10461 err = -EINVAL;
10462 goto reply;
10463 }
10464
10465 bool is_destroy = (prefix == "osd destroy");
10466 if (!is_destroy) {
10467 assert("osd purge" == prefix);
10468 }
10469
10470 string sure;
f64942e4 10471 if (!cmd_getval_throws(g_ceph_context, cmdmap, "sure", sure) ||
31f18b77
FG
10472 sure != "--yes-i-really-mean-it") {
10473 ss << "Are you SURE? This will mean real, permanent data loss, as well "
10474 << "as cephx and lockbox keys. Pass --yes-i-really-mean-it if you "
10475 << "really do.";
10476 err = -EPERM;
10477 goto reply;
d2e6a577 10478 } else if (!osdmap.exists(id)) {
31f18b77 10479 ss << "osd." << id << " does not exist";
d2e6a577 10480 err = 0; // idempotent
31f18b77
FG
10481 goto reply;
10482 } else if (osdmap.is_up(id)) {
10483 ss << "osd." << id << " is not `down`.";
10484 err = -EBUSY;
10485 goto reply;
10486 } else if (is_destroy && osdmap.is_destroyed(id)) {
10487 ss << "destroyed osd." << id;
10488 err = 0;
10489 goto reply;
10490 }
10491
10492 bool goto_reply = false;
10493
10494 paxos->plug();
10495 if (is_destroy) {
10496 err = prepare_command_osd_destroy(id, ss);
10497 // we checked above that it should exist.
10498 assert(err != -ENOENT);
10499 } else {
10500 err = prepare_command_osd_purge(id, ss);
10501 if (err == -ENOENT) {
10502 err = 0;
10503 ss << "osd." << id << " does not exist.";
10504 goto_reply = true;
10505 }
10506 }
10507 paxos->unplug();
10508
10509 if (err < 0 || goto_reply) {
10510 goto reply;
10511 }
10512
10513 if (is_destroy) {
10514 ss << "destroyed osd." << id;
10515 } else {
10516 ss << "purged osd." << id;
10517 }
10518
10519 getline(ss, rs);
10520 wait_for_finished_proposal(op,
10521 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
10522 force_immediate_propose();
10523 return true;
10524
10525 } else if (prefix == "osd new") {
10526
10527 // make sure authmon is writeable.
10528 if (!mon->authmon()->is_writeable()) {
10529 dout(10) << __func__ << " waiting for auth mon to be writeable for "
224ce89b 10530 << "osd new" << dendl;
31f18b77
FG
10531 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
10532 return false;
10533 }
10534
3a9019d9 10535 map<string,string> param_map;
31f18b77
FG
10536
10537 bufferlist bl = m->get_data();
3a9019d9
FG
10538 string param_json = bl.to_str();
10539 dout(20) << __func__ << " osd new json = " << param_json << dendl;
31f18b77 10540
3a9019d9 10541 err = get_json_str_map(param_json, ss, &param_map);
31f18b77
FG
10542 if (err < 0)
10543 goto reply;
10544
3a9019d9 10545 dout(20) << __func__ << " osd new params " << param_map << dendl;
31f18b77
FG
10546
10547 paxos->plug();
3a9019d9 10548 err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
31f18b77
FG
10549 paxos->unplug();
10550
10551 if (err < 0) {
10552 goto reply;
10553 }
10554
10555 if (f) {
10556 f->flush(rdata);
10557 } else {
10558 rdata.append(ss);
10559 }
10560
10561 if (err == EEXIST) {
10562 // idempotent operation
10563 err = 0;
10564 goto reply;
10565 }
10566
10567 wait_for_finished_proposal(op,
10568 new Monitor::C_Command(mon, op, 0, rs, rdata,
10569 get_last_committed() + 1));
10570 force_immediate_propose();
10571 return true;
10572
7c673cae 10573 } else if (prefix == "osd create") {
7c673cae
FG
10574
10575 // optional id provided?
31f18b77 10576 int64_t id = -1, cmd_id = -1;
f64942e4 10577 if (cmd_getval_throws(g_ceph_context, cmdmap, "id", cmd_id)) {
31f18b77
FG
10578 if (cmd_id < 0) {
10579 ss << "invalid osd id value '" << cmd_id << "'";
7c673cae
FG
10580 err = -EINVAL;
10581 goto reply;
10582 }
31f18b77 10583 dout(10) << " osd create got id " << cmd_id << dendl;
7c673cae
FG
10584 }
10585
7c673cae
FG
10586 uuid_d uuid;
10587 string uuidstr;
f64942e4 10588 if (cmd_getval_throws(g_ceph_context, cmdmap, "uuid", uuidstr)) {
7c673cae 10589 if (!uuid.parse(uuidstr.c_str())) {
31f18b77
FG
10590 ss << "invalid uuid value '" << uuidstr << "'";
10591 err = -EINVAL;
10592 goto reply;
7c673cae 10593 }
31f18b77
FG
10594 // we only care about the id if we also have the uuid, to
10595 // ensure the operation's idempotency.
10596 id = cmd_id;
7c673cae
FG
10597 }
10598
31f18b77
FG
10599 int32_t new_id = -1;
10600 err = prepare_command_osd_create(id, uuid, &new_id, ss);
10601 if (err < 0) {
10602 if (err == -EAGAIN) {
10603 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10604 return true;
10605 }
10606 // a check has failed; reply to the user.
10607 goto reply;
10608
10609 } else if (err == EEXIST) {
10610 // this is an idempotent operation; we can go ahead and reply.
10611 if (f) {
10612 f->open_object_section("created_osd");
10613 f->dump_int("osdid", new_id);
10614 f->close_section();
10615 f->flush(rdata);
10616 } else {
10617 ss << new_id;
10618 rdata.append(ss);
7c673cae 10619 }
31f18b77
FG
10620 err = 0;
10621 goto reply;
7c673cae
FG
10622 }
10623
3a9019d9
FG
10624 string empty_device_class;
10625 do_osd_create(id, uuid, empty_device_class, &new_id);
31f18b77 10626
7c673cae
FG
10627 if (f) {
10628 f->open_object_section("created_osd");
31f18b77 10629 f->dump_int("osdid", new_id);
7c673cae
FG
10630 f->close_section();
10631 f->flush(rdata);
10632 } else {
31f18b77 10633 ss << new_id;
7c673cae
FG
10634 rdata.append(ss);
10635 }
31f18b77
FG
10636 wait_for_finished_proposal(op,
10637 new Monitor::C_Command(mon, op, 0, rs, rdata,
10638 get_last_committed() + 1));
7c673cae
FG
10639 return true;
10640
10641 } else if (prefix == "osd blacklist clear") {
10642 pending_inc.new_blacklist.clear();
10643 std::list<std::pair<entity_addr_t,utime_t > > blacklist;
10644 osdmap.get_blacklist(&blacklist);
10645 for (const auto &entry : blacklist) {
10646 pending_inc.old_blacklist.push_back(entry.first);
10647 }
10648 ss << " removed all blacklist entries";
10649 getline(ss, rs);
10650 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10651 get_last_committed() + 1));
10652 return true;
10653 } else if (prefix == "osd blacklist") {
10654 string addrstr;
f64942e4 10655 cmd_getval_throws(g_ceph_context, cmdmap, "addr", addrstr);
7c673cae
FG
10656 entity_addr_t addr;
10657 if (!addr.parse(addrstr.c_str(), 0)) {
10658 ss << "unable to parse address " << addrstr;
10659 err = -EINVAL;
10660 goto reply;
10661 }
10662 else {
10663 string blacklistop;
f64942e4 10664 cmd_getval_throws(g_ceph_context, cmdmap, "blacklistop", blacklistop);
7c673cae
FG
10665 if (blacklistop == "add") {
10666 utime_t expires = ceph_clock_now();
10667 double d;
10668 // default one hour
f64942e4 10669 cmd_getval_throws(g_ceph_context, cmdmap, "expire", d,
224ce89b 10670 g_conf->mon_osd_blacklist_default_expire);
7c673cae
FG
10671 expires += d;
10672
10673 pending_inc.new_blacklist[addr] = expires;
224ce89b
WB
10674
10675 {
10676 // cancel any pending un-blacklisting request too
10677 auto it = std::find(pending_inc.old_blacklist.begin(),
10678 pending_inc.old_blacklist.end(), addr);
10679 if (it != pending_inc.old_blacklist.end()) {
10680 pending_inc.old_blacklist.erase(it);
10681 }
10682 }
10683
7c673cae
FG
10684 ss << "blacklisting " << addr << " until " << expires << " (" << d << " sec)";
10685 getline(ss, rs);
10686 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10687 get_last_committed() + 1));
10688 return true;
10689 } else if (blacklistop == "rm") {
10690 if (osdmap.is_blacklisted(addr) ||
10691 pending_inc.new_blacklist.count(addr)) {
10692 if (osdmap.is_blacklisted(addr))
10693 pending_inc.old_blacklist.push_back(addr);
10694 else
10695 pending_inc.new_blacklist.erase(addr);
10696 ss << "un-blacklisting " << addr;
10697 getline(ss, rs);
10698 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10699 get_last_committed() + 1));
10700 return true;
10701 }
10702 ss << addr << " isn't blacklisted";
10703 err = 0;
10704 goto reply;
10705 }
10706 }
10707 } else if (prefix == "osd pool mksnap") {
10708 string poolstr;
f64942e4 10709 cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
7c673cae
FG
10710 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
10711 if (pool < 0) {
10712 ss << "unrecognized pool '" << poolstr << "'";
10713 err = -ENOENT;
10714 goto reply;
10715 }
10716 string snapname;
f64942e4 10717 cmd_getval_throws(g_ceph_context, cmdmap, "snap", snapname);
7c673cae
FG
10718 const pg_pool_t *p = osdmap.get_pg_pool(pool);
10719 if (p->is_unmanaged_snaps_mode()) {
10720 ss << "pool " << poolstr << " is in unmanaged snaps mode";
10721 err = -EINVAL;
10722 goto reply;
10723 } else if (p->snap_exists(snapname.c_str())) {
10724 ss << "pool " << poolstr << " snap " << snapname << " already exists";
10725 err = 0;
10726 goto reply;
10727 } else if (p->is_tier()) {
10728 ss << "pool " << poolstr << " is a cache tier";
10729 err = -EINVAL;
10730 goto reply;
10731 }
10732 pg_pool_t *pp = 0;
10733 if (pending_inc.new_pools.count(pool))
10734 pp = &pending_inc.new_pools[pool];
10735 if (!pp) {
10736 pp = &pending_inc.new_pools[pool];
10737 *pp = *p;
10738 }
10739 if (pp->snap_exists(snapname.c_str())) {
10740 ss << "pool " << poolstr << " snap " << snapname << " already exists";
10741 } else {
10742 pp->add_snap(snapname.c_str(), ceph_clock_now());
10743 pp->set_snap_epoch(pending_inc.epoch);
10744 ss << "created pool " << poolstr << " snap " << snapname;
10745 }
10746 getline(ss, rs);
10747 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10748 get_last_committed() + 1));
10749 return true;
10750 } else if (prefix == "osd pool rmsnap") {
10751 string poolstr;
f64942e4 10752 cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
7c673cae
FG
10753 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
10754 if (pool < 0) {
10755 ss << "unrecognized pool '" << poolstr << "'";
10756 err = -ENOENT;
10757 goto reply;
10758 }
10759 string snapname;
f64942e4 10760 cmd_getval_throws(g_ceph_context, cmdmap, "snap", snapname);
7c673cae
FG
10761 const pg_pool_t *p = osdmap.get_pg_pool(pool);
10762 if (p->is_unmanaged_snaps_mode()) {
10763 ss << "pool " << poolstr << " is in unmanaged snaps mode";
10764 err = -EINVAL;
10765 goto reply;
10766 } else if (!p->snap_exists(snapname.c_str())) {
10767 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
10768 err = 0;
10769 goto reply;
10770 }
10771 pg_pool_t *pp = 0;
10772 if (pending_inc.new_pools.count(pool))
10773 pp = &pending_inc.new_pools[pool];
10774 if (!pp) {
10775 pp = &pending_inc.new_pools[pool];
10776 *pp = *p;
10777 }
10778 snapid_t sn = pp->snap_exists(snapname.c_str());
10779 if (sn) {
10780 pp->remove_snap(sn);
10781 pp->set_snap_epoch(pending_inc.epoch);
10782 ss << "removed pool " << poolstr << " snap " << snapname;
10783 } else {
10784 ss << "already removed pool " << poolstr << " snap " << snapname;
10785 }
10786 getline(ss, rs);
10787 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10788 get_last_committed() + 1));
10789 return true;
10790 } else if (prefix == "osd pool create") {
10791 int64_t pg_num;
10792 int64_t pgp_num;
f64942e4
AA
10793 cmd_getval_throws(g_ceph_context, cmdmap, "pg_num", pg_num, int64_t(0));
10794 cmd_getval_throws(g_ceph_context, cmdmap, "pgp_num", pgp_num, pg_num);
7c673cae
FG
10795
10796 string pool_type_str;
f64942e4 10797 cmd_getval_throws(g_ceph_context, cmdmap, "pool_type", pool_type_str);
7c673cae 10798 if (pool_type_str.empty())
224ce89b 10799 pool_type_str = g_conf->osd_pool_default_type;
7c673cae
FG
10800
10801 string poolstr;
f64942e4 10802 cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
7c673cae
FG
10803 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10804 if (pool_id >= 0) {
10805 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10806 if (pool_type_str != p->get_type_name()) {
10807 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
10808 err = -EINVAL;
10809 } else {
10810 ss << "pool '" << poolstr << "' already exists";
10811 err = 0;
10812 }
10813 goto reply;
10814 }
10815
10816 int pool_type;
10817 if (pool_type_str == "replicated") {
10818 pool_type = pg_pool_t::TYPE_REPLICATED;
10819 } else if (pool_type_str == "erasure") {
10820 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2 |
10821 CEPH_FEATURE_OSD_ERASURE_CODES,
10822 ss);
10823 if (err == -EAGAIN)
10824 goto wait;
10825 if (err)
10826 goto reply;
10827 pool_type = pg_pool_t::TYPE_ERASURE;
10828 } else {
10829 ss << "unknown pool type '" << pool_type_str << "'";
10830 err = -EINVAL;
10831 goto reply;
10832 }
10833
31f18b77 10834 bool implicit_rule_creation = false;
94b18763 10835 int64_t expected_num_objects = 0;
31f18b77 10836 string rule_name;
f64942e4 10837 cmd_getval_throws(g_ceph_context, cmdmap, "rule", rule_name);
7c673cae 10838 string erasure_code_profile;
f64942e4 10839 cmd_getval_throws(g_ceph_context, cmdmap, "erasure_code_profile", erasure_code_profile);
7c673cae
FG
10840
10841 if (pool_type == pg_pool_t::TYPE_ERASURE) {
10842 if (erasure_code_profile == "")
10843 erasure_code_profile = "default";
10844 //handle the erasure code profile
10845 if (erasure_code_profile == "default") {
10846 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
10847 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
10848 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
10849 goto wait;
10850 }
10851
10852 map<string,string> profile_map;
10853 err = osdmap.get_erasure_code_profile_default(g_ceph_context,
10854 profile_map,
10855 &ss);
10856 if (err)
10857 goto reply;
10858 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
10859 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
10860 goto wait;
10861 }
10862 }
31f18b77
FG
10863 if (rule_name == "") {
10864 implicit_rule_creation = true;
7c673cae 10865 if (erasure_code_profile == "default") {
31f18b77 10866 rule_name = "erasure-code";
7c673cae 10867 } else {
31f18b77 10868 dout(1) << "implicitly use rule named after the pool: "
7c673cae 10869 << poolstr << dendl;
31f18b77 10870 rule_name = poolstr;
7c673cae
FG
10871 }
10872 }
f64942e4 10873 cmd_getval_throws(g_ceph_context, cmdmap, "expected_num_objects",
94b18763 10874 expected_num_objects, int64_t(0));
7c673cae 10875 } else {
31f18b77 10876 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
94b18763
FG
10877 // and put expected_num_objects to rule field
10878 if (erasure_code_profile != "") { // cmd is from CLI
10879 if (rule_name != "") {
10880 string interr;
10881 expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
10882 if (interr.length()) {
10883 ss << "error parsing integer value '" << rule_name << "': " << interr;
10884 err = -EINVAL;
10885 goto reply;
10886 }
10887 }
10888 rule_name = erasure_code_profile;
10889 } else { // cmd is well-formed
f64942e4 10890 cmd_getval_throws(g_ceph_context, cmdmap, "expected_num_objects",
94b18763
FG
10891 expected_num_objects, int64_t(0));
10892 }
7c673cae
FG
10893 }
10894
31f18b77
FG
10895 if (!implicit_rule_creation && rule_name != "") {
10896 int rule;
10897 err = get_crush_rule(rule_name, &rule, &ss);
7c673cae
FG
10898 if (err == -EAGAIN) {
10899 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10900 return true;
10901 }
10902 if (err)
10903 goto reply;
10904 }
10905
7c673cae
FG
10906 if (expected_num_objects < 0) {
10907 ss << "'expected_num_objects' must be non-negative";
10908 err = -EINVAL;
10909 goto reply;
10910 }
10911
91327a77
AA
10912 if (expected_num_objects > 0 &&
10913 cct->_conf->osd_objectstore == "filestore" &&
10914 cct->_conf->filestore_merge_threshold > 0) {
10915 ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
10916 err = -EINVAL;
10917 goto reply;
10918 }
10919
10920 if (expected_num_objects == 0 &&
10921 cct->_conf->osd_objectstore == "filestore" &&
10922 cct->_conf->filestore_merge_threshold < 0) {
10923 int osds = osdmap.get_num_osds();
10924 if (osds && (pg_num >= 1024 || pg_num / osds >= 100)) {
10925 ss << "For better initial performance on pools expected to store a "
10926 << "large number of objects, consider supplying the "
10927 << "expected_num_objects parameter when creating the pool.\n";
10928 }
10929 }
10930
7c673cae 10931 int64_t fast_read_param;
f64942e4 10932 cmd_getval_throws(g_ceph_context, cmdmap, "fast_read", fast_read_param, int64_t(-1));
7c673cae
FG
10933 FastReadType fast_read = FAST_READ_DEFAULT;
10934 if (fast_read_param == 0)
10935 fast_read = FAST_READ_OFF;
10936 else if (fast_read_param > 0)
10937 fast_read = FAST_READ_ON;
10938
10939 err = prepare_new_pool(poolstr, 0, // auid=0 for admin created pool
10940 -1, // default crush rule
31f18b77 10941 rule_name,
7c673cae
FG
10942 pg_num, pgp_num,
10943 erasure_code_profile, pool_type,
10944 (uint64_t)expected_num_objects,
10945 fast_read,
10946 &ss);
10947 if (err < 0) {
10948 switch(err) {
10949 case -EEXIST:
10950 ss << "pool '" << poolstr << "' already exists";
10951 break;
10952 case -EAGAIN:
10953 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10954 return true;
10955 case -ERANGE:
10956 goto reply;
10957 default:
10958 goto reply;
10959 break;
10960 }
10961 } else {
10962 ss << "pool '" << poolstr << "' created";
10963 }
10964 getline(ss, rs);
10965 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10966 get_last_committed() + 1));
10967 return true;
10968
10969 } else if (prefix == "osd pool delete" ||
10970 prefix == "osd pool rm") {
10971 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
10972 string poolstr, poolstr2, sure;
f64942e4
AA
10973 cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
10974 cmd_getval_throws(g_ceph_context, cmdmap, "pool2", poolstr2);
10975 cmd_getval_throws(g_ceph_context, cmdmap, "sure", sure);
7c673cae
FG
10976 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
10977 if (pool < 0) {
10978 ss << "pool '" << poolstr << "' does not exist";
10979 err = 0;
10980 goto reply;
10981 }
10982
10983 bool force_no_fake = sure == "--yes-i-really-really-mean-it-not-faking";
10984 if (poolstr2 != poolstr ||
10985 (sure != "--yes-i-really-really-mean-it" && !force_no_fake)) {
10986 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
10987 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
10988 << "followed by --yes-i-really-really-mean-it.";
10989 err = -EPERM;
10990 goto reply;
10991 }
10992 err = _prepare_remove_pool(pool, &ss, force_no_fake);
10993 if (err == -EAGAIN) {
10994 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10995 return true;
10996 }
10997 if (err < 0)
10998 goto reply;
10999 goto update;
11000 } else if (prefix == "osd pool rename") {
11001 string srcpoolstr, destpoolstr;
f64942e4
AA
11002 cmd_getval_throws(g_ceph_context, cmdmap, "srcpool", srcpoolstr);
11003 cmd_getval_throws(g_ceph_context, cmdmap, "destpool", destpoolstr);
7c673cae
FG
11004 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
11005 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
11006
11007 if (pool_src < 0) {
11008 if (pool_dst >= 0) {
11009 // src pool doesn't exist, dst pool does exist: to ensure idempotency
11010 // of operations, assume this rename succeeded, as it is not changing
11011 // the current state. Make sure we output something understandable
11012 // for whoever is issuing the command, if they are paying attention,
11013 // in case it was not intentional; or to avoid a "wtf?" and a bug
11014 // report in case it was intentional, while expecting a failure.
11015 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
11016 << destpoolstr << "' does -- assuming successful rename";
11017 err = 0;
11018 } else {
11019 ss << "unrecognized pool '" << srcpoolstr << "'";
11020 err = -ENOENT;
11021 }
11022 goto reply;
11023 } else if (pool_dst >= 0) {
11024 // source pool exists and so does the destination pool
11025 ss << "pool '" << destpoolstr << "' already exists";
11026 err = -EEXIST;
11027 goto reply;
11028 }
11029
11030 int ret = _prepare_rename_pool(pool_src, destpoolstr);
11031 if (ret == 0) {
11032 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
11033 } else {
11034 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
11035 << cpp_strerror(ret);
11036 }
11037 getline(ss, rs);
11038 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
11039 get_last_committed() + 1));
11040 return true;
11041
11042 } else if (prefix == "osd pool set") {
11043 err = prepare_command_pool_set(cmdmap, ss);
11044 if (err == -EAGAIN)
11045 goto wait;
11046 if (err < 0)
11047 goto reply;
11048
11049 getline(ss, rs);
11050 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11051 get_last_committed() + 1));
11052 return true;
11053 } else if (prefix == "osd tier add") {
11054 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
11055 if (err == -EAGAIN)
11056 goto wait;
11057 if (err)
11058 goto reply;
11059 string poolstr;
f64942e4 11060 cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
7c673cae
FG
11061 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11062 if (pool_id < 0) {
11063 ss << "unrecognized pool '" << poolstr << "'";
11064 err = -ENOENT;
11065 goto reply;
11066 }
11067 string tierpoolstr;
f64942e4 11068 cmd_getval_throws(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
7c673cae
FG
11069 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
11070 if (tierpool_id < 0) {
11071 ss << "unrecognized pool '" << tierpoolstr << "'";
11072 err = -ENOENT;
11073 goto reply;
11074 }
11075 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11076 assert(p);
11077 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11078 assert(tp);
11079
11080 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
11081 goto reply;
11082 }
11083
11084 // make sure new tier is empty
11085 string force_nonempty;
f64942e4 11086 cmd_getval_throws(g_ceph_context, cmdmap, "force_nonempty", force_nonempty);
31f18b77
FG
11087 const pool_stat_t *pstats = mon->pgservice->get_pool_stat(tierpool_id);
11088 if (pstats && pstats->stats.sum.num_objects != 0 &&
7c673cae
FG
11089 force_nonempty != "--force-nonempty") {
11090 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
11091 err = -ENOTEMPTY;
11092 goto reply;
11093 }
11094 if (tp->ec_pool()) {
11095 ss << "tier pool '" << tierpoolstr
11096 << "' is an ec pool, which cannot be a tier";
11097 err = -ENOTSUP;
11098 goto reply;
11099 }
11100 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
11101 ((force_nonempty != "--force-nonempty") ||
11102 (!g_conf->mon_debug_unsafe_allow_tier_with_nonempty_snaps))) {
11103 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
11104 err = -ENOTEMPTY;
11105 goto reply;
11106 }
11107 // go
11108 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
11109 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
11110 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
11111 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11112 return true;
11113 }
11114 np->tiers.insert(tierpool_id);
11115 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
11116 ntp->tier_of = pool_id;
11117 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
11118 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
11119 get_last_committed() + 1));
11120 return true;
11121 } else if (prefix == "osd tier remove" ||
11122 prefix == "osd tier rm") {
11123 string poolstr;
f64942e4 11124 cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
7c673cae
FG
11125 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11126 if (pool_id < 0) {
11127 ss << "unrecognized pool '" << poolstr << "'";
11128 err = -ENOENT;
11129 goto reply;
11130 }
11131 string tierpoolstr;
f64942e4 11132 cmd_getval_throws(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
7c673cae
FG
11133 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
11134 if (tierpool_id < 0) {
11135 ss << "unrecognized pool '" << tierpoolstr << "'";
11136 err = -ENOENT;
11137 goto reply;
11138 }
11139 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11140 assert(p);
11141 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11142 assert(tp);
11143
11144 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
11145 goto reply;
11146 }
11147
11148 if (p->tiers.count(tierpool_id) == 0) {
11149 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
11150 err = 0;
11151 goto reply;
11152 }
11153 if (tp->tier_of != pool_id) {
11154 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
11155 << osdmap.get_pool_name(tp->tier_of) << "': "
11156 // be scary about it; this is an inconsistency and bells must go off
11157 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
11158 err = -EINVAL;
11159 goto reply;
11160 }
11161 if (p->read_tier == tierpool_id) {
11162 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
11163 err = -EBUSY;
11164 goto reply;
11165 }
11166 // go
11167 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
11168 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
11169 if (np->tiers.count(tierpool_id) == 0 ||
11170 ntp->tier_of != pool_id ||
11171 np->read_tier == tierpool_id) {
11172 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11173 return true;
11174 }
11175 np->tiers.erase(tierpool_id);
11176 ntp->clear_tier();
11177 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
11178 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
11179 get_last_committed() + 1));
11180 return true;
11181 } else if (prefix == "osd tier set-overlay") {
11182 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
11183 if (err == -EAGAIN)
11184 goto wait;
11185 if (err)
11186 goto reply;
11187 string poolstr;
f64942e4 11188 cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
7c673cae
FG
11189 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11190 if (pool_id < 0) {
11191 ss << "unrecognized pool '" << poolstr << "'";
11192 err = -ENOENT;
11193 goto reply;
11194 }
11195 string overlaypoolstr;
f64942e4 11196 cmd_getval_throws(g_ceph_context, cmdmap, "overlaypool", overlaypoolstr);
7c673cae
FG
11197 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
11198 if (overlaypool_id < 0) {
11199 ss << "unrecognized pool '" << overlaypoolstr << "'";
11200 err = -ENOENT;
11201 goto reply;
11202 }
11203 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11204 assert(p);
11205 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
11206 assert(overlay_p);
11207 if (p->tiers.count(overlaypool_id) == 0) {
11208 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
11209 err = -EINVAL;
11210 goto reply;
11211 }
11212 if (p->read_tier == overlaypool_id) {
11213 err = 0;
11214 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
11215 goto reply;
11216 }
11217 if (p->has_read_tier()) {
11218 ss << "pool '" << poolstr << "' has overlay '"
11219 << osdmap.get_pool_name(p->read_tier)
11220 << "'; please remove-overlay first";
11221 err = -EINVAL;
11222 goto reply;
11223 }
11224
11225 // go
11226 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
11227 np->read_tier = overlaypool_id;
11228 np->write_tier = overlaypool_id;
11229 np->set_last_force_op_resend(pending_inc.epoch);
11230 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
11231 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
11232 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
11233 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
11234 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
11235 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
11236 get_last_committed() + 1));
11237 return true;
11238 } else if (prefix == "osd tier remove-overlay" ||
11239 prefix == "osd tier rm-overlay") {
11240 string poolstr;
f64942e4 11241 cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
7c673cae
FG
11242 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11243 if (pool_id < 0) {
11244 ss << "unrecognized pool '" << poolstr << "'";
11245 err = -ENOENT;
11246 goto reply;
11247 }
11248 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11249 assert(p);
11250 if (!p->has_read_tier()) {
11251 err = 0;
11252 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
11253 goto reply;
11254 }
11255
11256 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
11257 goto reply;
11258 }
11259
11260 // go
11261 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
11262 if (np->has_read_tier()) {
11263 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
11264 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
11265 nop->set_last_force_op_resend(pending_inc.epoch);
11266 }
11267 if (np->has_write_tier()) {
11268 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
11269 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
11270 nop->set_last_force_op_resend(pending_inc.epoch);
11271 }
11272 np->clear_read_tier();
11273 np->clear_write_tier();
11274 np->set_last_force_op_resend(pending_inc.epoch);
11275 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
11276 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
11277 get_last_committed() + 1));
11278 return true;
11279 } else if (prefix == "osd tier cache-mode") {
11280 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
11281 if (err == -EAGAIN)
11282 goto wait;
11283 if (err)
11284 goto reply;
11285 string poolstr;
f64942e4 11286 cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
7c673cae
FG
11287 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11288 if (pool_id < 0) {
11289 ss << "unrecognized pool '" << poolstr << "'";
11290 err = -ENOENT;
11291 goto reply;
11292 }
11293 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11294 assert(p);
11295 if (!p->is_tier()) {
11296 ss << "pool '" << poolstr << "' is not a tier";
11297 err = -EINVAL;
11298 goto reply;
11299 }
11300 string modestr;
f64942e4 11301 cmd_getval_throws(g_ceph_context, cmdmap, "mode", modestr);
7c673cae
FG
11302 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
11303 if (mode < 0) {
11304 ss << "'" << modestr << "' is not a valid cache mode";
11305 err = -EINVAL;
11306 goto reply;
11307 }
11308
11309 string sure;
f64942e4 11310 cmd_getval_throws(g_ceph_context, cmdmap, "sure", sure);
7c673cae
FG
11311 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11312 mode != pg_pool_t::CACHEMODE_NONE &&
11313 mode != pg_pool_t::CACHEMODE_PROXY &&
11314 mode != pg_pool_t::CACHEMODE_READPROXY) &&
11315 sure != "--yes-i-really-mean-it") {
11316 ss << "'" << modestr << "' is not a well-supported cache mode and may "
11317 << "corrupt your data. pass --yes-i-really-mean-it to force.";
11318 err = -EPERM;
11319 goto reply;
11320 }
11321
11322 // pool already has this cache-mode set and there are no pending changes
11323 if (p->cache_mode == mode &&
11324 (pending_inc.new_pools.count(pool_id) == 0 ||
11325 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
11326 ss << "set cache-mode for pool '" << poolstr << "'"
11327 << " to " << pg_pool_t::get_cache_mode_name(mode);
11328 err = 0;
11329 goto reply;
11330 }
11331
11332 /* Mode description:
11333 *
11334 * none: No cache-mode defined
11335 * forward: Forward all reads and writes to base pool
11336 * writeback: Cache writes, promote reads from base pool
11337 * readonly: Forward writes to base pool
11338 * readforward: Writes are in writeback mode, Reads are in forward mode
11339 * proxy: Proxy all reads and writes to base pool
11340 * readproxy: Writes are in writeback mode, Reads are in proxy mode
11341 *
11342 * Hence, these are the allowed transitions:
11343 *
11344 * none -> any
11345 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
11346 * proxy -> forward || readforward || readproxy || writeback || any IF num_objects_dirty == 0
11347 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
11348 * readproxy -> forward || proxy || readforward || writeback || any IF num_objects_dirty == 0
11349 * writeback -> readforward || readproxy || forward || proxy
11350 * readonly -> any
11351 */
11352
11353 // We check if the transition is valid against the current pool mode, as
11354 // it is the only committed state thus far. We will blantly squash
11355 // whatever mode is on the pending state.
11356
11357 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
11358 (mode != pg_pool_t::CACHEMODE_FORWARD &&
11359 mode != pg_pool_t::CACHEMODE_PROXY &&
11360 mode != pg_pool_t::CACHEMODE_READFORWARD &&
11361 mode != pg_pool_t::CACHEMODE_READPROXY)) {
11362 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
11363 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
11364 << "' pool; only '"
11365 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_FORWARD)
11366 << "','"
11367 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY)
11368 << "','"
11369 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READFORWARD)
11370 << "','"
11371 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
11372 << "' allowed.";
11373 err = -EINVAL;
11374 goto reply;
11375 }
11376 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
11377 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11378 mode != pg_pool_t::CACHEMODE_FORWARD &&
11379 mode != pg_pool_t::CACHEMODE_PROXY &&
11380 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
11381
11382 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
11383 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11384 mode != pg_pool_t::CACHEMODE_FORWARD &&
11385 mode != pg_pool_t::CACHEMODE_READFORWARD &&
11386 mode != pg_pool_t::CACHEMODE_PROXY)) ||
11387
11388 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
11389 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11390 mode != pg_pool_t::CACHEMODE_FORWARD &&
11391 mode != pg_pool_t::CACHEMODE_READFORWARD &&
11392 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
11393
11394 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
11395 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11396 mode != pg_pool_t::CACHEMODE_READFORWARD &&
11397 mode != pg_pool_t::CACHEMODE_PROXY &&
11398 mode != pg_pool_t::CACHEMODE_READPROXY))) {
11399
31f18b77
FG
11400 const pool_stat_t* pstats =
11401 mon->pgservice->get_pool_stat(pool_id);
7c673cae 11402
31f18b77 11403 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
7c673cae
FG
11404 ss << "unable to set cache-mode '"
11405 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
11406 << "': dirty objects found";
11407 err = -EBUSY;
11408 goto reply;
11409 }
11410 }
11411 // go
11412 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
11413 np->cache_mode = mode;
11414 // set this both when moving to and from cache_mode NONE. this is to
11415 // capture legacy pools that were set up before this flag existed.
11416 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
11417 ss << "set cache-mode for pool '" << poolstr
11418 << "' to " << pg_pool_t::get_cache_mode_name(mode);
11419 if (mode == pg_pool_t::CACHEMODE_NONE) {
11420 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
11421 assert(base_pool);
11422 if (base_pool->read_tier == pool_id ||
11423 base_pool->write_tier == pool_id)
11424 ss <<" (WARNING: pool is still configured as read or write tier)";
11425 }
11426 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
11427 get_last_committed() + 1));
11428 return true;
11429 } else if (prefix == "osd tier add-cache") {
11430 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
11431 if (err == -EAGAIN)
11432 goto wait;
11433 if (err)
11434 goto reply;
11435 string poolstr;
f64942e4 11436 cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
7c673cae
FG
11437 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11438 if (pool_id < 0) {
11439 ss << "unrecognized pool '" << poolstr << "'";
11440 err = -ENOENT;
11441 goto reply;
11442 }
11443 string tierpoolstr;
f64942e4 11444 cmd_getval_throws(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
7c673cae
FG
11445 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
11446 if (tierpool_id < 0) {
11447 ss << "unrecognized pool '" << tierpoolstr << "'";
11448 err = -ENOENT;
11449 goto reply;
11450 }
11451 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11452 assert(p);
11453 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11454 assert(tp);
11455
11456 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
11457 goto reply;
11458 }
11459
11460 int64_t size = 0;
f64942e4 11461 if (!cmd_getval_throws(g_ceph_context, cmdmap, "size", size)) {
7c673cae
FG
11462 ss << "unable to parse 'size' value '"
11463 << cmd_vartype_stringify(cmdmap["size"]) << "'";
11464 err = -EINVAL;
11465 goto reply;
11466 }
11467 // make sure new tier is empty
31f18b77
FG
11468 const pool_stat_t *pstats =
11469 mon->pgservice->get_pool_stat(tierpool_id);
11470 if (pstats && pstats->stats.sum.num_objects != 0) {
7c673cae
FG
11471 ss << "tier pool '" << tierpoolstr << "' is not empty";
11472 err = -ENOTEMPTY;
11473 goto reply;
11474 }
11475 string modestr = g_conf->osd_tier_default_cache_mode;
11476 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
11477 if (mode < 0) {
11478 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
11479 err = -EINVAL;
11480 goto reply;
11481 }
11482 HitSet::Params hsp;
11483 if (g_conf->osd_tier_default_cache_hit_set_type == "bloom") {
11484 BloomHitSet::Params *bsp = new BloomHitSet::Params;
11485 bsp->set_fpp(g_conf->osd_pool_default_hit_set_bloom_fpp);
11486 hsp = HitSet::Params(bsp);
11487 } else if (g_conf->osd_tier_default_cache_hit_set_type == "explicit_hash") {
11488 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
11489 }
11490 else if (g_conf->osd_tier_default_cache_hit_set_type == "explicit_object") {
11491 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
11492 } else {
11493 ss << "osd tier cache default hit set type '" <<
11494 g_conf->osd_tier_default_cache_hit_set_type << "' is not a known type";
11495 err = -EINVAL;
11496 goto reply;
11497 }
11498 // go
11499 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
11500 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
11501 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
11502 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11503 return true;
11504 }
11505 np->tiers.insert(tierpool_id);
11506 np->read_tier = np->write_tier = tierpool_id;
11507 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
11508 np->set_last_force_op_resend(pending_inc.epoch);
11509 ntp->set_last_force_op_resend(pending_inc.epoch);
11510 ntp->tier_of = pool_id;
11511 ntp->cache_mode = mode;
11512 ntp->hit_set_count = g_conf->osd_tier_default_cache_hit_set_count;
11513 ntp->hit_set_period = g_conf->osd_tier_default_cache_hit_set_period;
11514 ntp->min_read_recency_for_promote = g_conf->osd_tier_default_cache_min_read_recency_for_promote;
11515 ntp->min_write_recency_for_promote = g_conf->osd_tier_default_cache_min_write_recency_for_promote;
11516 ntp->hit_set_grade_decay_rate = g_conf->osd_tier_default_cache_hit_set_grade_decay_rate;
11517 ntp->hit_set_search_last_n = g_conf->osd_tier_default_cache_hit_set_search_last_n;
11518 ntp->hit_set_params = hsp;
11519 ntp->target_max_bytes = size;
11520 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
11521 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
11522 get_last_committed() + 1));
11523 return true;
11524 } else if (prefix == "osd pool set-quota") {
11525 string poolstr;
f64942e4 11526 cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
7c673cae
FG
11527 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11528 if (pool_id < 0) {
11529 ss << "unrecognized pool '" << poolstr << "'";
11530 err = -ENOENT;
11531 goto reply;
11532 }
11533
11534 string field;
f64942e4 11535 cmd_getval_throws(g_ceph_context, cmdmap, "field", field);
7c673cae
FG
11536 if (field != "max_objects" && field != "max_bytes") {
11537 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
11538 err = -EINVAL;
11539 goto reply;
11540 }
11541
11542 // val could contain unit designations, so we treat as a string
11543 string val;
f64942e4 11544 cmd_getval_throws(g_ceph_context, cmdmap, "val", val);
1adf2230
AA
11545 string tss;
11546 int64_t value;
11547 if (field == "max_objects") {
11548 value = strict_sistrtoll(val.c_str(), &tss);
11549 } else if (field == "max_bytes") {
11550 value = strict_iecstrtoll(val.c_str(), &tss);
11551 } else {
11552 assert(0 == "unrecognized option");
11553 }
11554 if (!tss.empty()) {
11555 ss << "error parsing value '" << val << "': " << tss;
11556 err = -EINVAL;
7c673cae
FG
11557 goto reply;
11558 }
11559
11560 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
11561 if (field == "max_objects") {
11562 pi->quota_max_objects = value;
11563 } else if (field == "max_bytes") {
11564 pi->quota_max_bytes = value;
11565 } else {
11566 assert(0 == "unrecognized option");
11567 }
11568 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
11569 rs = ss.str();
11570 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11571 get_last_committed() + 1));
11572 return true;
c07f9fc5
FG
11573 } else if (prefix == "osd pool application enable" ||
11574 prefix == "osd pool application disable" ||
11575 prefix == "osd pool application set" ||
11576 prefix == "osd pool application rm") {
11577 err = prepare_command_pool_application(prefix, cmdmap, ss);
11578 if (err == -EAGAIN)
11579 goto wait;
11580 if (err < 0)
11581 goto reply;
7c673cae 11582
c07f9fc5
FG
11583 getline(ss, rs);
11584 wait_for_finished_proposal(
11585 op, new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
11586 return true;
7c673cae
FG
11587 } else if (prefix == "osd reweight-by-pg" ||
11588 prefix == "osd reweight-by-utilization" ||
11589 prefix == "osd test-reweight-by-pg" ||
11590 prefix == "osd test-reweight-by-utilization") {
11591 bool by_pg =
11592 prefix == "osd reweight-by-pg" || prefix == "osd test-reweight-by-pg";
11593 bool dry_run =
11594 prefix == "osd test-reweight-by-pg" ||
11595 prefix == "osd test-reweight-by-utilization";
11596 int64_t oload;
11597 cmd_getval(g_ceph_context, cmdmap, "oload", oload, int64_t(120));
11598 set<int64_t> pools;
11599 vector<string> poolnamevec;
11600 cmd_getval(g_ceph_context, cmdmap, "pools", poolnamevec);
11601 for (unsigned j = 0; j < poolnamevec.size(); j++) {
11602 int64_t pool = osdmap.lookup_pg_pool_name(poolnamevec[j]);
11603 if (pool < 0) {
11604 ss << "pool '" << poolnamevec[j] << "' does not exist";
11605 err = -ENOENT;
11606 goto reply;
11607 }
11608 pools.insert(pool);
11609 }
11610 double max_change = g_conf->mon_reweight_max_change;
11611 cmd_getval(g_ceph_context, cmdmap, "max_change", max_change);
11612 if (max_change <= 0.0) {
11613 ss << "max_change " << max_change << " must be positive";
11614 err = -EINVAL;
11615 goto reply;
11616 }
11617 int64_t max_osds = g_conf->mon_reweight_max_osds;
11618 cmd_getval(g_ceph_context, cmdmap, "max_osds", max_osds);
11619 if (max_osds <= 0) {
11620 ss << "max_osds " << max_osds << " must be positive";
11621 err = -EINVAL;
11622 goto reply;
11623 }
11624 string no_increasing;
11625 cmd_getval(g_ceph_context, cmdmap, "no_increasing", no_increasing);
11626 string out_str;
11627 mempool::osdmap::map<int32_t, uint32_t> new_weights;
31f18b77
FG
11628 err = mon->pgservice->reweight_by_utilization(osdmap,
11629 oload,
11630 max_change,
11631 max_osds,
11632 by_pg,
11633 pools.empty() ? NULL : &pools,
11634 no_increasing == "--no-increasing",
11635 &new_weights,
11636 &ss, &out_str, f.get());
7c673cae
FG
11637 if (err >= 0) {
11638 dout(10) << "reweight::by_utilization: finished with " << out_str << dendl;
11639 }
11640 if (f)
11641 f->flush(rdata);
11642 else
11643 rdata.append(out_str);
11644 if (err < 0) {
11645 ss << "FAILED reweight-by-pg";
11646 } else if (err == 0 || dry_run) {
11647 ss << "no change";
11648 } else {
11649 ss << "SUCCESSFUL reweight-by-pg";
11650 pending_inc.new_weight = std::move(new_weights);
11651 wait_for_finished_proposal(
11652 op,
11653 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
11654 return true;
11655 }
c07f9fc5
FG
11656 } else if (prefix == "osd force-create-pg") {
11657 pg_t pgid;
11658 string pgidstr;
f64942e4 11659 cmd_getval_throws(g_ceph_context, cmdmap, "pgid", pgidstr);
c07f9fc5
FG
11660 if (!pgid.parse(pgidstr.c_str())) {
11661 ss << "invalid pgid '" << pgidstr << "'";
11662 err = -EINVAL;
11663 goto reply;
11664 }
94b18763
FG
11665 if (!osdmap.pg_exists(pgid)) {
11666 ss << "pg " << pgid << " should not exist";
11667 err = -ENOENT;
11668 goto reply;
11669 }
c07f9fc5
FG
11670 bool creating_now;
11671 {
11672 std::lock_guard<std::mutex> l(creating_pgs_lock);
11673 auto emplaced = creating_pgs.pgs.emplace(pgid,
11674 make_pair(osdmap.get_epoch(),
11675 ceph_clock_now()));
11676 creating_now = emplaced.second;
11677 }
11678 if (creating_now) {
11679 ss << "pg " << pgidstr << " now creating, ok";
11680 err = 0;
11681 goto update;
11682 } else {
11683 ss << "pg " << pgid << " already creating";
11684 err = 0;
11685 goto reply;
11686 }
7c673cae
FG
11687 } else {
11688 err = -EINVAL;
11689 }
11690
11691 reply:
11692 getline(ss, rs);
11693 if (err < 0 && rs.length() == 0)
11694 rs = cpp_strerror(err);
11695 mon->reply_command(op, err, rs, rdata, get_last_committed());
11696 return ret;
11697
11698 update:
11699 getline(ss, rs);
11700 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11701 get_last_committed() + 1));
11702 return true;
11703
11704 wait:
11705 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11706 return true;
11707}
11708
28e407b8 11709bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
7c673cae
FG
11710{
11711 op->mark_osdmon_event(__func__);
28e407b8 11712
7c673cae 11713 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
28e407b8
AA
11714 MonSession *session = m->get_session();
11715 if (!session) {
11716 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
11717 return true;
11718 }
11719
11720 switch (m->op) {
11721 case POOL_OP_CREATE_UNMANAGED_SNAP:
11722 case POOL_OP_DELETE_UNMANAGED_SNAP:
11723 {
11724 const std::string* pool_name = nullptr;
11725 const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
11726 if (pg_pool != nullptr) {
11727 pool_name = &osdmap.get_pool_name(m->pool);
11728 }
11729
11730 if (!is_unmanaged_snap_op_permitted(cct, mon->key_server,
11731 session->entity_name, session->caps,
11732 pool_name)) {
11733 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
11734 << "privileges. message: " << *m << std::endl
11735 << "caps: " << session->caps << dendl;
11736 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
11737 return true;
11738 }
11739 }
11740 break;
11741 default:
11742 if (!session->is_capable("osd", MON_CAP_W)) {
11743 dout(0) << "got pool op from entity with insufficient privileges. "
11744 << "message: " << *m << std::endl
11745 << "caps: " << session->caps << dendl;
11746 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
11747 return true;
11748 }
11749 break;
11750 }
11751
11752 return false;
11753}
11754
// Read-only triage of an incoming MPoolOp.  Answers the request here when
// it is invalid, a no-op, or already satisfied by the committed osdmap;
// returns false when the op actually needs a map update (it then proceeds
// to prepare_pool_op()).  Returning true means the op was consumed here.
bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MPoolOp *m = static_cast<MPoolOp*>(op->get_req());

  // caps check may reject and answer the request on its own
  if (enforce_pool_op_caps(op)) {
    return true;
  }

  // drop messages from a foreign cluster
  if (m->fsid != mon->monmap->fsid) {
    dout(0) << __func__ << " drop message on fsid " << m->fsid
            << " != " << mon->monmap->fsid << " for " << *m << dendl;
    _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
    return true;
  }

  if (m->op == POOL_OP_CREATE)
    return preprocess_pool_op_create(op);

  // every remaining op targets an existing pool
  if (!osdmap.get_pg_pool(m->pool)) {
    dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
    _pool_op_reply(op, 0, osdmap.get_epoch());
    return true;
  }

  // check if the snap and snapname exist
  bool snap_exists = false;
  const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
  if (p->snap_exists(m->name.c_str()))
    snap_exists = true;

  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // pool snaps are incompatible with unmanaged-snaps mode and with tiers
    if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (snap_exists) {
      // idempotent: snap already present
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // the two snap modes are mutually exclusive per pool
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_SNAP:
    if (p->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (!snap_exists) {
      // idempotent: snap already gone
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (p->is_removed_snap(m->snapid)) {
      // idempotent: snap already removed
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE:
    // lookup failure means the pool is already gone -> idempotent success
    if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_AUID_CHANGE:
    // always needs an update; handled in prepare_pool_op()
    return false;
  default:
    ceph_abort();
    break;
  }

  return false;
}
11838
11839bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
11840{
11841 op->mark_osdmon_event(__func__);
11842 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
7c673cae
FG
11843 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
11844 if (pool >= 0) {
11845 _pool_op_reply(op, 0, osdmap.get_epoch());
11846 return true;
11847 }
11848
11849 return false;
11850}
11851
// Apply a pool op (snap create/delete, auid change) to the pending
// incremental.  Create/delete of whole pools are delegated to their own
// prepare helpers.  Returns true when a proposal was queued (the reply is
// sent once it commits), false when the request was answered immediately.
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
  dout(10) << "prepare_pool_op " << *m << dendl;
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  // Early-out checks against the *committed* pool state; idempotent
  // requests are answered here without queueing a proposal.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // pool snaps are not allowed on tier pools
    if (pool->is_tier()) {
      ret = -EINVAL;
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;
    } // else, fall through
  case POOL_OP_DELETE_SNAP:
    if (!pool->is_unmanaged_snaps_mode()) {
      bool snap_exists = pool->snap_exists(m->name.c_str());
      // creating an existing snap / deleting a missing snap is a no-op
      if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
        || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
        ret = 0;
      } else {
        break;
      }
    } else {
      // wrong snap mode for named-snap ops
      ret = -EINVAL;
    }
    _pool_op_reply(op, ret, osdmap.get_epoch());
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // we won't allow removal of an unmanaged snapshot from a pool
    // not in unmanaged snaps mode.
    if (!pool->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
      return false;
    }
    /* fall-thru */
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // but we will allow creating an unmanaged snapshot on any pool
    // as long as it is not in 'pool' snaps mode.
    if (pool->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return false;
    }
  }

  // projected pool info: start from any already-pending change so we
  // don't clobber earlier updates in the same pending epoch
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // pool snaps vs unmanaged snaps are mutually exclusive
  // (re-check against the projected state, which may differ from committed)
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // mutate the projected pool; 'changed' tracks whether anything to commit
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
        pp.remove_snap(s);
        changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      // the new snapid is returned to the client in the reply payload
      uint64_t snapid;
      pp.add_unmanaged_snap(snapid);
      ::encode(snapid, reply_data);
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (!pp.is_removed_snap(m->snapid)) {
      // reject snapids that were never allocated
      if (m->snapid > pp.get_snap_seq()) {
        _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
        return false;
      }
      pp.remove_unmanaged_snap(m->snapid);
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    if (pp.auid != m->auid) {
      pp.auid = m->auid;
      changed = true;
    }
    break;

  default:
    ceph_abort();
    break;
  }

  if (changed) {
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
11999
12000bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
12001{
12002 op->mark_osdmon_event(__func__);
12003 int err = prepare_new_pool(op);
12004 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
12005 return true;
12006}
12007
12008int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
12009 ostream *ss)
12010{
12011 const string& poolstr = osdmap.get_pool_name(pool_id);
12012
12013 // If the Pool is in use by CephFS, refuse to delete it
28e407b8 12014 FSMap const &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
7c673cae
FG
12015 if (pending_fsmap.pool_in_use(pool_id)) {
12016 *ss << "pool '" << poolstr << "' is in use by CephFS";
12017 return -EBUSY;
12018 }
12019
12020 if (pool.tier_of >= 0) {
12021 *ss << "pool '" << poolstr << "' is a tier of '"
12022 << osdmap.get_pool_name(pool.tier_of) << "'";
12023 return -EBUSY;
12024 }
12025 if (!pool.tiers.empty()) {
12026 *ss << "pool '" << poolstr << "' has tiers";
12027 for(auto tier : pool.tiers) {
12028 *ss << " " << osdmap.get_pool_name(tier);
12029 }
12030 return -EBUSY;
12031 }
12032
12033 if (!g_conf->mon_allow_pool_delete) {
12034 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
12035 return -EPERM;
12036 }
12037
12038 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
12039 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
12040 return -EPERM;
12041 }
12042
12043 *ss << "pool '" << poolstr << "' removed";
12044 return 0;
12045}
12046
12047/**
12048 * Check if it is safe to add a tier to a base pool
12049 *
12050 * @return
12051 * True if the operation should proceed, false if we should abort here
12052 * (abort doesn't necessarily mean error, could be idempotency)
12053 */
12054bool OSDMonitor::_check_become_tier(
12055 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
12056 const int64_t base_pool_id, const pg_pool_t *base_pool,
12057 int *err,
12058 ostream *ss) const
12059{
12060 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
12061 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
12062
28e407b8 12063 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
7c673cae
FG
12064 if (pending_fsmap.pool_in_use(tier_pool_id)) {
12065 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
12066 *err = -EBUSY;
12067 return false;
12068 }
12069
12070 if (base_pool->tiers.count(tier_pool_id)) {
12071 assert(tier_pool->tier_of == base_pool_id);
12072 *err = 0;
12073 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
12074 << base_pool_name << "'";
12075 return false;
12076 }
12077
12078 if (base_pool->is_tier()) {
12079 *ss << "pool '" << base_pool_name << "' is already a tier of '"
12080 << osdmap.get_pool_name(base_pool->tier_of) << "', "
12081 << "multiple tiers are not yet supported.";
12082 *err = -EINVAL;
12083 return false;
12084 }
12085
12086 if (tier_pool->has_tiers()) {
12087 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
12088 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
12089 it != tier_pool->tiers.end(); ++it)
12090 *ss << "'" << osdmap.get_pool_name(*it) << "',";
12091 *ss << " multiple tiers are not yet supported.";
12092 *err = -EINVAL;
12093 return false;
12094 }
12095
12096 if (tier_pool->is_tier()) {
12097 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
12098 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
12099 *err = -EINVAL;
12100 return false;
12101 }
12102
12103 *err = 0;
12104 return true;
12105}
12106
12107
12108/**
12109 * Check if it is safe to remove a tier from this base pool
12110 *
12111 * @return
12112 * True if the operation should proceed, false if we should abort here
12113 * (abort doesn't necessarily mean error, could be idempotency)
12114 */
12115bool OSDMonitor::_check_remove_tier(
12116 const int64_t base_pool_id, const pg_pool_t *base_pool,
12117 const pg_pool_t *tier_pool,
12118 int *err, ostream *ss) const
12119{
12120 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
12121
12122 // Apply CephFS-specific checks
28e407b8 12123 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
7c673cae 12124 if (pending_fsmap.pool_in_use(base_pool_id)) {
94b18763
FG
12125 if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
12126 // If the underlying pool is erasure coded and does not allow EC
12127 // overwrites, we can't permit the removal of the replicated tier that
12128 // CephFS relies on to access it
12129 *ss << "pool '" << base_pool_name <<
12130 "' does not allow EC overwrites and is in use by CephFS"
12131 " via its tier";
7c673cae
FG
12132 *err = -EBUSY;
12133 return false;
12134 }
12135
12136 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
12137 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
12138 "tier is still in use as a writeback cache. Change the cache "
12139 "mode and flush the cache before removing it";
12140 *err = -EBUSY;
12141 return false;
12142 }
12143 }
12144
12145 *err = 0;
12146 return true;
12147}
12148
12149int OSDMonitor::_prepare_remove_pool(
12150 int64_t pool, ostream *ss, bool no_fake)
12151{
224ce89b 12152 dout(10) << __func__ << " " << pool << dendl;
7c673cae
FG
12153 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12154 int r = _check_remove_pool(pool, *p, ss);
12155 if (r < 0)
12156 return r;
12157
12158 auto new_pool = pending_inc.new_pools.find(pool);
12159 if (new_pool != pending_inc.new_pools.end()) {
12160 // if there is a problem with the pending info, wait and retry
12161 // this op.
12162 const auto& p = new_pool->second;
12163 int r = _check_remove_pool(pool, p, ss);
12164 if (r < 0)
12165 return -EAGAIN;
12166 }
12167
12168 if (pending_inc.old_pools.count(pool)) {
224ce89b 12169 dout(10) << __func__ << " " << pool << " already pending removal"
7c673cae
FG
12170 << dendl;
12171 return 0;
12172 }
12173
12174 if (g_conf->mon_fake_pool_delete && !no_fake) {
12175 string old_name = osdmap.get_pool_name(pool);
12176 string new_name = old_name + "." + stringify(pool) + ".DELETED";
12177 dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
12178 << old_name << " -> " << new_name << dendl;
12179 pending_inc.new_pool_names[pool] = new_name;
12180 return 0;
12181 }
12182
12183 // remove
12184 pending_inc.old_pools.insert(pool);
12185
224ce89b 12186 // remove any pg_temp mappings for this pool
7c673cae
FG
12187 for (auto p = osdmap.pg_temp->begin();
12188 p != osdmap.pg_temp->end();
12189 ++p) {
12190 if (p->first.pool() == (uint64_t)pool) {
224ce89b 12191 dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
7c673cae
FG
12192 << p->first << dendl;
12193 pending_inc.new_pg_temp[p->first].clear();
12194 }
12195 }
224ce89b 12196 // remove any primary_temp mappings for this pool
7c673cae
FG
12197 for (auto p = osdmap.primary_temp->begin();
12198 p != osdmap.primary_temp->end();
12199 ++p) {
12200 if (p->first.pool() == (uint64_t)pool) {
224ce89b 12201 dout(10) << __func__ << " " << pool
7c673cae
FG
12202 << " removing obsolete primary_temp" << p->first << dendl;
12203 pending_inc.new_primary_temp[p->first] = -1;
12204 }
12205 }
224ce89b
WB
12206 // remove any pg_upmap mappings for this pool
12207 for (auto& p : osdmap.pg_upmap) {
12208 if (p.first.pool() == (uint64_t)pool) {
12209 dout(10) << __func__ << " " << pool
12210 << " removing obsolete pg_upmap "
12211 << p.first << dendl;
12212 pending_inc.old_pg_upmap.insert(p.first);
12213 }
12214 }
94b18763
FG
12215 // remove any pending pg_upmap mappings for this pool
12216 {
12217 auto it = pending_inc.new_pg_upmap.begin();
12218 while (it != pending_inc.new_pg_upmap.end()) {
12219 if (it->first.pool() == (uint64_t)pool) {
12220 dout(10) << __func__ << " " << pool
12221 << " removing pending pg_upmap "
12222 << it->first << dendl;
12223 it = pending_inc.new_pg_upmap.erase(it);
12224 } else {
12225 it++;
12226 }
12227 }
12228 }
224ce89b
WB
12229 // remove any pg_upmap_items mappings for this pool
12230 for (auto& p : osdmap.pg_upmap_items) {
12231 if (p.first.pool() == (uint64_t)pool) {
12232 dout(10) << __func__ << " " << pool
12233 << " removing obsolete pg_upmap_items " << p.first
12234 << dendl;
12235 pending_inc.old_pg_upmap_items.insert(p.first);
12236 }
12237 }
94b18763
FG
12238 // remove any pending pg_upmap mappings for this pool
12239 {
12240 auto it = pending_inc.new_pg_upmap_items.begin();
12241 while (it != pending_inc.new_pg_upmap_items.end()) {
12242 if (it->first.pool() == (uint64_t)pool) {
12243 dout(10) << __func__ << " " << pool
12244 << " removing pending pg_upmap_items "
12245 << it->first << dendl;
12246 it = pending_inc.new_pg_upmap_items.erase(it);
12247 } else {
12248 it++;
12249 }
12250 }
12251 }
35e4c445
FG
12252
12253 // remove any choose_args for this pool
12254 CrushWrapper newcrush;
12255 _get_pending_crush(newcrush);
12256 if (newcrush.have_choose_args(pool)) {
12257 dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
12258 newcrush.rm_choose_args(pool);
12259 pending_inc.crush.clear();
12260 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
12261 }
7c673cae
FG
12262 return 0;
12263}
12264
12265int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
12266{
12267 dout(10) << "_prepare_rename_pool " << pool << dendl;
12268 if (pending_inc.old_pools.count(pool)) {
12269 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
12270 return -ENOENT;
12271 }
12272 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
12273 p != pending_inc.new_pool_names.end();
12274 ++p) {
12275 if (p->second == newname && p->first != pool) {
12276 return -EEXIST;
12277 }
12278 }
12279
12280 pending_inc.new_pool_names[pool] = newname;
12281 return 0;
12282}
12283
12284bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
12285{
12286 op->mark_osdmon_event(__func__);
12287 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
12288 ostringstream ss;
12289 int ret = _prepare_remove_pool(m->pool, &ss, false);
12290 if (ret == -EAGAIN) {
12291 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12292 return true;
12293 }
12294 if (ret < 0)
12295 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
12296 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
12297 pending_inc.epoch));
12298 return true;
12299}
12300
12301void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
12302 int ret, epoch_t epoch, bufferlist *blp)
12303{
12304 op->mark_osdmon_event(__func__);
12305 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
12306 dout(20) << "_pool_op_reply " << ret << dendl;
12307 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
12308 ret, epoch, get_last_committed(), blp);
12309 mon->send_reply(op, reply);
12310}