]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/OSDMonitor.cc
update sources to 12.2.7
[ceph.git] / ceph / src / mon / OSDMonitor.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19#include <algorithm>
224ce89b
WB
20#include <boost/algorithm/string.hpp>
21#include <locale>
7c673cae
FG
22#include <sstream>
23
31f18b77
FG
24#include "mon/OSDMonitor.h"
25#include "mon/Monitor.h"
26#include "mon/MDSMonitor.h"
27#include "mon/PGMonitor.h"
28#include "mon/MgrStatMonitor.h"
29#include "mon/AuthMonitor.h"
30#include "mon/ConfigKeyService.h"
7c673cae 31
31f18b77
FG
32#include "mon/MonitorDBStore.h"
33#include "mon/Session.h"
7c673cae
FG
34
35#include "crush/CrushWrapper.h"
36#include "crush/CrushTester.h"
37#include "crush/CrushTreeDumper.h"
38
39#include "messages/MOSDBeacon.h"
40#include "messages/MOSDFailure.h"
41#include "messages/MOSDMarkMeDown.h"
42#include "messages/MOSDFull.h"
43#include "messages/MOSDMap.h"
44#include "messages/MMonGetOSDMap.h"
45#include "messages/MOSDBoot.h"
46#include "messages/MOSDAlive.h"
47#include "messages/MPoolOp.h"
48#include "messages/MPoolOpReply.h"
49#include "messages/MOSDPGCreate.h"
50#include "messages/MOSDPGCreated.h"
51#include "messages/MOSDPGTemp.h"
52#include "messages/MMonCommand.h"
53#include "messages/MRemoveSnaps.h"
54#include "messages/MOSDScrub.h"
55#include "messages/MRoute.h"
56
57#include "common/TextTable.h"
58#include "common/Timer.h"
59#include "common/ceph_argparse.h"
60#include "common/perf_counters.h"
61#include "common/strtol.h"
62
63#include "common/config.h"
64#include "common/errno.h"
65
66#include "erasure-code/ErasureCodePlugin.h"
67#include "compressor/Compressor.h"
68#include "common/Checksummer.h"
69
70#include "include/compat.h"
71#include "include/assert.h"
72#include "include/stringify.h"
73#include "include/util.h"
74#include "common/cmdparse.h"
75#include "include/str_list.h"
76#include "include/str_map.h"
224ce89b 77#include "include/scope_guard.h"
7c673cae 78
28e407b8
AA
79#include "auth/cephx/CephxKeyServer.h"
80#include "osd/OSDCap.h"
81
7c673cae
FG
82#include "json_spirit/json_spirit_reader.h"
83
c07f9fc5
FG
84#include <boost/algorithm/string/predicate.hpp>
85
7c673cae 86#define dout_subsys ceph_subsys_mon
3efd9988
FG
87static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
88static const string OSD_METADATA_PREFIX("osd_metadata");
7c673cae 89
c07f9fc5
FG
90namespace {
91
92const uint32_t MAX_POOL_APPLICATIONS = 4;
93const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
94const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
95
28e407b8
AA
96bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
97 // Note: this doesn't include support for the application tag match
98 if ((grant.spec.allow & OSD_CAP_W) != 0) {
99 auto& match = grant.match;
100 if (match.is_match_all()) {
101 return true;
102 } else if (pool_name != nullptr && match.auid < 0 &&
103 !match.pool_namespace.pool_name.empty() &&
104 match.pool_namespace.pool_name == *pool_name) {
105 return true;
106 }
107 }
108 return false;
109}
110
111bool is_unmanaged_snap_op_permitted(CephContext* cct,
112 const KeyServer& key_server,
113 const EntityName& entity_name,
114 const MonCap& mon_caps,
115 const std::string* pool_name)
116{
117 typedef std::map<std::string, std::string> CommandArgs;
118
119 if (mon_caps.is_capable(cct, CEPH_ENTITY_TYPE_MON,
120 entity_name, "osd",
121 "osd pool op unmanaged-snap",
122 (pool_name == nullptr ?
123 CommandArgs{} /* pool DNE, require unrestricted cap */ :
124 CommandArgs{{"poolname", *pool_name}}),
125 false, true, false)) {
126 return true;
127 }
128
129 AuthCapsInfo caps_info;
130 if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
131 caps_info)) {
132 dout(10) << "unable to locate OSD cap data for " << entity_name
133 << " in auth db" << dendl;
134 return false;
135 }
136
137 string caps_str;
138 if (caps_info.caps.length() > 0) {
139 auto p = caps_info.caps.begin();
140 try {
141 decode(caps_str, p);
142 } catch (const buffer::error &err) {
143 derr << "corrupt OSD cap data for " << entity_name << " in auth db"
144 << dendl;
145 return false;
146 }
147 }
148
149 OSDCap osd_cap;
150 if (!osd_cap.parse(caps_str, nullptr)) {
151 dout(10) << "unable to parse OSD cap data for " << entity_name
152 << " in auth db" << dendl;
153 return false;
154 }
155
156 // if the entity has write permissions in one or all pools, permit
157 // usage of unmanaged-snapshots
158 if (osd_cap.allow_all()) {
159 return true;
160 }
161
162 for (auto& grant : osd_cap.grants) {
163 if (grant.profile.is_valid()) {
164 for (auto& profile_grant : grant.profile_grants) {
165 if (is_osd_writable(profile_grant, pool_name)) {
166 return true;
167 }
168 }
169 } else if (is_osd_writable(grant, pool_name)) {
170 return true;
171 }
172 }
173
174 return false;
175}
176
c07f9fc5
FG
177} // anonymous namespace
178
7c673cae
FG
179void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean)
180{
181 if (epoch_by_pg.size() <= ps) {
182 epoch_by_pg.resize(ps + 1, 0);
183 }
184 const auto old_lec = epoch_by_pg[ps];
185 if (old_lec >= last_epoch_clean) {
186 // stale lec
187 return;
188 }
189 epoch_by_pg[ps] = last_epoch_clean;
190 if (last_epoch_clean < floor) {
191 floor = last_epoch_clean;
192 } else if (last_epoch_clean > floor) {
193 if (old_lec == floor) {
194 // probably should increase floor?
195 auto new_floor = std::min_element(std::begin(epoch_by_pg),
196 std::end(epoch_by_pg));
197 floor = *new_floor;
198 }
199 }
200 if (ps != next_missing) {
201 return;
202 }
203 for (; next_missing < epoch_by_pg.size(); next_missing++) {
204 if (epoch_by_pg[next_missing] == 0) {
205 break;
206 }
207 }
208}
209
210void LastEpochClean::remove_pool(uint64_t pool)
211{
212 report_by_pool.erase(pool);
213}
214
215void LastEpochClean::report(const pg_t& pg, epoch_t last_epoch_clean)
216{
217 auto& lec = report_by_pool[pg.pool()];
218 return lec.report(pg.ps(), last_epoch_clean);
219}
220
221epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
222{
223 auto floor = latest.get_epoch();
224 for (auto& pool : latest.get_pools()) {
225 auto reported = report_by_pool.find(pool.first);
226 if (reported == report_by_pool.end()) {
227 return 0;
228 }
229 if (reported->second.next_missing < pool.second.get_pg_num()) {
230 return 0;
231 }
232 if (reported->second.floor < floor) {
233 floor = reported->second.floor;
234 }
235 }
236 return floor;
237}
238
239
240struct C_UpdateCreatingPGs : public Context {
241 OSDMonitor *osdmon;
242 utime_t start;
243 epoch_t epoch;
244 C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
245 osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
246 void finish(int r) override {
247 if (r >= 0) {
248 utime_t end = ceph_clock_now();
249 dout(10) << "osdmap epoch " << epoch << " mapping took "
250 << (end - start) << " seconds" << dendl;
251 osdmon->update_creating_pgs();
252 osdmon->check_pg_creates_subs();
253 }
254 }
255};
256
257#undef dout_prefix
258#define dout_prefix _prefix(_dout, mon, osdmap)
259static ostream& _prefix(std::ostream *_dout, Monitor *mon, const OSDMap& osdmap) {
260 return *_dout << "mon." << mon->name << "@" << mon->rank
261 << "(" << mon->get_state_name()
262 << ").osd e" << osdmap.get_epoch() << " ";
263}
264
265OSDMonitor::OSDMonitor(
266 CephContext *cct,
267 Monitor *mn,
268 Paxos *p,
269 const string& service_name)
270 : PaxosService(mn, p, service_name),
271 cct(cct),
272 inc_osd_cache(g_conf->mon_osd_cache_size),
273 full_osd_cache(g_conf->mon_osd_cache_size),
7c673cae
FG
274 mapper(mn->cct, &mn->cpu_tp),
275 op_tracker(cct, true, 1)
276{}
277
278bool OSDMonitor::_have_pending_crush()
279{
280 return pending_inc.crush.length() > 0;
281}
282
283CrushWrapper &OSDMonitor::_get_stable_crush()
284{
285 return *osdmap.crush;
286}
287
288void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush)
289{
290 bufferlist bl;
291 if (pending_inc.crush.length())
292 bl = pending_inc.crush;
293 else
294 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
295
296 bufferlist::iterator p = bl.begin();
297 newcrush.decode(p);
298}
299
// Build and stage the very first osdmap (epoch 1) for a new cluster:
// either decode the osdmap left by mkfs or build a trivial empty map,
// apply defaults (sortbitwise, required release, full ratios), then
// encode it into the pending incremental as a full map.
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon->monmap->fsid << dendl;

  OSDMap newmap;

  // mkfs may have left a seed osdmap in the store
  bufferlist bl;
  mon->store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    newmap.decode(bl);
    newmap.set_fsid(mon->monmap->fsid);
  } else {
    newmap.build_simple(g_ceph_context, 0, mon->monmap->fsid, 0);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  // new cluster should require latest by default
  if (g_conf->mon_debug_no_require_luminous) {
    // debug/test escape hatch: pretend to be a pre-luminous cluster
    newmap.require_osd_release = CEPH_RELEASE_KRAKEN;
    derr << __func__ << " mon_debug_no_require_luminous=true" << dendl;
  } else {
    newmap.require_osd_release = CEPH_RELEASE_LUMINOUS;
    newmap.flags |=
      CEPH_OSDMAP_RECOVERY_DELETES |
      CEPH_OSDMAP_PURGED_SNAPDIRS;
    // ratios may be configured as percentages (>1.0); normalize to [0,1]
    newmap.full_ratio = g_conf->mon_osd_full_ratio;
    if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
    newmap.backfillfull_ratio = g_conf->mon_osd_backfillfull_ratio;
    if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
    newmap.nearfull_ratio = g_conf->mon_osd_nearfull_ratio;
    if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;
    int r = ceph_release_from_name(
      g_conf->mon_osd_initial_require_min_compat_client.c_str());
    if (r <= 0) {
      assert(0 == "mon_osd_initial_require_min_compat_client is not valid");
    }
    newmap.require_min_compat_client = r;
  }

  // encode into pending incremental
  uint64_t features = newmap.get_encoding_features();
  newmap.encode(pending_inc.fullmap,
                features | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
351
352void OSDMonitor::get_store_prefixes(std::set<string>& s)
353{
354 s.insert(service_name);
355 s.insert(OSD_PG_CREATING_PREFIX);
3efd9988 356 s.insert(OSD_METADATA_PREFIX);
7c673cae
FG
357}
358
// Catch our in-memory osdmap up to the latest paxos-committed version:
// locate/repair the latest stored full map, load it, then replay every
// remaining incremental, persisting full maps and bookkeeping as we go.
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
	   << ", my e " << osdmap.epoch << dendl;

  // a stale mapping job computed against the old map is useless; cancel it
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
	      << mapping_job.get() << " did not complete, "
	      << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  load_health();

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'. This is only done when we are building the
   * full maps from the incremental versions. But don't panic! We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
	     << " [" << fc << ", " << lc << "]" << dendl;

    // scan newest-to-oldest for the first stored full map
    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon->store->exists(get_service_name(), full_key)) {
	dout(10) << __func__ << " found latest full map v " << v << dendl;
	latest_full = v;
	break;
      }
    }

    assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon->store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
	     << latest_full << dendl;
  }

  // fast-forward our in-memory map to the newest stored full map
  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap.decode(latest_bl);
  }

  // luminous clusters persist creating_pgs in the mon store; load it
  if (mon->monmap->get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    bufferlist bl;
    if (!mon->store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
      auto p = bl.begin();
      std::lock_guard<std::mutex> l(creating_pgs_lock);
      creating_pgs.decode(p);
      dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
	      << creating_pgs.last_scan_epoch
	      << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
    } else {
      dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
	      << dendl;
    }
  }

  // make sure we're using the right pg service.. remove me post-luminous!
  if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
    dout(10) << __func__ << " pgservice is mgrstat" << dendl;
    mon->pgservice = mon->mgrstatmon()->get_pg_stat_service();
  } else {
    dout(10) << __func__ << " pgservice is pg" << dendl;
    mon->pgservice = mon->pgmon()->get_pg_stat_service();
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    assert(err == 0);
    assert(inc_bl.length());

    dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
	    << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs. Encode the full
    // map with the same features as the incremental. If we don't
    // know, use the quorum features. If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon->get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
	// This will happen if the mons were running mixed versions in
	// the past or some other circumstance made the full encoded
	// maps divergent. Reloading here will bring us back into
	// sync with the primary for this and all future maps. OSDs
	// will also be brought back into sync when they discover the
	// crc mismatch and request a full map from a mon.
	derr << __func__ << " full map CRC mismatch, resetting to canonical"
	     << dendl;
	osdmap = OSDMap();
	osdmap.decode(orig_full_bl);
      }
    } else {
      assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    // the mkfs seed map is no longer needed once epoch 1 is applied
    if (osdmap.epoch == 1) {
      t->erase("mkfs", "osdmap");
    }

    // make sure we're using the right pg service.. remove me post-luminous!
    if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
      dout(10) << __func__ << " pgservice is mgrstat" << dendl;
      mon->pgservice = mon->mgrstatmon()->get_pg_stat_service();
    } else {
      dout(10) << __func__ << " pgservice is pg" << dendl;
      mon->pgservice = mon->pgmon()->get_pg_stat_service();
    }

    // flush the transaction periodically so it doesn't grow unbounded
    if (tx_size > g_conf->mon_sync_max_payload_size*2) {
      mon->store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    if (mon->monmap->get_required_features().contains_all(
	  ceph::features::mon::FEATURE_LUMINOUS)) {
      for (const auto &osd_state : inc.new_state) {
	if (osd_state.second & CEPH_OSD_UP) {
	  // could be marked up *or* down, but we're too lazy to check which
	  last_osd_report.erase(osd_state.first);
	}
	if (osd_state.second & CEPH_OSD_EXISTS) {
	  // could be created *or* destroyed, but we can safely drop it
	  osd_epochs.erase(osd_state.first);
	}
      }
    }
  }

  if (t) {
    mon->store->apply_transaction(t);
  }

  // reconcile the down->out countdown map with the new osd states
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
	dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
	down_pending_out[o] = ceph_clock_now();
      }
    } else {
      if (found != down_pending_out.end()) {
	dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
	down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  if (mon->is_leader()) {
    // kick pgmon, make sure it's seen the latest map
    mon->pgmon()->check_osd_map(osdmap.epoch);
  }

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();

  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon->is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
}
592
593void OSDMonitor::start_mapping()
594{
595 // initiate mapping job
596 if (mapping_job) {
597 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
598 << dendl;
599 mapping_job->abort();
600 }
224ce89b
WB
601 if (!osdmap.get_pools().empty()) {
602 auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
603 mapping_job = mapping.start_update(osdmap, mapper,
604 g_conf->mon_osd_mapping_pgs_per_chunk);
605 dout(10) << __func__ << " started mapping job " << mapping_job.get()
606 << " at " << fin->start << dendl;
607 mapping_job->set_finish_event(fin);
608 } else {
609 dout(10) << __func__ << " no pools, no mapping job" << dendl;
610 mapping_job = nullptr;
611 }
7c673cae
FG
612}
613
614void OSDMonitor::update_msgr_features()
615{
616 set<int> types;
617 types.insert((int)entity_name_t::TYPE_OSD);
618 types.insert((int)entity_name_t::TYPE_CLIENT);
619 types.insert((int)entity_name_t::TYPE_MDS);
620 types.insert((int)entity_name_t::TYPE_MON);
621 for (set<int>::iterator q = types.begin(); q != types.end(); ++q) {
622 uint64_t mask;
623 uint64_t features = osdmap.get_features(*q, &mask);
624 if ((mon->messenger->get_policy(*q).features_required & mask) != features) {
625 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
626 Messenger::Policy p = mon->messenger->get_policy(*q);
627 p.features_required = (p.features_required & ~mask) | features;
628 mon->messenger->set_policy(*q, p);
629 }
630 }
631}
632
633void OSDMonitor::on_active()
634{
635 update_logger();
636
637 if (mon->is_leader()) {
224ce89b 638 mon->clog->debug() << "osdmap " << osdmap;
7c673cae
FG
639 } else {
640 list<MonOpRequestRef> ls;
641 take_all_failures(ls);
642 while (!ls.empty()) {
643 MonOpRequestRef op = ls.front();
644 op->mark_osdmon_event(__func__);
645 dispatch(op);
646 ls.pop_front();
647 }
648 }
649 start_mapping();
650}
651
652void OSDMonitor::on_restart()
653{
654 last_osd_report.clear();
655}
656
657void OSDMonitor::on_shutdown()
658{
659 dout(10) << __func__ << dendl;
660 if (mapping_job) {
661 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
662 << dendl;
663 mapping_job->abort();
664 }
665
666 // discard failure info, waiters
667 list<MonOpRequestRef> ls;
668 take_all_failures(ls);
669 ls.clear();
670}
671
672void OSDMonitor::update_logger()
673{
674 dout(10) << "update_logger" << dendl;
675
676 mon->cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
677 mon->cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
678 mon->cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
679 mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
680}
681
7c673cae
FG
// Start a fresh pending incremental for the next epoch: clean stale
// pg_temp/primary_temp entries, migrate or default the full/nearfull/
// backfillfull ratios, and rewrite legacy CRUSH "ruleset" IDs.
void OSDMonitor::create_pending()
{
  pending_inc = OSDMap::Incremental(osdmap.epoch+1);
  pending_inc.fsid = mon->monmap->fsid;

  dout(10) << "create_pending e " << pending_inc.epoch << dendl;

  // clean up pg_temp, primary_temp
  OSDMap::clean_temps(g_ceph_context, osdmap, &pending_inc);
  dout(10) << "create_pending did clean_temps" << dendl;

  // On upgrade OSDMap has new field set by mon_osd_backfillfull_ratio config
  // instead of osd_backfill_full_ratio config
  if (osdmap.backfillfull_ratio <= 0) {
    pending_inc.new_backfillfull_ratio = g_conf->mon_osd_backfillfull_ratio;
    // ratio may be configured as a percentage; normalize to [0,1]
    if (pending_inc.new_backfillfull_ratio > 1.0)
      pending_inc.new_backfillfull_ratio /= 100;
    dout(1) << __func__ << " setting backfillfull_ratio = "
	    << pending_inc.new_backfillfull_ratio << dendl;
  }
  if (osdmap.get_epoch() > 0 &&
      osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
    // transition full ratios from PGMap to OSDMap (on upgrade)
    float full_ratio = mon->pgservice->get_full_ratio();
    float nearfull_ratio = mon->pgservice->get_nearfull_ratio();
    if (osdmap.full_ratio != full_ratio) {
      dout(10) << __func__ << " full_ratio " << osdmap.full_ratio
	       << " -> " << full_ratio << " (from pgmap)" << dendl;
      pending_inc.new_full_ratio = full_ratio;
    }
    if (osdmap.nearfull_ratio != nearfull_ratio) {
      dout(10) << __func__ << " nearfull_ratio " << osdmap.nearfull_ratio
	       << " -> " << nearfull_ratio << " (from pgmap)" << dendl;
      pending_inc.new_nearfull_ratio = nearfull_ratio;
    }
  } else {
    // safety check (this shouldn't really happen)
    if (osdmap.full_ratio <= 0) {
      pending_inc.new_full_ratio = g_conf->mon_osd_full_ratio;
      if (pending_inc.new_full_ratio > 1.0)
        pending_inc.new_full_ratio /= 100;
      dout(1) << __func__ << " setting full_ratio = "
	      << pending_inc.new_full_ratio << dendl;
    }
    if (osdmap.nearfull_ratio <= 0) {
      pending_inc.new_nearfull_ratio = g_conf->mon_osd_nearfull_ratio;
      if (pending_inc.new_nearfull_ratio > 1.0)
        pending_inc.new_nearfull_ratio /= 100;
      dout(1) << __func__ << " setting nearfull_ratio = "
	      << pending_inc.new_nearfull_ratio << dendl;
    }
  }

  // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
  // structure.
  if (osdmap.crush->has_legacy_rule_ids()) {
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    // First, for all pools, work out which rule they really used
    // by resolving ruleset to rule.
    for (const auto &i : osdmap.get_pools()) {
      const auto pool_id = i.first;
      const auto &pool = i.second;
      int new_rule_id = newcrush.find_rule(pool.crush_rule,
					   pool.type, pool.size);

      dout(1) << __func__ << " rewriting pool "
	      << osdmap.get_pool_name(pool_id) << " crush ruleset "
	      << pool.crush_rule << " -> rule id " << new_rule_id << dendl;
      if (pending_inc.new_pools.count(pool_id) == 0) {
	pending_inc.new_pools[pool_id] = pool;
      }
      pending_inc.new_pools[pool_id].crush_rule = new_rule_id;
    }

    // Now, go ahead and renumber all the rules so that their
    // rule_id field corresponds to their position in the array
    auto old_to_new = newcrush.renumber_rules();
    dout(1) << __func__ << " Rewrote " << old_to_new << " crush IDs:" << dendl;
    for (const auto &i : old_to_new) {
      dout(1) << __func__ << " " << i.first << " -> " << i.second << dendl;
    }
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
  }
}
769
// Compute the next persisted creating_pgs state for the map increment
// 'inc' (applied on top as 'nextmap'): merge in pgs of new pools, drop
// deleted pools and already-created pgs, and admit queued pgs up to
// mon_osd_max_creating_pgs.  Returns the new state by value; the
// caller is responsible for committing it.
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
			       const OSDMap& nextmap)
{
  dout(10) << __func__ << dendl;
  creating_pgs_t pending_creatings;
  {
    // snapshot the shared state under the lock, then work on the copy
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    // pre-luminous: the pgmap may still know of creating pgs we don't
    if (osdmap.get_epoch() &&
	osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
      auto added =
	mon->pgservice->maybe_add_creating_pgs(creating_pgs.last_scan_epoch,
					       osdmap.get_pools(),
					       &pending_creatings);
      dout(7) << __func__ << " " << added << " pgs added from pgmap" << dendl;
    }
    unsigned queued = 0;
    queued += scan_for_creating_pgs(osdmap.get_pools(),
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    queued += scan_for_creating_pgs(inc.new_pools,
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
	       << " pg removed because containing pool deleted: "
	       << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
	     << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // filter out any pgs that shouldn't exist.
  {
    auto i = pending_creatings.pgs.begin();
    while (i != pending_creatings.pgs.end()) {
      if (!nextmap.pg_exists(i->first)) {
	dout(10) << __func__ << " removing pg " << i->first
		 << " which should not exist" << dendl;
	i = pending_creatings.pgs.erase(i);
      } else {
	++i;
      }
    }
  }

  // process queue: admit queued pg ranges until we hit the in-flight cap
  unsigned max = MAX(1, g_conf->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  while (pending_creatings.pgs.size() < max &&
	 !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
	     << " created " << p->second.created
	     << " modified " << p->second.modified
	     << " [" << p->second.start << "-" << p->second.end << ")"
	     << dendl;
    int n = MIN(max - pending_creatings.pgs.size(),
		p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(pgid, make_pair(inc.epoch,
						    p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
	       << " now [" << p->second.start << "-" << p->second.end << ")"
	       << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
	   << " pools" << dendl;
  dout(10) << __func__
	   << " " << (pending_creatings.pgs.size() - total)
	   << "/" << pending_creatings.pgs.size()
	   << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
879
// Speculatively populate pg_temp entries for pgs whose mapping will
// change under the pending increment, so acting sets stay stable across
// the map change.  Broad changes (new crush map, new up osds, weight
// increases, or a large estimated pg count) prime every pg via a
// parallel job; otherwise only pgs touching the "interesting" osds are
// primed, under a wall-clock budget.
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  if (pending_inc.crush.length()) {
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    // state bit toggles: an osd currently up whose UP bit flips is
    // going down, which affects its pgs
    if ((p->second & CEPH_OSD_UP) &&
	osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  for (map<int32_t,uint32_t>::iterator p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      // a weight increase can pull pgs toward this osd from anywhere
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
	       << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;

  if (!all) {
    // rough cost estimate: pgs on one osd times number of osds; if that
    // exceeds the configured fraction of all pgs, just prime everything
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
	g_conf->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds >= "
	       << g_conf->mon_osd_prime_pg_temp_max_estimate << " of total "
	       << mapping.get_num_pgs() << " pgs, all"
	       << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds" << dendl;
    }
  }

  // the map we are about to commit
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    // prime everything via a parallel job, bounded by a time budget
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf->mon_osd_mapping_pgs_per_chunk);
    if (job.wait_for(g_conf->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
	       << g_conf->mon_osd_prime_pg_temp_max_time
	       << ", stopping" << dendl;
      job.abort();
    }
  } else {
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf->mon_osd_prime_pg_temp_max_time;
    const int chunk = 1000;  // check the clock only every 'chunk' pgs
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
	if (!did_pgs.insert(pgid).second) {
	  continue;  // already primed via another osd
	}
	prime_pg_temp(next, pgid);
	if (--n <= 0) {
	  n = chunk;
	  if (ceph_clock_now() > stop) {
	    dout(10) << __func__ << " consumed more than "
		     << g_conf->mon_osd_prime_pg_temp_max_time
		     << " seconds, stopping"
		     << dendl;
	    return;
	  }
	}
      }
    }
  }
}
982
// Consider pinning the *current* acting set of one pg as an explicit
// pg_temp entry in pending_inc, so that when the pending map commits the
// pg keeps serving I/O from its old OSDs while it peers/backfills onto the
// new mapping.  Called from the priming machinery with `next` = the map
// that pending_inc will produce.
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  // pgs that are still being created have no useful prior acting set.
  if (mon->monmap->get_required_features().contains_all(
        ceph::features::mon::FEATURE_LUMINOUS)) {
    // TODO: remove this creating_pgs direct access?
    if (creating_pgs.pgs.count(pgid)) {
      return;
    }
  } else {
    // pre-luminous: the (older) pg service tracks creations instead.
    if (mon->pgservice->is_creating_pg(pgid)) {
      return;
    }
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  // current mapping, from the precomputed OSDMapMapping cache.
  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  // mapping under the pending (next) map.
  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
                            &next_acting, &next_acting_primary);
  if (acting == next_acting && next_up != next_acting)
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  // if the new up set is already the new acting set, an empty pg_temp
  // entry is recorded, which clears any existing pg_temp for this pg.
  if (next_up == next_acting) {
    acting.clear();
    dout(20) << __func__ << "next_up === next_acting now, clear pg_temp"
             << dendl;
  }

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
           << " -> " << next_up << "/" << next_acting
           << ", priming " << acting
           << dendl;
  {
    // prime_pg_temp may run from parallel mapper threads; serialize
    // access to pending_inc.  emplace() deliberately does NOT overwrite
    // an existing entry:
    // do not touch a mapping if a change is pending
    Mutex::Locker l(prime_pg_temp_lock);
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
1036
/**
 * Finalize pending_inc and serialize it (plus the resulting full map,
 * pg-creation state and health checks) into the given transaction.
 *
 * @note receiving a transaction in this function gives a fair amount of
 * freedom to the service implementation if it does need it. It shouldn't.
 */
void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
{
  dout(10) << "encode_pending e " << pending_inc.epoch
           << dendl;

  // finalize up pending_inc
  pending_inc.modified = ceph_clock_now();

  int r = pending_inc.propagate_snaps_to_tiers(g_ceph_context, osdmap);
  assert(r == 0);

  // Use the background mapping job (if it completed for the current epoch)
  // to prime pg_temp entries before we freeze the incremental.
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
              << mapping_job.get() << " did not complete, "
              << mapping_job->shards << " left" << dendl;
      mapping_job->abort();
    } else if (mapping.get_epoch() < osdmap.get_epoch()) {
      dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
              << mapping_job.get() << " is prior epoch "
              << mapping.get_epoch() << dendl;
    } else {
      if (g_conf->mon_osd_prime_pg_temp) {
        maybe_prime_pg_temp();
      }
    }
  } else if (g_conf->mon_osd_prime_pg_temp) {
    dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
            << dendl;
  }
  mapping_job.reset();

  // ensure we don't have blank new_state updates. these are interpreted as
  // CEPH_OSD_UP (and almost certainly not what we want!).
  auto p = pending_inc.new_state.begin();
  while (p != pending_inc.new_state.end()) {
    if (p->second == 0) {
      dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
      p = pending_inc.new_state.erase(p);
    } else {
      ++p;
    }
  }

  bufferlist bl;

  {
    // Build a throwaway post-incremental map so we can reason about the
    // cluster state the pending epoch will produce.
    OSDMap tmp;
    tmp.deepish_copy_from(osdmap);
    tmp.apply_incremental(pending_inc);

    if (tmp.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
      // remove any legacy osdmap nearfull/full flags
      {
        if (tmp.test_flag(CEPH_OSDMAP_FULL | CEPH_OSDMAP_NEARFULL)) {
          dout(10) << __func__ << " clearing legacy osdmap nearfull/full flag"
                   << dendl;
          remove_flag(CEPH_OSDMAP_NEARFULL);
          remove_flag(CEPH_OSDMAP_FULL);
        }
      }
      // collect which pools are currently affected by
      // the near/backfill/full osd(s),
      // and set per-pool near/backfill/full flag instead
      set<int64_t> full_pool_ids;
      set<int64_t> backfillfull_pool_ids;
      set<int64_t> nearfull_pool_ids;
      tmp.get_full_pools(g_ceph_context,
                         &full_pool_ids,
                         &backfillfull_pool_ids,
                         &nearfull_pool_ids);
      // NOTE: entered when *any* of the three sets is empty; each branch
      // below re-checks the specific set before clearing its flag.
      if (full_pool_ids.empty() ||
          backfillfull_pool_ids.empty() ||
          nearfull_pool_ids.empty()) {
        // normal case - no nearfull, backfillfull or full osds
        // try cancel any improper nearfull/backfillfull/full pool
        // flags first
        for (auto &pool: tmp.get_pools()) {
          auto p = pool.first;
          if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
              nearfull_pool_ids.empty()) {
            dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
                     << "'s nearfull flag" << dendl;
            if (pending_inc.new_pools.count(p) == 0) {
              // load original pool info first!
              pending_inc.new_pools[p] = pool.second;
            }
            pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
          }
          if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
              backfillfull_pool_ids.empty()) {
            dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
                     << "'s backfillfull flag" << dendl;
            if (pending_inc.new_pools.count(p) == 0) {
              pending_inc.new_pools[p] = pool.second;
            }
            pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
          }
          if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
              full_pool_ids.empty()) {
            if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
              // set by EQUOTA, skipping
              continue;
            }
            dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
                     << "'s full flag" << dendl;
            if (pending_inc.new_pools.count(p) == 0) {
              pending_inc.new_pools[p] = pool.second;
            }
            pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
          }
        }
      }
      // Mark pools hosted on full osds; FULL supersedes the lesser flags.
      if (!full_pool_ids.empty()) {
        dout(10) << __func__ << " marking pool(s) " << full_pool_ids
                 << " as full" << dendl;
        for (auto &p: full_pool_ids) {
          if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
            continue;
          }
          if (pending_inc.new_pools.count(p) == 0) {
            pending_inc.new_pools[p] = tmp.pools[p];
          }
          pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
          pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
          pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
        }
        // cancel FLAG_FULL for pools which are no longer full too
        for (auto &pool: tmp.get_pools()) {
          auto p = pool.first;
          if (full_pool_ids.count(p)) {
            // skip pools we have just marked as full above
            continue;
          }
          if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
              tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
            // don't touch if currently is not full
            // or is running out of quota (and hence considered as full)
            continue;
          }
          dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
                   << "'s full flag" << dendl;
          if (pending_inc.new_pools.count(p) == 0) {
            pending_inc.new_pools[p] = pool.second;
          }
          pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
        }
      }
      // Same pattern for backfillfull: set where needed, clear stale flags.
      if (!backfillfull_pool_ids.empty()) {
        for (auto &p: backfillfull_pool_ids) {
          if (full_pool_ids.count(p)) {
            // skip pools we have already considered as full above
            continue;
          }
          if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
            // make sure FLAG_FULL is truly set, so we are safe not
            // to set an extra (redundant) FLAG_BACKFILLFULL flag
            assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
            continue;
          }
          if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
            // don't bother if pool is already marked as backfillfull
            continue;
          }
          dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
                   << "'s as backfillfull" << dendl;
          if (pending_inc.new_pools.count(p) == 0) {
            pending_inc.new_pools[p] = tmp.pools[p];
          }
          pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
          pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
        }
        // cancel FLAG_BACKFILLFULL for pools
        // which are no longer backfillfull too
        for (auto &pool: tmp.get_pools()) {
          auto p = pool.first;
          if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
            // skip pools we have just marked as backfillfull/full above
            continue;
          }
          if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
            // and don't touch if currently is not backfillfull
            continue;
          }
          dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
                   << "'s backfillfull flag" << dendl;
          if (pending_inc.new_pools.count(p) == 0) {
            pending_inc.new_pools[p] = pool.second;
          }
          pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
        }
      }
      // And again for nearfull, the weakest of the three flags.
      if (!nearfull_pool_ids.empty()) {
        for (auto &p: nearfull_pool_ids) {
          if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
            continue;
          }
          if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
            // make sure FLAG_FULL is truly set, so we are safe not
            // to set an extra (redundant) FLAG_NEARFULL flag
            assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
            continue;
          }
          if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
            // don't bother if pool is already marked as nearfull
            continue;
          }
          dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
                   << "'s as nearfull" << dendl;
          if (pending_inc.new_pools.count(p) == 0) {
            pending_inc.new_pools[p] = tmp.pools[p];
          }
          pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
        }
        // cancel FLAG_NEARFULL for pools
        // which are no longer nearfull too
        for (auto &pool: tmp.get_pools()) {
          auto p = pool.first;
          if (full_pool_ids.count(p) ||
              backfillfull_pool_ids.count(p) ||
              nearfull_pool_ids.count(p)) {
            // skip pools we have just marked as
            // nearfull/backfillfull/full above
            continue;
          }
          if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
            // and don't touch if currently is not nearfull
            continue;
          }
          dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
                   << "'s nearfull flag" << dendl;
          if (pending_inc.new_pools.count(p) == 0) {
            pending_inc.new_pools[p] = pool.second;
          }
          pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
        }
      }

      // min_compat_client?
      // 0 means "never set"; lock it to what today's map already requires.
      if (tmp.require_min_compat_client == 0) {
        auto mv = tmp.get_min_compat_client();
        dout(1) << __func__ << " setting require_min_compat_client to currently "
                << "required " << ceph_release_name(mv) << dendl;
        mon->clog->info() << "setting require_min_compat_client to currently "
                          << "required " << ceph_release_name(mv);
        pending_inc.new_require_min_compat_client = mv;
      }

      // One-shot upgrade conversions: only while the committed map is still
      // pre-luminous but the pending map is luminous.
      if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
        // convert ec profile ruleset-* -> crush-*
        for (auto& p : tmp.erasure_code_profiles) {
          bool changed = false;
          map<string,string> newprofile;
          for (auto& q : p.second) {
            if (q.first.find("ruleset-") == 0) {
              string key = "crush-";
              key += q.first.substr(8);
              newprofile[key] = q.second;
              changed = true;
              dout(20) << " updating ec profile " << p.first
                       << " key " << q.first << " -> " << key << dendl;
            } else {
              newprofile[q.first] = q.second;
            }
          }
          if (changed) {
            dout(10) << " updated ec profile " << p.first << ": "
                     << newprofile << dendl;
            pending_inc.new_erasure_code_profiles[p.first] = newprofile;
          }
        }

        // auto-enable pool applications upon upgrade
        // NOTE: this can be removed post-Luminous assuming upgrades need to
        // proceed through Luminous
        for (auto &pool_pair : tmp.pools) {
          int64_t pool_id = pool_pair.first;
          pg_pool_t pg_pool = pool_pair.second;
          if (pg_pool.is_tier()) {
            continue;
          }

          std::string pool_name = tmp.get_pool_name(pool_id);
          uint32_t match_count = 0;

          // CephFS
          const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
          if (pending_fsmap.pool_in_use(pool_id)) {
            dout(10) << __func__ << " auto-enabling CephFS on pool '"
                     << pool_name << "'" << dendl;
            pg_pool.application_metadata.insert(
              {pg_pool_t::APPLICATION_NAME_CEPHFS, {}});
            ++match_count;
          }

          // RBD heuristics (default OpenStack pool names from docs and
          // ceph-ansible)
          if (boost::algorithm::contains(pool_name, "rbd") ||
              pool_name == "images" || pool_name == "volumes" ||
              pool_name == "backups" || pool_name == "vms") {
            dout(10) << __func__ << " auto-enabling RBD on pool '"
                     << pool_name << "'" << dendl;
            pg_pool.application_metadata.insert(
              {pg_pool_t::APPLICATION_NAME_RBD, {}});
            ++match_count;
          }

          // RGW heuristics
          if (boost::algorithm::contains(pool_name, ".rgw") ||
              boost::algorithm::contains(pool_name, ".log") ||
              boost::algorithm::contains(pool_name, ".intent-log") ||
              boost::algorithm::contains(pool_name, ".usage") ||
              boost::algorithm::contains(pool_name, ".users")) {
            dout(10) << __func__ << " auto-enabling RGW on pool '"
                     << pool_name << "'" << dendl;
            pg_pool.application_metadata.insert(
              {pg_pool_t::APPLICATION_NAME_RGW, {}});
            ++match_count;
          }

          // OpenStack gnocchi (from ceph-ansible)
          if (pool_name == "metrics" && match_count == 0) {
            dout(10) << __func__ << " auto-enabling OpenStack Gnocchi on pool '"
                     << pool_name << "'" << dendl;
            pg_pool.application_metadata.insert({"openstack_gnocchi", {}});
            ++match_count;
          }

          // apply only unambiguous (exactly one) matches; warn on
          // ambiguous non-empty pools, which need a manual decision.
          if (match_count == 1) {
            pg_pool.last_change = pending_inc.epoch;
            pending_inc.new_pools[pool_id] = pg_pool;
          } else if (match_count > 1) {
            auto pstat = mon->pgservice->get_pool_stat(pool_id);
            if (pstat != nullptr && pstat->stats.sum.num_objects > 0) {
              mon->clog->info() << "unable to auto-enable application for pool "
                                << "'" << pool_name << "'";
            }
          }
        }
      }
    }
  }

  // tell me about it
  for (auto i = pending_inc.new_state.begin();
       i != pending_inc.new_state.end();
       ++i) {
    // new_state bits are XORed into the map; 0 was filtered out above and
    // is treated as CEPH_OSD_UP here for logging purposes.
    int s = i->second ? i->second : CEPH_OSD_UP;
    if (s & CEPH_OSD_UP)
      dout(2) << " osd." << i->first << " DOWN" << dendl;
    if (s & CEPH_OSD_EXISTS)
      dout(2) << " osd." << i->first << " DNE" << dendl;
  }
  for (map<int32_t,entity_addr_t>::iterator i = pending_inc.new_up_client.begin();
       i != pending_inc.new_up_client.end();
       ++i) {
    //FIXME: insert cluster addresses too
    dout(2) << " osd." << i->first << " UP " << i->second << dendl;
  }
  for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
       i != pending_inc.new_weight.end();
       ++i) {
    if (i->second == CEPH_OSD_OUT) {
      dout(2) << " osd." << i->first << " OUT" << dendl;
    } else if (i->second == CEPH_OSD_IN) {
      dout(2) << " osd." << i->first << " IN" << dendl;
    } else {
      dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
    }
  }

  // clean inappropriate pg_upmap/pg_upmap_items (if any)
  osdmap.maybe_remove_pg_upmaps(cct, osdmap, &pending_inc);

  // features for osdmap and its incremental
  uint64_t features;

  // encode full map and determine its crc
  OSDMap tmp;
  {
    tmp.deepish_copy_from(osdmap);
    tmp.apply_incremental(pending_inc);

    // determine appropriate features
    features = tmp.get_encoding_features();
    dout(10) << __func__ << " encoding full map with "
             << ceph_release_name(tmp.require_osd_release)
             << " features " << features << dendl;

    // the features should be a subset of the mon quorum's features!
    assert((features & ~mon->get_quorum_con_features()) == 0);

    bufferlist fullbl;
    ::encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
    pending_inc.full_crc = tmp.get_crc();

    // include full map in the txn. note that old monitors will
    // overwrite this. new ones will now skip the local full map
    // encode and reload from this.
    put_version_full(t, pending_inc.epoch, fullbl);
  }

  // encode
  assert(get_last_committed() + 1 == pending_inc.epoch);
  ::encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);

  dout(20) << " full_crc " << tmp.get_crc()
           << " inc_crc " << pending_inc.inc_crc << dendl;

  /* put everything in the transaction */
  put_version(t, pending_inc.epoch, bl);
  put_last_committed(t, pending_inc.epoch);

  // metadata, too!
  for (map<int,bufferlist>::iterator p = pending_metadata.begin();
       p != pending_metadata.end();
       ++p)
    t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
  for (set<int>::iterator p = pending_metadata_rm.begin();
       p != pending_metadata_rm.end();
       ++p)
    t->erase(OSD_METADATA_PREFIX, stringify(*p));
  pending_metadata.clear();
  pending_metadata_rm.clear();

  // and pg creating, also!
  if (mon->monmap->get_required_features().contains_all(
        ceph::features::mon::FEATURE_LUMINOUS)) {
    auto pending_creatings = update_pending_pgs(pending_inc, tmp);
    if (osdmap.get_epoch() &&
        osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
      dout(7) << __func__ << " in the middle of upgrading, "
              << " trimming pending creating_pgs using pgmap" << dendl;
      mon->pgservice->maybe_trim_creating_pgs(&pending_creatings);
    }
    bufferlist creatings_bl;
    ::encode(pending_creatings, creatings_bl);
    t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
  }

  // health
  health_check_map_t next;
  tmp.check_health(&next);
  encode_health(next, t);
}
1486
1487void OSDMonitor::trim_creating_pgs(creating_pgs_t* creating_pgs,
31f18b77 1488 const ceph::unordered_map<pg_t,pg_stat_t>& pg_stat)
7c673cae
FG
1489{
1490 auto p = creating_pgs->pgs.begin();
1491 while (p != creating_pgs->pgs.end()) {
31f18b77
FG
1492 auto q = pg_stat.find(p->first);
1493 if (q != pg_stat.end() &&
7c673cae
FG
1494 !(q->second.state & PG_STATE_CREATING)) {
1495 dout(20) << __func__ << " pgmap shows " << p->first << " is created"
1496 << dendl;
1497 p = creating_pgs->pgs.erase(p);
7c673cae
FG
1498 } else {
1499 ++p;
1500 }
1501 }
1502}
1503
1504int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
1505{
1506 bufferlist bl;
1507 int r = mon->store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
1508 if (r < 0)
1509 return r;
1510 try {
1511 bufferlist::iterator p = bl.begin();
1512 ::decode(m, p);
1513 }
1514 catch (buffer::error& e) {
1515 if (err)
1516 *err << "osd." << osd << " metadata is corrupt";
1517 return -EIO;
1518 }
1519 return 0;
1520}
1521
c07f9fc5 1522void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
31f18b77 1523{
31f18b77
FG
1524 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
1525 if (osdmap.is_up(osd)) {
1526 map<string,string> meta;
1527 load_metadata(osd, meta, nullptr);
1528 auto p = meta.find(field);
1529 if (p == meta.end()) {
c07f9fc5 1530 (*out)["unknown"]++;
31f18b77 1531 } else {
c07f9fc5 1532 (*out)[p->second]++;
31f18b77
FG
1533 }
1534 }
1535 }
c07f9fc5
FG
1536}
1537
1538void OSDMonitor::count_metadata(const string& field, Formatter *f)
1539{
1540 map<string,int> by_val;
1541 count_metadata(field, &by_val);
31f18b77
FG
1542 f->open_object_section(field.c_str());
1543 for (auto& p : by_val) {
1544 f->dump_int(p.first.c_str(), p.second);
1545 }
1546 f->close_section();
1547}
1548
7c673cae
FG
1549int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
1550{
1551 map<string, string> metadata;
1552 int r = load_metadata(osd, metadata, nullptr);
1553 if (r < 0)
1554 return r;
1555
1556 auto it = metadata.find("osd_objectstore");
1557 if (it == metadata.end())
1558 return -ENOENT;
1559 *type = it->second;
1560 return 0;
1561}
1562
1563bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
1564 const pg_pool_t &pool,
1565 ostream *err)
1566{
1567 // just check a few pgs for efficiency - this can't give a guarantee anyway,
1568 // since filestore osds could always join the pool later
1569 set<int> checked_osds;
1570 for (unsigned ps = 0; ps < MIN(8, pool.get_pg_num()); ++ps) {
1571 vector<int> up, acting;
1572 pg_t pgid(ps, pool_id, -1);
1573 osdmap.pg_to_up_acting_osds(pgid, up, acting);
1574 for (int osd : up) {
1575 if (checked_osds.find(osd) != checked_osds.end())
1576 continue;
1577 string objectstore_type;
1578 int r = get_osd_objectstore_type(osd, &objectstore_type);
1579 // allow with missing metadata, e.g. due to an osd never booting yet
1580 if (r < 0 || objectstore_type == "bluestore") {
1581 checked_osds.insert(osd);
1582 continue;
1583 }
1584 *err << "osd." << osd << " uses " << objectstore_type;
1585 return false;
1586 }
1587 }
1588 return true;
1589}
1590
1591int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
1592{
1593 map<string,string> m;
1594 if (int r = load_metadata(osd, m, err))
1595 return r;
1596 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
1597 f->dump_string(p->first.c_str(), p->second);
1598 return 0;
1599}
1600
1601void OSDMonitor::print_nodes(Formatter *f)
1602{
1603 // group OSDs by their hosts
1604 map<string, list<int> > osds; // hostname => osd
1605 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
1606 map<string, string> m;
1607 if (load_metadata(osd, m, NULL)) {
1608 continue;
1609 }
1610 map<string, string>::iterator hostname = m.find("hostname");
1611 if (hostname == m.end()) {
1612 // not likely though
1613 continue;
1614 }
1615 osds[hostname->second].push_back(osd);
1616 }
1617
1618 dump_services(f, osds, "osd");
1619}
1620
1621void OSDMonitor::share_map_with_random_osd()
1622{
1623 if (osdmap.get_num_up_osds() == 0) {
1624 dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
1625 return;
1626 }
1627
1628 MonSession *s = mon->session_map.get_random_osd_session(&osdmap);
1629 if (!s) {
1630 dout(10) << __func__ << " no up osd on our session map" << dendl;
1631 return;
1632 }
1633
1634 dout(10) << "committed, telling random " << s->inst << " all about it" << dendl;
28e407b8
AA
1635
1636 // get feature of the peer
1637 // use quorum_con_features, if it's an anonymous connection.
1638 uint64_t features = s->con_features ? s->con_features :
1639 mon->get_quorum_con_features();
7c673cae 1640 // whatev, they'll request more if they need it
28e407b8 1641 MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
7c673cae
FG
1642 s->con->send_message(m);
1643 // NOTE: do *not* record osd has up to this epoch (as we do
1644 // elsewhere) as they may still need to request older values.
1645}
1646
// Compute the newest osdmap epoch we may trim up to (exclusive floor), or
// 0 if trimming is not currently allowed.  The floor is the min
// last-epoch-clean across pgs/osds, optionally overridden by
// mon_osd_force_trim_to, and always keeps at least mon_min_osdmap_epochs
// of history.
version_t OSDMonitor::get_trim_to()
{
  if (mon->get_quorum().empty()) {
    dout(10) << __func__ << ": quorum not formed" << dendl;
    return 0;
  }

  epoch_t floor;
  if (mon->monmap->get_required_features().contains_all(
        ceph::features::mon::FEATURE_LUMINOUS)) {
    {
      // TODO: Get this hidden in PGStatService
      // never trim while pgs are still being created; their creation epoch
      // must remain available.
      std::lock_guard<std::mutex> l(creating_pgs_lock);
      if (!creating_pgs.pgs.empty()) {
        return 0;
      }
    }
    floor = get_min_last_epoch_clean();
  } else {
    // pre-luminous: equivalent checks via the legacy pg service.
    if (!mon->pgservice->is_readable())
      return 0;
    if (mon->pgservice->have_creating_pgs()) {
      return 0;
    }
    floor = mon->pgservice->get_min_last_epoch_clean();
  }
  {
    dout(10) << " min_last_epoch_clean " << floor << dendl;
    // operator override (testing/repair), still bounded by last_committed.
    if (g_conf->mon_osd_force_trim_to > 0 &&
        g_conf->mon_osd_force_trim_to < (int)get_last_committed()) {
      floor = g_conf->mon_osd_force_trim_to;
      dout(10) << " explicit mon_osd_force_trim_to = " << floor << dendl;
    }
    // always retain at least mon_min_osdmap_epochs of history.
    unsigned min = g_conf->mon_min_osdmap_epochs;
    if (floor + min > get_last_committed()) {
      if (min < get_last_committed())
        floor = get_last_committed() - min;
      else
        floor = 0;
    }
    // only worth trimming if the floor is past what we already trimmed to.
    if (floor > get_first_committed())
      return floor;
  }
  return 0;
}
1692
1693epoch_t OSDMonitor::get_min_last_epoch_clean() const
1694{
1695 auto floor = last_epoch_clean.get_lower_bound(osdmap);
1696 // also scan osd epochs
1697 // don't trim past the oldest reported osd epoch
1698 for (auto& osd_epoch : osd_epochs) {
1699 if (osd_epoch.second < floor) {
1700 floor = osd_epoch.second;
1701 }
1702 }
1703 return floor;
1704}
1705
1706void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
1707 version_t first)
1708{
1709 dout(10) << __func__ << " including full map for e " << first << dendl;
1710 bufferlist bl;
1711 get_version_full(first, bl);
1712 put_version_full(tx, first, bl);
1713}
1714
1715// -------------
1716
// First-pass (read-only) dispatch for every message handled by this
// service.  Returns true when the message was fully handled here; false
// means a map update is required and the message proceeds to
// prepare_update().
bool OSDMonitor::preprocess_query(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // READs
  case MSG_MON_COMMAND:
    return preprocess_command(op);
  case CEPH_MSG_MON_GET_OSDMAP:
    return preprocess_get_osdmap(op);

    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return preprocess_mark_me_down(op);
  case MSG_OSD_FULL:
    return preprocess_full(op);
  case MSG_OSD_FAILURE:
    return preprocess_failure(op);
  case MSG_OSD_BOOT:
    return preprocess_boot(op);
  case MSG_OSD_ALIVE:
    return preprocess_alive(op);
  case MSG_OSD_PG_CREATED:
    return preprocess_pg_created(op);
  case MSG_OSD_PGTEMP:
    return preprocess_pgtemp(op);
  case MSG_OSD_BEACON:
    return preprocess_beacon(op);

  case CEPH_MSG_POOLOP:
    return preprocess_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return preprocess_remove_snaps(op);

  default:
    // routing bug: this service should never be handed other types.
    ceph_abort();
    return true;
  }
}
1759
// Second-pass dispatch: the message needs to mutate the pending map.
// Each handler returns true if pending_inc was changed and a proposal
// should be scheduled.
bool OSDMonitor::prepare_update(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return prepare_mark_me_down(op);
  case MSG_OSD_FULL:
    return prepare_full(op);
  case MSG_OSD_FAILURE:
    return prepare_failure(op);
  case MSG_OSD_BOOT:
    return prepare_boot(op);
  case MSG_OSD_ALIVE:
    return prepare_alive(op);
  case MSG_OSD_PG_CREATED:
    return prepare_pg_created(op);
  case MSG_OSD_PGTEMP:
    return prepare_pgtemp(op);
  case MSG_OSD_BEACON:
    return prepare_beacon(op);

  case MSG_MON_COMMAND:
    return prepare_command(op);

  case CEPH_MSG_POOLOP:
    return prepare_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return prepare_remove_snaps(op);

  default:
    // must mirror the preprocess_query() dispatch set.
    ceph_abort();
  }

  return false;
}
1801
1802bool OSDMonitor::should_propose(double& delay)
1803{
1804 dout(10) << "should_propose" << dendl;
1805
1806 // if full map, propose immediately! any subsequent changes will be clobbered.
1807 if (pending_inc.fullmap.length())
1808 return true;
1809
1810 // adjust osd weights?
1811 if (!osd_weight.empty() &&
1812 osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
1813 dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
1814 osdmap.adjust_osd_weights(osd_weight, pending_inc);
1815 delay = 0.0;
1816 osd_weight.clear();
1817 return true;
1818 }
1819
7c673cae
FG
1820 return PaxosService::should_propose(delay);
1821}
1822
1823
1824
1825// ---------------------------
1826// READs
1827
// Answer an explicit MMonGetOSDMap request with the requested window of
// full and incremental maps, capped by osd_map_message_max entries total.
bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MMonGetOSDMap *m = static_cast<MMonGetOSDMap*>(op->get_req());

  // encode for the requester's connection features when known, else for
  // the quorum's common feature set.
  uint64_t features = mon->get_quorum_con_features();
  if (m->get_session() && m->get_session()->con_features)
    features = m->get_session()->con_features;

  dout(10) << __func__ << " " << *m << dendl;
  MOSDMap *reply = new MOSDMap(mon->monmap->fsid, features);
  epoch_t first = get_first_committed();
  epoch_t last = osdmap.get_epoch();
  // `max` is a shared budget across both loops below.
  int max = g_conf->osd_map_message_max;
  // full maps: intersection of what we have [first,last] and what was asked.
  for (epoch_t e = MAX(first, m->get_full_first());
       e <= MIN(last, m->get_full_last()) && max > 0;
       ++e, --max) {
    int r = get_version_full(e, features, reply->maps[e]);
    assert(r >= 0);
  }
  // incrementals: same windowing, consuming the remaining budget.
  for (epoch_t e = MAX(first, m->get_inc_first());
       e <= MIN(last, m->get_inc_last()) && max > 0;
       ++e, --max) {
    int r = get_version(e, features, reply->incremental_maps[e]);
    assert(r >= 0);
  }
  reply->oldest_map = first;
  reply->newest_map = last;
  mon->send_reply(op, reply);
  return true;
}
1859
1860
1861// ---------------------------
1862// UPDATEs
1863
1864// failure --
1865
1866bool OSDMonitor::check_source(PaxosServiceMessage *m, uuid_d fsid) {
1867 // check permissions
1868 MonSession *session = m->get_session();
1869 if (!session)
1870 return true;
1871 if (!session->is_capable("osd", MON_CAP_X)) {
1872 dout(0) << "got MOSDFailure from entity with insufficient caps "
1873 << session->caps << dendl;
1874 return true;
1875 }
1876 if (fsid != mon->monmap->fsid) {
1877 dout(0) << "check_source: on fsid " << fsid
1878 << " != " << mon->monmap->fsid << dendl;
1879 return true;
1880 }
1881 return false;
1882}
1883
1884
// Screen an incoming failure report.  Returns true (message consumed) for
// invalid, stale or duplicate reports; returns false only when the report
// is new and actionable, so it falls through to prepare_failure().
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
  // who is target_osd
  int badboy = m->get_target().name.num();

  // check permissions
  if (check_source(m, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
        osdmap.get_addr(from) != m->get_orig_source_inst().addr ||
        (osdmap.is_down(from) && m->if_osd_failed())) {
      // reporter is unknown, stale, or itself marked down; send it the
      // newer map so it can catch up.
      dout(5) << "preprocess_failure from dead osd." << from << ", ignoring" << dendl;
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?
  if (osdmap.is_down(badboy)) {
    dout(5) << "preprocess_failure dne(/dup?): " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  if (osdmap.get_inst(badboy) != m->get_target()) {
    // report refers to an older instance of this osd id.
    dout(5) << "preprocess_failure wrong osd: report " << m->get_target() << " != map's " << osdmap.get_inst(badboy)
            << ", from " << m->get_orig_source_inst() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    dout(5) << "preprocess_failure dup/old: " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  if (!can_mark_down(badboy)) {
    // e.g. noout/nodown flags or down-out limits forbid it right now.
    dout(5) << "preprocess_failure ignoring report of " << m->get_target() << " from " << m->get_orig_source_inst() << dendl;
    goto didit;
  }

  dout(10) << "preprocess_failure new: " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
  return false;

 didit:
  mon->no_reply(op);
  return true;
}
1945
1946class C_AckMarkedDown : public C_MonOp {
1947 OSDMonitor *osdmon;
1948public:
1949 C_AckMarkedDown(
1950 OSDMonitor *osdmon,
1951 MonOpRequestRef op)
1952 : C_MonOp(op), osdmon(osdmon) {}
1953
1954 void _finish(int) override {
1955 MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
1956 osdmon->mon->send_reply(
1957 op,
1958 new MOSDMarkMeDown(
1959 m->fsid,
1960 m->get_target(),
1961 m->get_epoch(),
1962 false)); // ACK itself does not request an ack
1963 }
1964 ~C_AckMarkedDown() override {
1965 }
1966};
1967
1968bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
1969{
1970 op->mark_osdmon_event(__func__);
1971 MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
1972 int requesting_down = m->get_target().name.num();
1973 int from = m->get_orig_source().num();
1974
1975 // check permissions
1976 if (check_source(m, m->fsid))
1977 goto reply;
1978
1979 // first, verify the reporting host is valid
1980 if (!m->get_orig_source().is_osd())
1981 goto reply;
1982
1983 if (!osdmap.exists(from) ||
1984 osdmap.is_down(from) ||
1985 osdmap.get_addr(from) != m->get_target().addr) {
1986 dout(5) << "preprocess_mark_me_down from dead osd."
1987 << from << ", ignoring" << dendl;
1988 send_incremental(op, m->get_epoch()+1);
1989 goto reply;
1990 }
1991
1992 // no down might be set
1993 if (!can_mark_down(requesting_down))
1994 goto reply;
1995
1996 dout(10) << "MOSDMarkMeDown for: " << m->get_target() << dendl;
1997 return false;
1998
1999 reply:
2000 if (m->request_ack) {
2001 Context *c(new C_AckMarkedDown(this, op));
2002 c->complete(0);
2003 }
2004 return true;
2005}
2006
2007bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
2008{
2009 op->mark_osdmon_event(__func__);
2010 MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
2011 int target_osd = m->get_target().name.num();
2012
2013 assert(osdmap.is_up(target_osd));
2014 assert(osdmap.get_addr(target_osd) == m->get_target().addr);
2015
2016 mon->clog->info() << "osd." << target_osd << " marked itself down";
2017 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
2018 if (m->request_ack)
2019 wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
2020 return true;
2021}
2022
2023bool OSDMonitor::can_mark_down(int i)
2024{
2025 if (osdmap.test_flag(CEPH_OSDMAP_NODOWN)) {
31f18b77
FG
2026 dout(5) << __func__ << " NODOWN flag set, will not mark osd." << i
2027 << " down" << dendl;
2028 return false;
2029 }
2030
2031 if (osdmap.is_nodown(i)) {
2032 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
2033 << "will not mark it down" << dendl;
7c673cae
FG
2034 return false;
2035 }
31f18b77 2036
7c673cae
FG
2037 int num_osds = osdmap.get_num_osds();
2038 if (num_osds == 0) {
31f18b77 2039 dout(5) << __func__ << " no osds" << dendl;
7c673cae
FG
2040 return false;
2041 }
2042 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
2043 float up_ratio = (float)up / (float)num_osds;
2044 if (up_ratio < g_conf->mon_osd_min_up_ratio) {
31f18b77 2045 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
7c673cae
FG
2046 << g_conf->mon_osd_min_up_ratio
2047 << ", will not mark osd." << i << " down" << dendl;
2048 return false;
2049 }
2050 return true;
2051}
2052
2053bool OSDMonitor::can_mark_up(int i)
2054{
2055 if (osdmap.test_flag(CEPH_OSDMAP_NOUP)) {
31f18b77
FG
2056 dout(5) << __func__ << " NOUP flag set, will not mark osd." << i
2057 << " up" << dendl;
2058 return false;
2059 }
2060
2061 if (osdmap.is_noup(i)) {
2062 dout(5) << __func__ << " osd." << i << " is marked as noup, "
2063 << "will not mark it up" << dendl;
7c673cae
FG
2064 return false;
2065 }
31f18b77 2066
7c673cae
FG
2067 return true;
2068}
2069
2070/**
2071 * @note the parameter @p i apparently only exists here so we can output the
2072 * osd's id on messages.
2073 */
2074bool OSDMonitor::can_mark_out(int i)
2075{
2076 if (osdmap.test_flag(CEPH_OSDMAP_NOOUT)) {
2077 dout(5) << __func__ << " NOOUT flag set, will not mark osds out" << dendl;
2078 return false;
2079 }
31f18b77
FG
2080
2081 if (osdmap.is_noout(i)) {
2082 dout(5) << __func__ << " osd." << i << " is marked as noout, "
2083 << "will not mark it out" << dendl;
2084 return false;
2085 }
2086
7c673cae
FG
2087 int num_osds = osdmap.get_num_osds();
2088 if (num_osds == 0) {
2089 dout(5) << __func__ << " no osds" << dendl;
2090 return false;
2091 }
2092 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
2093 float in_ratio = (float)in / (float)num_osds;
2094 if (in_ratio < g_conf->mon_osd_min_in_ratio) {
2095 if (i >= 0)
2096 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
2097 << g_conf->mon_osd_min_in_ratio
2098 << ", will not mark osd." << i << " out" << dendl;
2099 else
2100 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
2101 << g_conf->mon_osd_min_in_ratio
2102 << ", will not mark osds out" << dendl;
2103 return false;
2104 }
2105
2106 return true;
2107}
2108
2109bool OSDMonitor::can_mark_in(int i)
2110{
2111 if (osdmap.test_flag(CEPH_OSDMAP_NOIN)) {
31f18b77
FG
2112 dout(5) << __func__ << " NOIN flag set, will not mark osd." << i
2113 << " in" << dendl;
2114 return false;
2115 }
2116
2117 if (osdmap.is_noin(i)) {
2118 dout(5) << __func__ << " osd." << i << " is marked as noin, "
2119 << "will not mark it in" << dendl;
7c673cae
FG
2120 return false;
2121 }
31f18b77 2122
7c673cae
FG
2123 return true;
2124}
2125
2126bool OSDMonitor::check_failures(utime_t now)
2127{
2128 bool found_failure = false;
2129 for (map<int,failure_info_t>::iterator p = failure_info.begin();
2130 p != failure_info.end();
2131 ++p) {
2132 if (can_mark_down(p->first)) {
2133 found_failure |= check_failure(now, p->first, p->second);
2134 }
2135 }
2136 return found_failure;
2137}
2138
2139bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
2140{
2141 // already pending failure?
2142 if (pending_inc.new_state.count(target_osd) &&
2143 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
2144 dout(10) << " already pending failure" << dendl;
2145 return true;
2146 }
2147
2148 set<string> reporters_by_subtree;
2149 string reporter_subtree_level = g_conf->mon_osd_reporter_subtree_level;
2150 utime_t orig_grace(g_conf->osd_heartbeat_grace, 0);
2151 utime_t max_failed_since = fi.get_failed_since();
2152 utime_t failed_for = now - max_failed_since;
2153
2154 utime_t grace = orig_grace;
2155 double my_grace = 0, peer_grace = 0;
2156 double decay_k = 0;
2157 if (g_conf->mon_osd_adjust_heartbeat_grace) {
2158 double halflife = (double)g_conf->mon_osd_laggy_halflife;
2159 decay_k = ::log(.5) / halflife;
2160
2161 // scale grace period based on historical probability of 'lagginess'
2162 // (false positive failures due to slowness).
2163 const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
2164 double decay = exp((double)failed_for * decay_k);
2165 dout(20) << " halflife " << halflife << " decay_k " << decay_k
2166 << " failed_for " << failed_for << " decay " << decay << dendl;
2167 my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
2168 grace += my_grace;
2169 }
2170
2171 // consider the peers reporting a failure a proxy for a potential
2172 // 'subcluster' over the overall cluster that is similarly
2173 // laggy. this is clearly not true in all cases, but will sometimes
2174 // help us localize the grace correction to a subset of the system
2175 // (say, a rack with a bad switch) that is unhappy.
2176 assert(fi.reporters.size());
2177 for (map<int,failure_reporter_t>::iterator p = fi.reporters.begin();
2178 p != fi.reporters.end();
2179 ++p) {
2180 // get the parent bucket whose type matches with "reporter_subtree_level".
2181 // fall back to OSD if the level doesn't exist.
2182 map<string, string> reporter_loc = osdmap.crush->get_full_location(p->first);
2183 map<string, string>::iterator iter = reporter_loc.find(reporter_subtree_level);
2184 if (iter == reporter_loc.end()) {
2185 reporters_by_subtree.insert("osd." + to_string(p->first));
2186 } else {
2187 reporters_by_subtree.insert(iter->second);
2188 }
2189 if (g_conf->mon_osd_adjust_heartbeat_grace) {
2190 const osd_xinfo_t& xi = osdmap.get_xinfo(p->first);
2191 utime_t elapsed = now - xi.down_stamp;
2192 double decay = exp((double)elapsed * decay_k);
2193 peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
2194 }
2195 }
2196
2197 if (g_conf->mon_osd_adjust_heartbeat_grace) {
2198 peer_grace /= (double)fi.reporters.size();
2199 grace += peer_grace;
2200 }
2201
2202 dout(10) << " osd." << target_osd << " has "
2203 << fi.reporters.size() << " reporters, "
2204 << grace << " grace (" << orig_grace << " + " << my_grace
2205 << " + " << peer_grace << "), max_failed_since " << max_failed_since
2206 << dendl;
2207
2208 if (failed_for >= grace &&
2209 (int)reporters_by_subtree.size() >= g_conf->mon_osd_min_down_reporters) {
2210 dout(1) << " we have enough reporters to mark osd." << target_osd
2211 << " down" << dendl;
2212 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
2213
31f18b77
FG
2214 mon->clog->info() << "osd." << target_osd << " failed ("
2215 << osdmap.crush->get_full_location_ordered_string(
2216 target_osd)
2217 << ") ("
2218 << (int)reporters_by_subtree.size()
2219 << " reporters from different "
7c673cae
FG
2220 << reporter_subtree_level << " after "
2221 << failed_for << " >= grace " << grace << ")";
2222 return true;
2223 }
2224 return false;
2225}
2226
224ce89b 2227void OSDMonitor::force_failure(int target_osd, int by)
7c673cae
FG
2228{
2229 // already pending failure?
2230 if (pending_inc.new_state.count(target_osd) &&
2231 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
2232 dout(10) << " already pending failure" << dendl;
2233 return;
2234 }
2235
2236 dout(1) << " we're forcing failure of osd." << target_osd << dendl;
2237 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
2238
31f18b77
FG
2239 mon->clog->info() << "osd." << target_osd << " failed ("
2240 << osdmap.crush->get_full_location_ordered_string(target_osd)
2241 << ") (connection refused reported by osd." << by << ")";
7c673cae
FG
2242 return;
2243}
2244
2245bool OSDMonitor::prepare_failure(MonOpRequestRef op)
2246{
2247 op->mark_osdmon_event(__func__);
2248 MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
2249 dout(1) << "prepare_failure " << m->get_target()
2250 << " from " << m->get_orig_source_inst()
2251 << " is reporting failure:" << m->if_osd_failed() << dendl;
2252
2253 int target_osd = m->get_target().name.num();
2254 int reporter = m->get_orig_source().num();
2255 assert(osdmap.is_up(target_osd));
2256 assert(osdmap.get_addr(target_osd) == m->get_target().addr);
2257
2258 if (m->if_osd_failed()) {
2259 // calculate failure time
2260 utime_t now = ceph_clock_now();
2261 utime_t failed_since =
2262 m->get_recv_stamp() - utime_t(m->failed_for, 0);
2263
2264 // add a report
2265 if (m->is_immediate()) {
2266 mon->clog->debug() << m->get_target() << " reported immediately failed by "
2267 << m->get_orig_source_inst();
224ce89b 2268 force_failure(target_osd, reporter);
94b18763 2269 mon->no_reply(op);
7c673cae
FG
2270 return true;
2271 }
2272 mon->clog->debug() << m->get_target() << " reported failed by "
2273 << m->get_orig_source_inst();
2274
2275 failure_info_t& fi = failure_info[target_osd];
2276 MonOpRequestRef old_op = fi.add_report(reporter, failed_since, op);
2277 if (old_op) {
2278 mon->no_reply(old_op);
2279 }
2280
2281 return check_failure(now, target_osd, fi);
2282 } else {
2283 // remove the report
2284 mon->clog->debug() << m->get_target() << " failure report canceled by "
2285 << m->get_orig_source_inst();
2286 if (failure_info.count(target_osd)) {
2287 failure_info_t& fi = failure_info[target_osd];
2288 MonOpRequestRef report_op = fi.cancel_report(reporter);
2289 if (report_op) {
2290 mon->no_reply(report_op);
2291 }
2292 if (fi.reporters.empty()) {
2293 dout(10) << " removing last failure_info for osd." << target_osd
2294 << dendl;
2295 failure_info.erase(target_osd);
2296 } else {
2297 dout(10) << " failure_info for osd." << target_osd << " now "
2298 << fi.reporters.size() << " reporters" << dendl;
2299 }
2300 } else {
2301 dout(10) << " no failure_info for osd." << target_osd << dendl;
2302 }
2303 mon->no_reply(op);
2304 }
2305
2306 return false;
2307}
2308
2309void OSDMonitor::process_failures()
2310{
2311 map<int,failure_info_t>::iterator p = failure_info.begin();
2312 while (p != failure_info.end()) {
2313 if (osdmap.is_up(p->first)) {
2314 ++p;
2315 } else {
2316 dout(10) << "process_failures osd." << p->first << dendl;
2317 list<MonOpRequestRef> ls;
2318 p->second.take_report_messages(ls);
2319 failure_info.erase(p++);
2320
2321 while (!ls.empty()) {
2322 MonOpRequestRef o = ls.front();
2323 if (o) {
2324 o->mark_event(__func__);
2325 MOSDFailure *m = o->get_req<MOSDFailure>();
2326 send_latest(o, m->get_epoch());
28e407b8 2327 mon->no_reply(o);
7c673cae
FG
2328 }
2329 ls.pop_front();
2330 }
2331 }
2332 }
2333}
2334
2335void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
2336{
2337 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
2338
2339 for (map<int,failure_info_t>::iterator p = failure_info.begin();
2340 p != failure_info.end();
2341 ++p) {
2342 p->second.take_report_messages(ls);
2343 }
2344 failure_info.clear();
2345}
2346
2347
2348// boot --
2349
2350bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
2351{
2352 op->mark_osdmon_event(__func__);
2353 MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
2354 int from = m->get_orig_source_inst().name.num();
2355
2356 // check permissions, ignore if failed (no response expected)
2357 MonSession *session = m->get_session();
2358 if (!session)
2359 goto ignore;
2360 if (!session->is_capable("osd", MON_CAP_X)) {
2361 dout(0) << "got preprocess_boot message from entity with insufficient caps"
2362 << session->caps << dendl;
2363 goto ignore;
2364 }
2365
2366 if (m->sb.cluster_fsid != mon->monmap->fsid) {
2367 dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
2368 << " != " << mon->monmap->fsid << dendl;
2369 goto ignore;
2370 }
2371
2372 if (m->get_orig_source_inst().addr.is_blank_ip()) {
2373 dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
2374 goto ignore;
2375 }
2376
2377 assert(m->get_orig_source_inst().name.is_osd());
2378
2379 // check if osd has required features to boot
2380 if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
2381 CEPH_FEATURE_OSD_ERASURE_CODES) &&
2382 !(m->get_connection()->get_features() & CEPH_FEATURE_OSD_ERASURE_CODES)) {
2383 dout(0) << __func__ << " osdmap requires erasure code but osd at "
2384 << m->get_orig_source_inst()
2385 << " doesn't announce support -- ignore" << dendl;
2386 goto ignore;
2387 }
2388
2389 if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
2390 CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2) &&
2391 !(m->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2)) {
2392 dout(0) << __func__ << " osdmap requires erasure code plugins v2 but osd at "
2393 << m->get_orig_source_inst()
2394 << " doesn't announce support -- ignore" << dendl;
2395 goto ignore;
2396 }
2397
2398 if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
2399 CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3) &&
2400 !(m->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3)) {
2401 dout(0) << __func__ << " osdmap requires erasure code plugins v3 but osd at "
2402 << m->get_orig_source_inst()
2403 << " doesn't announce support -- ignore" << dendl;
2404 goto ignore;
2405 }
2406
31f18b77 2407 if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
7c673cae
FG
2408 !HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) {
2409 mon->clog->info() << "disallowing boot of OSD "
2410 << m->get_orig_source_inst()
2411 << " because the osdmap requires"
2412 << " CEPH_FEATURE_SERVER_LUMINOUS"
2413 << " but the osd lacks CEPH_FEATURE_SERVER_LUMINOUS";
2414 goto ignore;
2415 }
2416
31f18b77 2417 if (osdmap.require_osd_release >= CEPH_RELEASE_JEWEL &&
7c673cae
FG
2418 !(m->osd_features & CEPH_FEATURE_SERVER_JEWEL)) {
2419 mon->clog->info() << "disallowing boot of OSD "
2420 << m->get_orig_source_inst()
2421 << " because the osdmap requires"
2422 << " CEPH_FEATURE_SERVER_JEWEL"
2423 << " but the osd lacks CEPH_FEATURE_SERVER_JEWEL";
2424 goto ignore;
2425 }
2426
31f18b77 2427 if (osdmap.require_osd_release >= CEPH_RELEASE_KRAKEN &&
7c673cae
FG
2428 !HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) {
2429 mon->clog->info() << "disallowing boot of OSD "
2430 << m->get_orig_source_inst()
2431 << " because the osdmap requires"
2432 << " CEPH_FEATURE_SERVER_KRAKEN"
2433 << " but the osd lacks CEPH_FEATURE_SERVER_KRAKEN";
2434 goto ignore;
2435 }
2436
2437 if (osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE) &&
2438 !(m->osd_features & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)) {
2439 mon->clog->info() << "disallowing boot of OSD "
2440 << m->get_orig_source_inst()
2441 << " because 'sortbitwise' osdmap flag is set and OSD lacks the OSD_BITWISE_HOBJ_SORT feature";
2442 goto ignore;
2443 }
2444
c07f9fc5
FG
2445 if (osdmap.test_flag(CEPH_OSDMAP_RECOVERY_DELETES) &&
2446 !(m->osd_features & CEPH_FEATURE_OSD_RECOVERY_DELETES)) {
2447 mon->clog->info() << "disallowing boot of OSD "
2448 << m->get_orig_source_inst()
2449 << " because 'recovery_deletes' osdmap flag is set and OSD lacks the OSD_RECOVERY_DELETES feature";
2450 goto ignore;
2451 }
2452
7c673cae
FG
2453 if (any_of(osdmap.get_pools().begin(),
2454 osdmap.get_pools().end(),
2455 [](const std::pair<int64_t,pg_pool_t>& pool)
2456 { return pool.second.use_gmt_hitset; })) {
2457 assert(osdmap.get_num_up_osds() == 0 ||
2458 osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT);
2459 if (!(m->osd_features & CEPH_FEATURE_OSD_HITSET_GMT)) {
2460 dout(0) << __func__ << " one or more pools uses GMT hitsets but osd at "
2461 << m->get_orig_source_inst()
2462 << " doesn't announce support -- ignore" << dendl;
2463 goto ignore;
2464 }
2465 }
2466
2467 // make sure upgrades stop at luminous
2468 if (HAVE_FEATURE(m->osd_features, SERVER_M) &&
31f18b77 2469 osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
7c673cae
FG
2470 mon->clog->info() << "disallowing boot of post-luminous OSD "
2471 << m->get_orig_source_inst()
31f18b77 2472 << " because require_osd_release < luminous";
7c673cae
FG
2473 goto ignore;
2474 }
2475
2476 // make sure upgrades stop at jewel
2477 if (HAVE_FEATURE(m->osd_features, SERVER_KRAKEN) &&
31f18b77 2478 osdmap.require_osd_release < CEPH_RELEASE_JEWEL) {
7c673cae
FG
2479 mon->clog->info() << "disallowing boot of post-jewel OSD "
2480 << m->get_orig_source_inst()
31f18b77 2481 << " because require_osd_release < jewel";
7c673cae
FG
2482 goto ignore;
2483 }
2484
2485 // make sure upgrades stop at hammer
2486 // * HAMMER_0_94_4 is the required hammer feature
2487 // * MON_METADATA is the first post-hammer feature
2488 if (osdmap.get_num_up_osds() > 0) {
2489 if ((m->osd_features & CEPH_FEATURE_MON_METADATA) &&
2490 !(osdmap.get_up_osd_features() & CEPH_FEATURE_HAMMER_0_94_4)) {
2491 mon->clog->info() << "disallowing boot of post-hammer OSD "
2492 << m->get_orig_source_inst()
2493 << " because one or more up OSDs is pre-hammer v0.94.4";
2494 goto ignore;
2495 }
2496 if (!(m->osd_features & CEPH_FEATURE_HAMMER_0_94_4) &&
2497 (osdmap.get_up_osd_features() & CEPH_FEATURE_MON_METADATA)) {
2498 mon->clog->info() << "disallowing boot of pre-hammer v0.94.4 OSD "
2499 << m->get_orig_source_inst()
2500 << " because all up OSDs are post-hammer";
2501 goto ignore;
2502 }
2503 }
2504
2505 // already booted?
2506 if (osdmap.is_up(from) &&
2507 osdmap.get_inst(from) == m->get_orig_source_inst() &&
2508 osdmap.get_cluster_addr(from) == m->cluster_addr) {
2509 // yup.
2510 dout(7) << "preprocess_boot dup from " << m->get_orig_source_inst()
2511 << " == " << osdmap.get_inst(from) << dendl;
2512 _booted(op, false);
2513 return true;
2514 }
2515
2516 if (osdmap.exists(from) &&
2517 !osdmap.get_uuid(from).is_zero() &&
2518 osdmap.get_uuid(from) != m->sb.osd_fsid) {
2519 dout(7) << __func__ << " from " << m->get_orig_source_inst()
2520 << " clashes with existing osd: different fsid"
2521 << " (ours: " << osdmap.get_uuid(from)
2522 << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
2523 goto ignore;
2524 }
2525
2526 if (osdmap.exists(from) &&
2527 osdmap.get_info(from).up_from > m->version &&
2528 osdmap.get_most_recent_inst(from) == m->get_orig_source_inst()) {
2529 dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
2530 send_latest(op, m->sb.current_epoch+1);
2531 return true;
2532 }
2533
2534 // noup?
2535 if (!can_mark_up(from)) {
2536 dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
2537 send_latest(op, m->sb.current_epoch+1);
2538 return true;
2539 }
2540
2541 dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
2542 return false;
2543
2544 ignore:
2545 return true;
2546}
2547
2548bool OSDMonitor::prepare_boot(MonOpRequestRef op)
2549{
2550 op->mark_osdmon_event(__func__);
2551 MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
2552 dout(7) << __func__ << " from " << m->get_orig_source_inst() << " sb " << m->sb
2553 << " cluster_addr " << m->cluster_addr
2554 << " hb_back_addr " << m->hb_back_addr
2555 << " hb_front_addr " << m->hb_front_addr
2556 << dendl;
2557
2558 assert(m->get_orig_source().is_osd());
2559 int from = m->get_orig_source().num();
2560
2561 // does this osd exist?
2562 if (from >= osdmap.get_max_osd()) {
2563 dout(1) << "boot from osd." << from << " >= max_osd "
2564 << osdmap.get_max_osd() << dendl;
2565 return false;
2566 }
2567
2568 int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
2569 if (pending_inc.new_state.count(from))
2570 oldstate ^= pending_inc.new_state[from];
2571
2572 // already up? mark down first?
2573 if (osdmap.is_up(from)) {
2574 dout(7) << __func__ << " was up, first marking down "
2575 << osdmap.get_inst(from) << dendl;
2576 // preprocess should have caught these; if not, assert.
2577 assert(osdmap.get_inst(from) != m->get_orig_source_inst() ||
2578 osdmap.get_cluster_addr(from) != m->cluster_addr);
2579 assert(osdmap.get_uuid(from) == m->sb.osd_fsid);
2580
2581 if (pending_inc.new_state.count(from) == 0 ||
2582 (pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
2583 // mark previous guy down
2584 pending_inc.new_state[from] = CEPH_OSD_UP;
2585 }
2586 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
2587 } else if (pending_inc.new_up_client.count(from)) {
2588 // already prepared, just wait
2589 dout(7) << __func__ << " already prepared, waiting on "
2590 << m->get_orig_source_addr() << dendl;
2591 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
2592 } else {
2593 // mark new guy up.
2594 pending_inc.new_up_client[from] = m->get_orig_source_addr();
2595 if (!m->cluster_addr.is_blank_ip())
2596 pending_inc.new_up_cluster[from] = m->cluster_addr;
2597 pending_inc.new_hb_back_up[from] = m->hb_back_addr;
2598 if (!m->hb_front_addr.is_blank_ip())
2599 pending_inc.new_hb_front_up[from] = m->hb_front_addr;
2600
2601 down_pending_out.erase(from); // if any
2602
2603 if (m->sb.weight)
2604 osd_weight[from] = m->sb.weight;
2605
2606 // set uuid?
2607 dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
2608 << dendl;
2609 if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
2610 // preprocess should have caught this; if not, assert.
2611 assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
2612 pending_inc.new_uuid[from] = m->sb.osd_fsid;
2613 }
2614
2615 // fresh osd?
2616 if (m->sb.newest_map == 0 && osdmap.exists(from)) {
2617 const osd_info_t& i = osdmap.get_info(from);
2618 if (i.up_from > i.lost_at) {
2619 dout(10) << " fresh osd; marking lost_at too" << dendl;
2620 pending_inc.new_lost[from] = osdmap.get_epoch();
2621 }
2622 }
2623
2624 // metadata
2625 bufferlist osd_metadata;
2626 ::encode(m->metadata, osd_metadata);
2627 pending_metadata[from] = osd_metadata;
31f18b77 2628 pending_metadata_rm.erase(from);
7c673cae
FG
2629
2630 // adjust last clean unmount epoch?
2631 const osd_info_t& info = osdmap.get_info(from);
2632 dout(10) << " old osd_info: " << info << dendl;
2633 if (m->sb.mounted > info.last_clean_begin ||
2634 (m->sb.mounted == info.last_clean_begin &&
2635 m->sb.clean_thru > info.last_clean_end)) {
2636 epoch_t begin = m->sb.mounted;
2637 epoch_t end = m->sb.clean_thru;
2638
2639 dout(10) << __func__ << " osd." << from << " last_clean_interval "
2640 << "[" << info.last_clean_begin << "," << info.last_clean_end
2641 << ") -> [" << begin << "-" << end << ")"
2642 << dendl;
2643 pending_inc.new_last_clean_interval[from] =
2644 pair<epoch_t,epoch_t>(begin, end);
2645 }
2646
2647 osd_xinfo_t xi = osdmap.get_xinfo(from);
2648 if (m->boot_epoch == 0) {
2649 xi.laggy_probability *= (1.0 - g_conf->mon_osd_laggy_weight);
2650 xi.laggy_interval *= (1.0 - g_conf->mon_osd_laggy_weight);
2651 dout(10) << " not laggy, new xi " << xi << dendl;
2652 } else {
2653 if (xi.down_stamp.sec()) {
2654 int interval = ceph_clock_now().sec() -
2655 xi.down_stamp.sec();
2656 if (g_conf->mon_osd_laggy_max_interval &&
2657 (interval > g_conf->mon_osd_laggy_max_interval)) {
2658 interval = g_conf->mon_osd_laggy_max_interval;
2659 }
2660 xi.laggy_interval =
2661 interval * g_conf->mon_osd_laggy_weight +
2662 xi.laggy_interval * (1.0 - g_conf->mon_osd_laggy_weight);
2663 }
2664 xi.laggy_probability =
2665 g_conf->mon_osd_laggy_weight +
2666 xi.laggy_probability * (1.0 - g_conf->mon_osd_laggy_weight);
2667 dout(10) << " laggy, now xi " << xi << dendl;
2668 }
2669
2670 // set features shared by the osd
2671 if (m->osd_features)
2672 xi.features = m->osd_features;
2673 else
2674 xi.features = m->get_connection()->get_features();
2675
2676 // mark in?
2677 if ((g_conf->mon_osd_auto_mark_auto_out_in &&
2678 (oldstate & CEPH_OSD_AUTOOUT)) ||
2679 (g_conf->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
2680 (g_conf->mon_osd_auto_mark_in)) {
2681 if (can_mark_in(from)) {
2682 if (osdmap.osd_xinfo[from].old_weight > 0) {
2683 pending_inc.new_weight[from] = osdmap.osd_xinfo[from].old_weight;
2684 xi.old_weight = 0;
2685 } else {
2686 pending_inc.new_weight[from] = CEPH_OSD_IN;
2687 }
2688 } else {
2689 dout(7) << __func__ << " NOIN set, will not mark in "
2690 << m->get_orig_source_addr() << dendl;
2691 }
2692 }
2693
2694 pending_inc.new_xinfo[from] = xi;
2695
2696 // wait
2697 wait_for_finished_proposal(op, new C_Booted(this, op));
2698 }
2699 return true;
2700}
2701
2702void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
2703{
2704 op->mark_osdmon_event(__func__);
2705 MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
2706 dout(7) << "_booted " << m->get_orig_source_inst()
2707 << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
2708
2709 if (logit) {
2710 mon->clog->info() << m->get_orig_source_inst() << " boot";
2711 }
2712
2713 send_latest(op, m->sb.current_epoch+1);
2714}
2715
2716
2717// -------------
2718// full
2719
2720bool OSDMonitor::preprocess_full(MonOpRequestRef op)
2721{
2722 op->mark_osdmon_event(__func__);
2723 MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
2724 int from = m->get_orig_source().num();
2725 set<string> state;
2726 unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
2727
2728 // check permissions, ignore if failed
2729 MonSession *session = m->get_session();
2730 if (!session)
2731 goto ignore;
2732 if (!session->is_capable("osd", MON_CAP_X)) {
2733 dout(0) << "MOSDFull from entity with insufficient privileges:"
2734 << session->caps << dendl;
2735 goto ignore;
2736 }
2737
2738 // ignore a full message from the osd instance that already went down
2739 if (!osdmap.exists(from)) {
2740 dout(7) << __func__ << " ignoring full message from nonexistent "
2741 << m->get_orig_source_inst() << dendl;
2742 goto ignore;
2743 }
2744 if ((!osdmap.is_up(from) &&
2745 osdmap.get_most_recent_inst(from) == m->get_orig_source_inst()) ||
2746 (osdmap.is_up(from) &&
2747 osdmap.get_inst(from) != m->get_orig_source_inst())) {
2748 dout(7) << __func__ << " ignoring full message from down "
2749 << m->get_orig_source_inst() << dendl;
2750 goto ignore;
2751 }
2752
2753 OSDMap::calc_state_set(osdmap.get_state(from), state);
2754
2755 if ((osdmap.get_state(from) & mask) == m->state) {
2756 dout(7) << __func__ << " state already " << state << " for osd." << from
2757 << " " << m->get_orig_source_inst() << dendl;
2758 _reply_map(op, m->version);
2759 goto ignore;
2760 }
2761
2762 dout(10) << __func__ << " want state " << state << " for osd." << from
2763 << " " << m->get_orig_source_inst() << dendl;
2764 return false;
2765
2766 ignore:
2767 return true;
2768}
2769
2770bool OSDMonitor::prepare_full(MonOpRequestRef op)
2771{
2772 op->mark_osdmon_event(__func__);
2773 const MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
2774 const int from = m->get_orig_source().num();
2775
2776 const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
2777 const unsigned want_state = m->state & mask; // safety first
2778
2779 unsigned cur_state = osdmap.get_state(from);
2780 auto p = pending_inc.new_state.find(from);
2781 if (p != pending_inc.new_state.end()) {
2782 cur_state ^= p->second;
2783 }
2784 cur_state &= mask;
2785
2786 set<string> want_state_set, cur_state_set;
2787 OSDMap::calc_state_set(want_state, want_state_set);
2788 OSDMap::calc_state_set(cur_state, cur_state_set);
2789
2790 if (cur_state != want_state) {
2791 if (p != pending_inc.new_state.end()) {
2792 p->second &= ~mask;
2793 } else {
2794 pending_inc.new_state[from] = 0;
2795 }
2796 pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
2797 dout(7) << __func__ << " osd." << from << " " << cur_state_set
2798 << " -> " << want_state_set << dendl;
2799 } else {
2800 dout(7) << __func__ << " osd." << from << " " << cur_state_set
2801 << " = wanted " << want_state_set << ", just waiting" << dendl;
2802 }
2803
2804 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
2805 return true;
2806}
2807
2808// -------------
2809// alive
2810
2811bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
2812{
2813 op->mark_osdmon_event(__func__);
2814 MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
2815 int from = m->get_orig_source().num();
2816
2817 // check permissions, ignore if failed
2818 MonSession *session = m->get_session();
2819 if (!session)
2820 goto ignore;
2821 if (!session->is_capable("osd", MON_CAP_X)) {
2822 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
2823 << session->caps << dendl;
2824 goto ignore;
2825 }
2826
2827 if (!osdmap.is_up(from) ||
2828 osdmap.get_inst(from) != m->get_orig_source_inst()) {
2829 dout(7) << "preprocess_alive ignoring alive message from down " << m->get_orig_source_inst() << dendl;
2830 goto ignore;
2831 }
2832
2833 if (osdmap.get_up_thru(from) >= m->want) {
2834 // yup.
2835 dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
2836 _reply_map(op, m->version);
2837 return true;
2838 }
2839
2840 dout(10) << "preprocess_alive want up_thru " << m->want
2841 << " from " << m->get_orig_source_inst() << dendl;
2842 return false;
2843
2844 ignore:
2845 return true;
2846}
2847
2848bool OSDMonitor::prepare_alive(MonOpRequestRef op)
2849{
2850 op->mark_osdmon_event(__func__);
2851 MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
2852 int from = m->get_orig_source().num();
2853
2854 if (0) { // we probably don't care much about these
2855 mon->clog->debug() << m->get_orig_source_inst() << " alive";
2856 }
2857
2858 dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
2859 << " from " << m->get_orig_source_inst() << dendl;
2860
2861 update_up_thru(from, m->version); // set to the latest map the OSD has
2862 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
2863 return true;
2864}
2865
2866void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
2867{
2868 op->mark_osdmon_event(__func__);
2869 dout(7) << "_reply_map " << e
2870 << " from " << op->get_req()->get_orig_source_inst()
2871 << dendl;
2872 send_latest(op, e);
2873}
2874
2875// pg_created
2876bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
2877{
2878 op->mark_osdmon_event(__func__);
2879 auto m = static_cast<MOSDPGCreated*>(op->get_req());
2880 dout(10) << __func__ << " " << *m << dendl;
2881 auto session = m->get_session();
94b18763 2882 mon->no_reply(op);
7c673cae
FG
2883 if (!session) {
2884 dout(10) << __func__ << ": no monitor session!" << dendl;
2885 return true;
2886 }
2887 if (!session->is_capable("osd", MON_CAP_X)) {
2888 derr << __func__ << " received from entity "
2889 << "with insufficient privileges " << session->caps << dendl;
2890 return true;
2891 }
2892 // always forward the "created!" to the leader
2893 return false;
2894}
2895
2896bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
2897{
2898 op->mark_osdmon_event(__func__);
2899 auto m = static_cast<MOSDPGCreated*>(op->get_req());
2900 dout(10) << __func__ << " " << *m << dendl;
2901 auto src = m->get_orig_source();
2902 auto from = src.num();
2903 if (!src.is_osd() ||
2904 !mon->osdmon()->osdmap.is_up(from) ||
2905 m->get_orig_source_inst() != mon->osdmon()->osdmap.get_inst(from)) {
2906 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
2907 return false;
2908 }
2909 pending_created_pgs.push_back(m->pgid);
2910 return true;
2911}
2912
2913// -------------
2914// pg_temp changes
2915
2916bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
2917{
2918 MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
2919 dout(10) << "preprocess_pgtemp " << *m << dendl;
2920 mempool::osdmap::vector<int> empty;
2921 int from = m->get_orig_source().num();
2922 size_t ignore_cnt = 0;
2923
2924 // check caps
2925 MonSession *session = m->get_session();
2926 if (!session)
2927 goto ignore;
2928 if (!session->is_capable("osd", MON_CAP_X)) {
2929 dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
2930 << session->caps << dendl;
2931 goto ignore;
2932 }
2933
2934 if (!osdmap.is_up(from) ||
2935 osdmap.get_inst(from) != m->get_orig_source_inst()) {
2936 dout(7) << "ignoring pgtemp message from down " << m->get_orig_source_inst() << dendl;
2937 goto ignore;
2938 }
2939
3efd9988
FG
2940 if (m->forced) {
2941 return false;
2942 }
2943
7c673cae
FG
2944 for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
2945 dout(20) << " " << p->first
31f18b77 2946 << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
7c673cae
FG
2947 << " -> " << p->second << dendl;
2948
2949 // does the pool exist?
2950 if (!osdmap.have_pg_pool(p->first.pool())) {
2951 /*
2952 * 1. If the osdmap does not have the pool, it means the pool has been
2953 * removed in-between the osd sending this message and us handling it.
2954 * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
2955 * not exist in the pending either, as the osds would not send a
2956 * message about a pool they know nothing about (yet).
2957 * 3. However, if the pool does exist in the pending, then it must be a
2958 * new pool, and not relevant to this message (see 1).
2959 */
2960 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
2961 << ": pool has been removed" << dendl;
2962 ignore_cnt++;
2963 continue;
2964 }
2965
2966 int acting_primary = -1;
2967 osdmap.pg_to_up_acting_osds(
2968 p->first, nullptr, nullptr, nullptr, &acting_primary);
2969 if (acting_primary != from) {
2970 /* If the source isn't the primary based on the current osdmap, we know
2971 * that the interval changed and that we can discard this message.
2972 * Indeed, we must do so to avoid 16127 since we can't otherwise determine
2973 * which of two pg temp mappings on the same pg is more recent.
2974 */
2975 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
2976 << ": primary has changed" << dendl;
2977 ignore_cnt++;
2978 continue;
2979 }
2980
2981 // removal?
2982 if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
2983 osdmap.primary_temp->count(p->first)))
2984 return false;
2985 // change?
2986 // NOTE: we assume that this will clear pg_primary, so consider
2987 // an existing pg_primary field to imply a change
2988 if (p->second.size() &&
2989 (osdmap.pg_temp->count(p->first) == 0 ||
31f18b77 2990 !vectors_equal(osdmap.pg_temp->get(p->first), p->second) ||
7c673cae
FG
2991 osdmap.primary_temp->count(p->first)))
2992 return false;
2993 }
2994
2995 // should we ignore all the pgs?
2996 if (ignore_cnt == m->pg_temp.size())
2997 goto ignore;
2998
2999 dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
3000 _reply_map(op, m->map_epoch);
3001 return true;
3002
3003 ignore:
3004 return true;
3005}
3006
3007void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
3008{
3009 epoch_t old_up_thru = osdmap.get_up_thru(from);
3010 auto ut = pending_inc.new_up_thru.find(from);
3011 if (ut != pending_inc.new_up_thru.end()) {
3012 old_up_thru = ut->second;
3013 }
3014 if (up_thru > old_up_thru) {
3015 // set up_thru too, so the osd doesn't have to ask again
3016 pending_inc.new_up_thru[from] = up_thru;
3017 }
3018}
3019
3020bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
3021{
3022 op->mark_osdmon_event(__func__);
3023 MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
3024 int from = m->get_orig_source().num();
3025 dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
3026 for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
3027 uint64_t pool = p->first.pool();
3028 if (pending_inc.old_pools.count(pool)) {
3029 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
3030 << ": pool pending removal" << dendl;
3031 continue;
3032 }
3033 if (!osdmap.have_pg_pool(pool)) {
3034 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
3035 << ": pool has been removed" << dendl;
3036 continue;
3037 }
3038 pending_inc.new_pg_temp[p->first] =
3039 mempool::osdmap::vector<int>(p->second.begin(), p->second.end());
3040
3041 // unconditionally clear pg_primary (until this message can encode
3042 // a change for that, too.. at which point we need to also fix
3043 // preprocess_pg_temp)
3044 if (osdmap.primary_temp->count(p->first) ||
3045 pending_inc.new_primary_temp.count(p->first))
3046 pending_inc.new_primary_temp[p->first] = -1;
3047 }
3048
3049 // set up_thru too, so the osd doesn't have to ask again
3050 update_up_thru(from, m->map_epoch);
3051
3052 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
3053 return true;
3054}
3055
3056
3057// ---
3058
3059bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
3060{
3061 op->mark_osdmon_event(__func__);
3062 MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
3063 dout(7) << "preprocess_remove_snaps " << *m << dendl;
3064
3065 // check privilege, ignore if failed
3066 MonSession *session = m->get_session();
3067 if (!session)
3068 goto ignore;
3069 if (!session->caps.is_capable(
3070 g_ceph_context,
3071 CEPH_ENTITY_TYPE_MON,
3072 session->entity_name,
3073 "osd", "osd pool rmsnap", {}, true, true, false)) {
3074 dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
3075 << session->caps << dendl;
3076 goto ignore;
3077 }
3078
3079 for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
3080 q != m->snaps.end();
3081 ++q) {
3082 if (!osdmap.have_pg_pool(q->first)) {
3083 dout(10) << " ignoring removed_snaps " << q->second << " on non-existent pool " << q->first << dendl;
3084 continue;
3085 }
3086 const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
3087 for (vector<snapid_t>::iterator p = q->second.begin();
3088 p != q->second.end();
3089 ++p) {
3090 if (*p > pi->get_snap_seq() ||
3091 !pi->removed_snaps.contains(*p))
3092 return false;
3093 }
3094 }
3095
3096 ignore:
3097 return true;
3098}
3099
3100bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
3101{
3102 op->mark_osdmon_event(__func__);
3103 MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
3104 dout(7) << "prepare_remove_snaps " << *m << dendl;
3105
3106 for (map<int, vector<snapid_t> >::iterator p = m->snaps.begin();
3107 p != m->snaps.end();
3108 ++p) {
3109
3110 if (!osdmap.have_pg_pool(p->first)) {
3111 dout(10) << " ignoring removed_snaps " << p->second << " on non-existent pool " << p->first << dendl;
3112 continue;
3113 }
3114
3115 pg_pool_t& pi = osdmap.pools[p->first];
3116 for (vector<snapid_t>::iterator q = p->second.begin();
3117 q != p->second.end();
3118 ++q) {
3119 if (!pi.removed_snaps.contains(*q) &&
3120 (!pending_inc.new_pools.count(p->first) ||
3121 !pending_inc.new_pools[p->first].removed_snaps.contains(*q))) {
3122 pg_pool_t *newpi = pending_inc.get_new_pool(p->first, &pi);
3123 newpi->removed_snaps.insert(*q);
3124 dout(10) << " pool " << p->first << " removed_snaps added " << *q
3125 << " (now " << newpi->removed_snaps << ")" << dendl;
3126 if (*q > newpi->get_snap_seq()) {
3127 dout(10) << " pool " << p->first << " snap_seq " << newpi->get_snap_seq() << " -> " << *q << dendl;
3128 newpi->set_snap_seq(*q);
3129 }
3130 newpi->set_snap_epoch(pending_inc.epoch);
3131 }
3132 }
3133 }
3134 return true;
3135}
3136
3137// osd beacon
3138bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
3139{
3140 op->mark_osdmon_event(__func__);
3141 auto beacon = static_cast<MOSDBeacon*>(op->get_req());
3142 // check caps
3143 auto session = beacon->get_session();
94b18763 3144 mon->no_reply(op);
7c673cae
FG
3145 if (!session) {
3146 dout(10) << __func__ << " no monitor session!" << dendl;
3147 return true;
3148 }
3149 if (!session->is_capable("osd", MON_CAP_X)) {
3150 derr << __func__ << " received from entity "
3151 << "with insufficient privileges " << session->caps << dendl;
3152 return true;
3153 }
3154 // Always forward the beacon to the leader, even if they are the same as
3155 // the old one. The leader will mark as down osds that haven't sent
3156 // beacon for a few minutes.
3157 return false;
3158}
3159
3160bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
3161{
3162 op->mark_osdmon_event(__func__);
3163 const auto beacon = static_cast<MOSDBeacon*>(op->get_req());
3164 const auto src = beacon->get_orig_source();
3165 dout(10) << __func__ << " " << *beacon
3166 << " from " << src << dendl;
3167 int from = src.num();
3168
3169 if (!src.is_osd() ||
3170 !osdmap.is_up(from) ||
3171 beacon->get_orig_source_inst() != osdmap.get_inst(from)) {
3172 dout(1) << " ignoring beacon from non-active osd." << dendl;
3173 return false;
3174 }
3175
3176 last_osd_report[from] = ceph_clock_now();
3177 osd_epochs[from] = beacon->version;
3178
3179 for (const auto& pg : beacon->pgs) {
3180 last_epoch_clean.report(pg, beacon->min_last_epoch_clean);
3181 }
3182 return false;
3183}
3184
3185// ---------------
3186// map helpers
3187
3188void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
3189{
3190 op->mark_osdmon_event(__func__);
3191 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
3192 << " start " << start << dendl;
3193 if (start == 0)
3194 send_full(op);
3195 else
3196 send_incremental(op, start);
3197}
3198
3199
28e407b8 3200MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
7c673cae 3201{
28e407b8
AA
3202 MOSDMap *r = new MOSDMap(mon->monmap->fsid, features);
3203 get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
7c673cae
FG
3204 r->oldest_map = get_first_committed();
3205 r->newest_map = osdmap.get_epoch();
3206 return r;
3207}
3208
28e407b8 3209MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
7c673cae 3210{
28e407b8
AA
3211 dout(10) << "build_incremental [" << from << ".." << to << "] with features " << std::hex << features << dendl;
3212 MOSDMap *m = new MOSDMap(mon->monmap->fsid, features);
7c673cae
FG
3213 m->oldest_map = get_first_committed();
3214 m->newest_map = osdmap.get_epoch();
3215
3216 for (epoch_t e = to; e >= from && e > 0; e--) {
3217 bufferlist bl;
28e407b8 3218 int err = get_version(e, features, bl);
7c673cae
FG
3219 if (err == 0) {
3220 assert(bl.length());
3221 // if (get_version(e, bl) > 0) {
3222 dout(20) << "build_incremental inc " << e << " "
3223 << bl.length() << " bytes" << dendl;
3224 m->incremental_maps[e] = bl;
3225 } else {
3226 assert(err == -ENOENT);
3227 assert(!bl.length());
28e407b8 3228 get_version_full(e, features, bl);
7c673cae
FG
3229 if (bl.length() > 0) {
3230 //else if (get_version("full", e, bl) > 0) {
3231 dout(20) << "build_incremental full " << e << " "
3232 << bl.length() << " bytes" << dendl;
3233 m->maps[e] = bl;
3234 } else {
3235 ceph_abort(); // we should have all maps.
3236 }
3237 }
3238 }
3239 return m;
3240}
3241
3242void OSDMonitor::send_full(MonOpRequestRef op)
3243{
3244 op->mark_osdmon_event(__func__);
3245 dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
28e407b8 3246 mon->send_reply(op, build_latest_full(op->get_session()->con_features));
7c673cae
FG
3247}
3248
3249void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
3250{
3251 op->mark_osdmon_event(__func__);
3252
3253 MonSession *s = op->get_session();
3254 assert(s);
3255
3256 if (s->proxy_con &&
3257 s->proxy_con->has_feature(CEPH_FEATURE_MON_ROUTE_OSDMAP)) {
3258 // oh, we can tell the other mon to do it
3259 dout(10) << __func__ << " asking proxying mon to send_incremental from "
3260 << first << dendl;
3261 MRoute *r = new MRoute(s->proxy_tid, NULL);
3262 r->send_osdmap_first = first;
3263 s->proxy_con->send_message(r);
3264 op->mark_event("reply: send routed send_osdmap_first reply");
3265 } else {
3266 // do it ourselves
3267 send_incremental(first, s, false, op);
3268 }
3269}
3270
3271void OSDMonitor::send_incremental(epoch_t first,
3272 MonSession *session,
3273 bool onetime,
3274 MonOpRequestRef req)
3275{
3276 dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
3277 << " to " << session->inst << dendl;
3278
28e407b8
AA
3279 // get feature of the peer
3280 // use quorum_con_features, if it's an anonymous connection.
3281 uint64_t features = session->con_features ? session->con_features :
3282 mon->get_quorum_con_features();
3283
7c673cae 3284 if (first <= session->osd_epoch) {
31f18b77 3285 dout(10) << __func__ << " " << session->inst << " should already have epoch "
7c673cae
FG
3286 << session->osd_epoch << dendl;
3287 first = session->osd_epoch + 1;
3288 }
3289
3290 if (first < get_first_committed()) {
3291 first = get_first_committed();
3292 bufferlist bl;
28e407b8 3293 int err = get_version_full(first, features, bl);
7c673cae
FG
3294 assert(err == 0);
3295 assert(bl.length());
3296
3297 dout(20) << "send_incremental starting with base full "
3298 << first << " " << bl.length() << " bytes" << dendl;
3299
28e407b8 3300 MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
7c673cae
FG
3301 m->oldest_map = get_first_committed();
3302 m->newest_map = osdmap.get_epoch();
3303 m->maps[first] = bl;
3304
3305 if (req) {
3306 mon->send_reply(req, m);
3307 session->osd_epoch = first;
3308 return;
3309 } else {
3310 session->con->send_message(m);
3311 session->osd_epoch = first;
3312 }
3313 first++;
3314 }
3315
3316 while (first <= osdmap.get_epoch()) {
28e407b8
AA
3317 epoch_t last = std::min<epoch_t>(first + g_conf->osd_map_message_max - 1,
3318 osdmap.get_epoch());
3319 MOSDMap *m = build_incremental(first, last, features);
7c673cae
FG
3320
3321 if (req) {
3322 // send some maps. it may not be all of them, but it will get them
3323 // started.
3324 mon->send_reply(req, m);
3325 } else {
3326 session->con->send_message(m);
3327 first = last + 1;
3328 }
3329 session->osd_epoch = last;
3330 if (onetime || req)
3331 break;
3332 }
3333}
3334
3335int OSDMonitor::get_version(version_t ver, bufferlist& bl)
3336{
28e407b8
AA
3337 return get_version(ver, mon->get_quorum_con_features(), bl);
3338}
3339
3340void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
3341{
3342 OSDMap::Incremental inc;
3343 bufferlist::iterator q = bl.begin();
3344 inc.decode(q);
3345 // always encode with subset of osdmap's canonical features
3346 uint64_t f = features & inc.encode_features;
3347 dout(20) << __func__ << " " << inc.epoch << " with features " << f
3348 << dendl;
3349 bl.clear();
3350 if (inc.fullmap.length()) {
3351 // embedded full map?
3352 OSDMap m;
3353 m.decode(inc.fullmap);
3354 inc.fullmap.clear();
3355 m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
3356 }
3357 if (inc.crush.length()) {
3358 // embedded crush map
3359 CrushWrapper c;
3360 auto p = inc.crush.begin();
3361 c.decode(p);
3362 inc.crush.clear();
3363 c.encode(inc.crush, f);
3364 }
3365 inc.encode(bl, f | CEPH_FEATURE_RESERVED);
3366}
3367
3368void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
3369{
3370 OSDMap m;
3371 bufferlist::iterator q = bl.begin();
3372 m.decode(q);
3373 // always encode with subset of osdmap's canonical features
3374 uint64_t f = features & m.get_encoding_features();
3375 dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
3376 << dendl;
3377 bl.clear();
3378 m.encode(bl, f | CEPH_FEATURE_RESERVED);
3379}
3380
3381int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
3382{
3383 uint64_t significant_features = OSDMap::get_significant_features(features);
3384 if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
3385 return 0;
3386 }
3387 int ret = PaxosService::get_version(ver, bl);
3388 if (ret < 0) {
7c673cae 3389 return ret;
28e407b8
AA
3390 }
3391 // NOTE: this check is imprecise; the OSDMap encoding features may
3392 // be a subset of the latest mon quorum features, but worst case we
3393 // reencode once and then cache the (identical) result under both
3394 // feature masks.
3395 if (significant_features !=
3396 OSDMap::get_significant_features(mon->get_quorum_con_features())) {
3397 reencode_incremental_map(bl, features);
3398 }
3399 inc_osd_cache.add({ver, significant_features}, bl);
3400 return 0;
7c673cae
FG
3401}
3402
3403int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
3404{
28e407b8
AA
3405 return get_version_full(ver, mon->get_quorum_con_features(), bl);
3406}
3407
3408int OSDMonitor::get_version_full(version_t ver, uint64_t features,
3409 bufferlist& bl)
3410{
3411 uint64_t significant_features = OSDMap::get_significant_features(features);
3412 if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
3413 return 0;
3414 }
3415 int ret = PaxosService::get_version_full(ver, bl);
3416 if (ret < 0) {
7c673cae 3417 return ret;
28e407b8
AA
3418 }
3419 // NOTE: this check is imprecise; the OSDMap encoding features may
3420 // be a subset of the latest mon quorum features, but worst case we
3421 // reencode once and then cache the (identical) result under both
3422 // feature masks.
3423 if (significant_features !=
3424 OSDMap::get_significant_features(mon->get_quorum_con_features())) {
3425 reencode_full_map(bl, features);
3426 }
3427 full_osd_cache.add({ver, significant_features}, bl);
3428 return 0;
7c673cae
FG
3429}
3430
3431epoch_t OSDMonitor::blacklist(const entity_addr_t& a, utime_t until)
3432{
3433 dout(10) << "blacklist " << a << " until " << until << dendl;
3434 pending_inc.new_blacklist[a] = until;
3435 return pending_inc.epoch;
3436}
3437
3438
3439void OSDMonitor::check_osdmap_subs()
3440{
3441 dout(10) << __func__ << dendl;
3442 if (!osdmap.get_epoch()) {
3443 return;
3444 }
3445 auto osdmap_subs = mon->session_map.subs.find("osdmap");
3446 if (osdmap_subs == mon->session_map.subs.end()) {
3447 return;
3448 }
3449 auto p = osdmap_subs->second->begin();
3450 while (!p.end()) {
3451 auto sub = *p;
3452 ++p;
3453 check_osdmap_sub(sub);
3454 }
3455}
3456
3457void OSDMonitor::check_osdmap_sub(Subscription *sub)
3458{
3459 dout(10) << __func__ << " " << sub << " next " << sub->next
3460 << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
3461 if (sub->next <= osdmap.get_epoch()) {
3462 if (sub->next >= 1)
3463 send_incremental(sub->next, sub->session, sub->incremental_onetime);
3464 else
28e407b8 3465 sub->session->con->send_message(build_latest_full(sub->session->con_features));
7c673cae
FG
3466 if (sub->onetime)
3467 mon->session_map.remove_sub(sub);
3468 else
3469 sub->next = osdmap.get_epoch() + 1;
3470 }
3471}
3472
3473void OSDMonitor::check_pg_creates_subs()
3474{
3475 if (!mon->monmap->get_required_features().contains_all(
3476 ceph::features::mon::FEATURE_LUMINOUS)) {
3477 // PGMonitor takes care of this in pre-luminous era.
3478 return;
3479 }
3480 if (!osdmap.get_num_up_osds()) {
3481 return;
3482 }
3483 assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
3484 mon->with_session_map([this](const MonSessionMap& session_map) {
3485 auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
3486 if (pg_creates_subs == session_map.subs.end()) {
3487 return;
3488 }
3489 for (auto sub : *pg_creates_subs->second) {
3490 check_pg_creates_sub(sub);
3491 }
3492 });
3493}
3494
3495void OSDMonitor::check_pg_creates_sub(Subscription *sub)
3496{
3497 dout(20) << __func__ << " .. " << sub->session->inst << dendl;
3498 assert(sub->type == "osd_pg_creates");
3499 // only send these if the OSD is up. we will check_subs() when they do
3500 // come up so they will get the creates then.
3501 if (sub->session->inst.name.is_osd() &&
3502 mon->osdmon()->osdmap.is_up(sub->session->inst.name.num())) {
3503 sub->next = send_pg_creates(sub->session->inst.name.num(),
3504 sub->session->con.get(),
3505 sub->next);
3506 }
3507}
3508
c07f9fc5
FG
3509void OSDMonitor::do_application_enable(int64_t pool_id,
3510 const std::string &app_name)
3511{
35e4c445 3512 assert(paxos->is_plugged() && is_writeable());
c07f9fc5
FG
3513
3514 dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
3515 << dendl;
3516
35e4c445
FG
3517 assert(osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS ||
3518 pending_inc.new_require_osd_release >= CEPH_RELEASE_LUMINOUS);
3519
c07f9fc5
FG
3520 auto pp = osdmap.get_pg_pool(pool_id);
3521 assert(pp != nullptr);
3522
3523 pg_pool_t p = *pp;
3524 if (pending_inc.new_pools.count(pool_id)) {
3525 p = pending_inc.new_pools[pool_id];
3526 }
3527
3528 p.application_metadata.insert({app_name, {}});
3529 p.last_change = pending_inc.epoch;
3530 pending_inc.new_pools[pool_id] = p;
3531}
3532
31f18b77 3533unsigned OSDMonitor::scan_for_creating_pgs(
7c673cae
FG
3534 const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
3535 const mempool::osdmap::set<int64_t>& removed_pools,
3536 utime_t modified,
3537 creating_pgs_t* creating_pgs) const
3538{
31f18b77 3539 unsigned queued = 0;
7c673cae
FG
3540 for (auto& p : pools) {
3541 int64_t poolid = p.first;
3542 const pg_pool_t& pool = p.second;
31f18b77 3543 int ruleno = osdmap.crush->find_rule(pool.get_crush_rule(),
7c673cae
FG
3544 pool.get_type(), pool.get_size());
3545 if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
3546 continue;
3547
3548 const auto last_scan_epoch = creating_pgs->last_scan_epoch;
3549 const auto created = pool.get_last_change();
3550 if (last_scan_epoch && created <= last_scan_epoch) {
3551 dout(10) << __func__ << " no change in pool " << poolid
3552 << " " << pool << dendl;
3553 continue;
3554 }
3555 if (removed_pools.count(poolid)) {
3556 dout(10) << __func__ << " pool is being removed: " << poolid
3557 << " " << pool << dendl;
3558 continue;
3559 }
31f18b77 3560 dout(10) << __func__ << " queueing pool create for " << poolid
7c673cae 3561 << " " << pool << dendl;
31f18b77
FG
3562 if (creating_pgs->create_pool(poolid, pool.get_pg_num(),
3563 created, modified)) {
3564 queued++;
7c673cae
FG
3565 }
3566 }
31f18b77 3567 return queued;
7c673cae
FG
3568}
3569
3570void OSDMonitor::update_creating_pgs()
3571{
31f18b77
FG
3572 dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
3573 << creating_pgs.queue.size() << " pools in queue" << dendl;
7c673cae
FG
3574 decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
3575 std::lock_guard<std::mutex> l(creating_pgs_lock);
c07f9fc5 3576 for (const auto& pg : creating_pgs.pgs) {
7c673cae
FG
3577 int acting_primary = -1;
3578 auto pgid = pg.first;
94b18763
FG
3579 if (!osdmap.pg_exists(pgid)) {
3580 dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
3581 << dendl;
3582 continue;
3583 }
7c673cae 3584 auto mapped = pg.second.first;
c07f9fc5 3585 dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
7c673cae
FG
3586 mapping.get(pgid, nullptr, nullptr, nullptr, &acting_primary);
3587 // check the previous creating_pgs, look for the target to whom the pg was
3588 // previously mapped
3589 for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
3590 const auto last_acting_primary = pgs_by_epoch.first;
3591 for (auto& pgs: pgs_by_epoch.second) {
3592 if (pgs.second.count(pgid)) {
3593 if (last_acting_primary == acting_primary) {
3594 mapped = pgs.first;
3595 } else {
3596 dout(20) << __func__ << " " << pgid << " "
3597 << " acting_primary:" << last_acting_primary
3598 << " -> " << acting_primary << dendl;
3599 // note epoch if the target of the create message changed.
3600 mapped = mapping.get_epoch();
3601 }
3602 break;
31f18b77
FG
3603 } else {
3604 // newly creating
3605 mapped = mapping.get_epoch();
3606 }
7c673cae
FG
3607 }
3608 }
3609 dout(10) << __func__ << " will instruct osd." << acting_primary
c07f9fc5 3610 << " to create " << pgid << "@" << mapped << dendl;
7c673cae
FG
3611 new_pgs_by_osd_epoch[acting_primary][mapped].insert(pgid);
3612 }
3613 creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
3614 creating_pgs_epoch = mapping.get_epoch();
3615}
3616
c07f9fc5 3617epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
7c673cae
FG
3618{
3619 dout(30) << __func__ << " osd." << osd << " next=" << next
3620 << " " << creating_pgs_by_osd_epoch << dendl;
3621 std::lock_guard<std::mutex> l(creating_pgs_lock);
b5b8bbf5
FG
3622 if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
3623 dout(20) << __func__
3624 << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
3625 // the subscribers will be updated when the mapping is completed anyway
3626 return next;
3627 }
7c673cae
FG
3628 auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
3629 if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
3630 return next;
3631 assert(!creating_pgs_by_epoch->second.empty());
3632
3633 MOSDPGCreate *m = nullptr;
3634 epoch_t last = 0;
3635 for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
3636 epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
3637 auto epoch = epoch_pgs->first;
3638 auto& pgs = epoch_pgs->second;
3639 dout(20) << __func__ << " osd." << osd << " from " << next
3640 << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
3641 last = epoch;
3642 for (auto& pg : pgs) {
3643 if (!m)
3644 m = new MOSDPGCreate(creating_pgs_epoch);
3645 // Need the create time from the monitor using its clock to set
3646 // last_scrub_stamp upon pg creation.
c07f9fc5
FG
3647 auto create = creating_pgs.pgs.find(pg);
3648 assert(create != creating_pgs.pgs.end());
3649 m->mkpg.emplace(pg, pg_create_t{create->second.first, pg, 0});
3650 m->ctimes.emplace(pg, create->second.second);
7c673cae 3651 dout(20) << __func__ << " will create " << pg
c07f9fc5 3652 << " at " << create->second.first << dendl;
7c673cae
FG
3653 }
3654 }
3655 if (!m) {
3656 dout(20) << __func__ << " osd." << osd << " from " << next
3657 << " has nothing to send" << dendl;
3658 return next;
3659 }
3660 con->send_message(m);
3661 // sub is current through last + 1
3662 return last + 1;
3663}
3664
3665// TICK
3666
3667
3668void OSDMonitor::tick()
3669{
3670 if (!is_active()) return;
3671
3672 dout(10) << osdmap << dendl;
3673
3674 if (!mon->is_leader()) return;
3675
3676 bool do_propose = false;
3677 utime_t now = ceph_clock_now();
3678
31f18b77 3679 if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
7c673cae
FG
3680 mon->monmap->get_required_features().contains_all(
3681 ceph::features::mon::FEATURE_LUMINOUS)) {
3682 if (handle_osd_timeouts(now, last_osd_report)) {
3683 do_propose = true;
3684 }
3685 }
181888fb
FG
3686 if (!osdmap.test_flag(CEPH_OSDMAP_PURGED_SNAPDIRS) &&
3687 osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
3688 mon->mgrstatmon()->is_readable() &&
3689 mon->mgrstatmon()->definitely_converted_snapsets()) {
3690 dout(1) << __func__ << " all snapsets converted, setting purged_snapdirs"
3691 << dendl;
3692 add_flag(CEPH_OSDMAP_PURGED_SNAPDIRS);
3693 do_propose = true;
3694 }
7c673cae
FG
3695
3696 // mark osds down?
3697 if (check_failures(now))
3698 do_propose = true;
3699
3700 // mark down osds out?
3701
3702 /* can_mark_out() checks if we can mark osds as being out. The -1 has no
3703 * influence at all. The decision is made based on the ratio of "in" osds,
3704 * and the function returns false if this ratio is lower that the minimum
3705 * ratio set by g_conf->mon_osd_min_in_ratio. So it's not really up to us.
3706 */
3707 if (can_mark_out(-1)) {
3708 set<int> down_cache; // quick cache of down subtrees
3709
3710 map<int,utime_t>::iterator i = down_pending_out.begin();
3711 while (i != down_pending_out.end()) {
3712 int o = i->first;
3713 utime_t down = now;
3714 down -= i->second;
3715 ++i;
3716
3717 if (osdmap.is_down(o) &&
3718 osdmap.is_in(o) &&
3719 can_mark_out(o)) {
3720 utime_t orig_grace(g_conf->mon_osd_down_out_interval, 0);
3721 utime_t grace = orig_grace;
3722 double my_grace = 0.0;
3723
3724 if (g_conf->mon_osd_adjust_down_out_interval) {
3725 // scale grace period the same way we do the heartbeat grace.
3726 const osd_xinfo_t& xi = osdmap.get_xinfo(o);
3727 double halflife = (double)g_conf->mon_osd_laggy_halflife;
3728 double decay_k = ::log(.5) / halflife;
3729 double decay = exp((double)down * decay_k);
3730 dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
3731 << " down for " << down << " decay " << decay << dendl;
3732 my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
3733 grace += my_grace;
3734 }
3735
3736 // is this an entire large subtree down?
3737 if (g_conf->mon_osd_down_out_subtree_limit.length()) {
3738 int type = osdmap.crush->get_type_id(g_conf->mon_osd_down_out_subtree_limit);
3739 if (type > 0) {
3740 if (osdmap.containing_subtree_is_down(g_ceph_context, o, type, &down_cache)) {
3741 dout(10) << "tick entire containing " << g_conf->mon_osd_down_out_subtree_limit
3742 << " subtree for osd." << o << " is down; resetting timer" << dendl;
3743 // reset timer, too.
3744 down_pending_out[o] = now;
3745 continue;
3746 }
3747 }
3748 }
3749
c07f9fc5
FG
3750 bool down_out = !osdmap.is_destroyed(o) &&
3751 g_conf->mon_osd_down_out_interval > 0 && down.sec() >= grace;
3752 bool destroyed_out = osdmap.is_destroyed(o) &&
3753 g_conf->mon_osd_destroyed_out_interval > 0 &&
3754 // this is not precise enough as we did not make a note when this osd
3755 // was marked as destroyed, but let's not bother with that
3756 // complexity for now.
3757 down.sec() >= g_conf->mon_osd_destroyed_out_interval;
3758 if (down_out || destroyed_out) {
7c673cae
FG
3759 dout(10) << "tick marking osd." << o << " OUT after " << down
3760 << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
3761 pending_inc.new_weight[o] = CEPH_OSD_OUT;
3762
3763 // set the AUTOOUT bit.
3764 if (pending_inc.new_state.count(o) == 0)
3765 pending_inc.new_state[o] = 0;
3766 pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;
3767
3768 // remember previous weight
3769 if (pending_inc.new_xinfo.count(o) == 0)
3770 pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
3771 pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];
3772
3773 do_propose = true;
3774
224ce89b
WB
3775 mon->clog->info() << "Marking osd." << o << " out (has been down for "
3776 << int(down.sec()) << " seconds)";
7c673cae
FG
3777 } else
3778 continue;
3779 }
3780
3781 down_pending_out.erase(o);
3782 }
3783 } else {
3784 dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
3785 }
3786
3787 // expire blacklisted items?
3788 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
3789 p != osdmap.blacklist.end();
3790 ++p) {
3791 if (p->second < now) {
3792 dout(10) << "expiring blacklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
3793 pending_inc.old_blacklist.push_back(p->first);
3794 do_propose = true;
3795 }
3796 }
3797
3798 // if map full setting has changed, get that info out there!
31f18b77
FG
3799 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS &&
3800 mon->pgservice->is_readable()) {
7c673cae 3801 // for pre-luminous compat only!
31f18b77 3802 if (mon->pgservice->have_full_osds()) {
7c673cae
FG
3803 dout(5) << "There are full osds, setting full flag" << dendl;
3804 add_flag(CEPH_OSDMAP_FULL);
3805 } else if (osdmap.test_flag(CEPH_OSDMAP_FULL)){
3806 dout(10) << "No full osds, removing full flag" << dendl;
3807 remove_flag(CEPH_OSDMAP_FULL);
3808 }
3809
31f18b77 3810 if (mon->pgservice->have_nearfull_osds()) {
7c673cae
FG
3811 dout(5) << "There are near full osds, setting nearfull flag" << dendl;
3812 add_flag(CEPH_OSDMAP_NEARFULL);
3813 } else if (osdmap.test_flag(CEPH_OSDMAP_NEARFULL)){
3814 dout(10) << "No near full osds, removing nearfull flag" << dendl;
3815 remove_flag(CEPH_OSDMAP_NEARFULL);
3816 }
3817 if (pending_inc.new_flags != -1 &&
3818 (pending_inc.new_flags ^ osdmap.flags) & (CEPH_OSDMAP_FULL | CEPH_OSDMAP_NEARFULL)) {
3819 dout(1) << "New setting for" <<
3820 (pending_inc.new_flags & CEPH_OSDMAP_FULL ? " CEPH_OSDMAP_FULL" : "") <<
3821 (pending_inc.new_flags & CEPH_OSDMAP_NEARFULL ? " CEPH_OSDMAP_NEARFULL" : "")
3822 << " -- doing propose" << dendl;
3823 do_propose = true;
3824 }
3825 }
3826
3827 if (update_pools_status())
3828 do_propose = true;
3829
3830 if (do_propose ||
3831 !pending_inc.new_pg_temp.empty()) // also propose if we adjusted pg_temp
3832 propose_pending();
3833}
3834
3835bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
3836 std::map<int,utime_t> &last_osd_report)
3837{
3838 utime_t timeo(g_conf->mon_osd_report_timeout, 0);
3839 if (now - mon->get_leader_since() < timeo) {
3840 // We haven't been the leader for long enough to consider OSD timeouts
3841 return false;
3842 }
3843
3844 int max_osd = osdmap.get_max_osd();
3845 bool new_down = false;
3846
3847 for (int i=0; i < max_osd; ++i) {
3848 dout(30) << __func__ << ": checking up on osd " << i << dendl;
c07f9fc5
FG
3849 if (!osdmap.exists(i)) {
3850 last_osd_report.erase(i); // if any
3851 continue;
3852 }
7c673cae
FG
3853 if (!osdmap.is_up(i))
3854 continue;
3855 const std::map<int,utime_t>::const_iterator t = last_osd_report.find(i);
3856 if (t == last_osd_report.end()) {
3857 // it wasn't in the map; start the timer.
3858 last_osd_report[i] = now;
3859 } else if (can_mark_down(i)) {
3860 utime_t diff = now - t->second;
3861 if (diff > timeo) {
31f18b77
FG
3862 mon->clog->info() << "osd." << i << " marked down after no beacon for "
3863 << diff << " seconds";
3864 derr << "no beacon from osd." << i << " since " << t->second
3865 << ", " << diff << " seconds ago. marking down" << dendl;
7c673cae
FG
3866 pending_inc.new_state[i] = CEPH_OSD_UP;
3867 new_down = true;
3868 }
3869 }
3870 }
3871 return new_down;
3872}
3873
// Collect osdmap-related health items into 'summary' and, when non-null,
// 'detail'.  Checks performed, in order:
//   - no osds at all (HEALTH_ERR)
//   - down-but-in osds, aggregated per down CRUSH subtree
//   - osds present in the crush map but absent from the osdmap
//   - warning-worthy osdmap flags (noout, pause, full, ...)
//   - legacy crush tunables / straw_calc_version=0
//   - cache pools missing a hit_set configuration
//   - 'sortbitwise' unset although every up osd supports it
//   - mon_osd_down_out_interval set to 0 on this monitor
//   - require_osd_release lagging behind what all up osds support
//   - pools flagged FULL
void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
			    list<pair<health_status_t,string> > *detail,
			    CephContext *cct) const
{
  int num_osds = osdmap.get_num_osds();

  if (num_osds == 0) {
    summary.push_back(make_pair(HEALTH_ERR, "no osds"));
  } else {
    int num_in_osds = 0;
    int num_down_in_osds = 0;
    set<int> osds;              // osds in crush but not in the osdmap
    set<int> down_in_osds;      // osds that are in and down
    set<int> up_in_osds;        // osds known up (cache for subtree scan)
    set<int> subtree_up;        // buckets already known to contain an up osd
    // per crush type id -> set of bucket ids that are entirely down
    unordered_map<int, set<int> > subtree_type_down;
    // bucket id -> number of osds under that (down) bucket
    unordered_map<int, int> num_osds_subtree;
    int max_type = osdmap.crush->get_max_type_id();

    // Pass 1: classify each osd and, for each down+in osd, walk up its
    // crush ancestry marking buckets whose entire subtree is down.
    for (int i = 0; i < osdmap.get_max_osd(); i++) {
      if (!osdmap.exists(i)) {
        if (osdmap.crush->item_exists(i)) {
          osds.insert(i);
        }
	continue;
      }
      if (osdmap.is_out(i))
        continue;
      ++num_in_osds;
      if (down_in_osds.count(i) || up_in_osds.count(i))
	continue;
      if (!osdmap.is_up(i)) {
	down_in_osds.insert(i);
	int parent_id = 0;
	int current = i;
	// Climb toward the crush root one bucket at a time.  Note that
	// 'type' is reassigned inside the loop to the type of the bucket
	// just reached, so the loop bound only caps the climb depth.
	for (int type = 0; type <= max_type; type++) {
	  if (!osdmap.crush->get_type_name(type))
	    continue;
	  int r = osdmap.crush->get_immediate_parent_id(current, &parent_id);
	  if (r == -ENOENT)
	    break;
	  // break early if this parent is already marked as up
	  if (subtree_up.count(parent_id))
	    break;
	  type = osdmap.crush->get_bucket_type(parent_id);
	  if (!osdmap.subtree_type_is_down(
		g_ceph_context, parent_id, type,
		&down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
	    break;
	  current = parent_id;
	}
      }
    }

    // calculate the number of down osds in each down subtree and
    // store it in num_osds_subtree
    // (processed bottom-up: type 1 buckets count their direct children,
    //  higher types sum their children's already-computed counts;
    //  presumably type 1 buckets hold osds directly -- confirm against
    //  the crush type hierarchy)
    for (int type = 1; type <= max_type; type++) {
      if (!osdmap.crush->get_type_name(type))
	continue;
      for (auto j = subtree_type_down[type].begin();
	   j != subtree_type_down[type].end();
	   ++j) {
	if (type == 1) {
          list<int> children;
          int num = osdmap.crush->get_children(*j, &children);
          num_osds_subtree[*j] = num;
        } else {
          list<int> children;
          int num = 0;
          int num_children = osdmap.crush->get_children(*j, &children);
          if (num_children == 0)
	    continue;
          for (auto l = children.begin(); l != children.end(); ++l) {
            if (num_osds_subtree[*l] > 0) {
              num = num + num_osds_subtree[*l];
            }
          }
          num_osds_subtree[*j] = num;
	}
      }
    }
    num_down_in_osds = down_in_osds.size();
    assert(num_down_in_osds <= num_in_osds);
    if (num_down_in_osds > 0) {
      // summary of down subtree types and osds
      for (int type = max_type; type > 0; type--) {
	if (!osdmap.crush->get_type_name(type))
	  continue;
	if (subtree_type_down[type].size() > 0) {
	  ostringstream ss;
	  ss << subtree_type_down[type].size() << " "
	     << osdmap.crush->get_type_name(type);
	  if (subtree_type_down[type].size() > 1) {
	    ss << "s";
	  }
	  int sum_down_osds = 0;
	  for (auto j = subtree_type_down[type].begin();
	       j != subtree_type_down[type].end();
	       ++j) {
	    sum_down_osds = sum_down_osds + num_osds_subtree[*j];
	  }
          ss << " (" << sum_down_osds << " osds) down";
	  summary.push_back(make_pair(HEALTH_WARN, ss.str()));
	}
      }
      ostringstream ss;
      ss << down_in_osds.size() << " osds down";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));

      if (detail) {
	// details of down subtree types
	for (int type = max_type; type > 0; type--) {
	  if (!osdmap.crush->get_type_name(type))
	    continue;
	  for (auto j = subtree_type_down[type].rbegin();
	       j != subtree_type_down[type].rend();
	       ++j) {
	    ostringstream ss;
	    ss << osdmap.crush->get_type_name(type);
	    ss << " ";
	    ss << osdmap.crush->get_item_name(*j);
	    // at the top level, do not print location
	    if (type != max_type) {
	      ss << " (";
	      ss << osdmap.crush->get_full_location_ordered_string(*j);
	      ss << ")";
	    }
	    int num = num_osds_subtree[*j];
	    ss << " (" << num << " osds)";
	    ss << " is down";
	    detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	  }
	}
	// details of down osds
	for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
	  ostringstream ss;
	  ss << "osd." << *it << " (";
	  ss << osdmap.crush->get_full_location_ordered_string(*it);
	  ss << ") is down";
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	}
      }
    }

    // stray crush entries with no matching osd in the osdmap
    if (!osds.empty()) {
      ostringstream ss;
      ss << osds.size() << " osds exist in the crush map but not in the osdmap";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      if (detail) {
	ss << " (osds: " << osds << ")";
	detail->push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }

    // note: we leave it to ceph-mgr to generate details health warnings
    // with actual osd utilizations

    // warn about flags
    uint64_t warn_flags =
      CEPH_OSDMAP_FULL |
      CEPH_OSDMAP_PAUSERD |
      CEPH_OSDMAP_PAUSEWR |
      CEPH_OSDMAP_PAUSEREC |
      CEPH_OSDMAP_NOUP |
      CEPH_OSDMAP_NODOWN |
      CEPH_OSDMAP_NOIN |
      CEPH_OSDMAP_NOOUT |
      CEPH_OSDMAP_NOBACKFILL |
      CEPH_OSDMAP_NORECOVER |
      CEPH_OSDMAP_NOSCRUB |
      CEPH_OSDMAP_NODEEP_SCRUB |
      CEPH_OSDMAP_NOTIERAGENT |
      CEPH_OSDMAP_NOREBALANCE;
    if (osdmap.test_flag(warn_flags)) {
      ostringstream ss;
      ss << osdmap.get_flag_string(osdmap.get_flags() & warn_flags)
	 << " flag(s) set";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      if (detail)
	detail->push_back(make_pair(HEALTH_WARN, ss.str()));
    }

    // old crush tunables?
    if (g_conf->mon_warn_on_legacy_crush_tunables) {
      string min = osdmap.crush->get_min_required_version();
      if (min < g_conf->mon_crush_min_required_version) {
	ostringstream ss;
	ss << "crush map has legacy tunables (require " << min
	   << ", min is " << g_conf->mon_crush_min_required_version << ")";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
	if (detail) {
	  ss << "; see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables";
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	}
      }
    }
    if (g_conf->mon_warn_on_crush_straw_calc_version_zero) {
      if (osdmap.crush->get_straw_calc_version() == 0) {
	ostringstream ss;
	ss << "crush map has straw_calc_version=0";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
	if (detail) {
	  ss << "; see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables";
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	}
      }
    }

    // hit_set-less cache_mode?
    if (g_conf->mon_warn_on_cache_pools_without_hit_sets) {
      int problem_cache_pools = 0;
      for (map<int64_t, pg_pool_t>::const_iterator p = osdmap.pools.begin();
	   p != osdmap.pools.end();
	   ++p) {
	const pg_pool_t& info = p->second;
	if (info.cache_mode_requires_hit_set() &&
	    info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
	  ++problem_cache_pools;
	  if (detail) {
	    ostringstream ss;
	    ss << "pool '" << osdmap.get_pool_name(p->first)
	       << "' with cache_mode " << info.get_cache_mode_name()
	       << " needs hit_set_type to be set but it is not";
	    detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	  }
	}
      }
      if (problem_cache_pools) {
	ostringstream ss;
	ss << problem_cache_pools << " cache pools are missing hit_sets";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }

    // Not using 'sortbitwise' and should be?
    if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE) &&
	(osdmap.get_up_osd_features() &
	 CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)) {
      ostringstream ss;
      ss << "no legacy OSD present but 'sortbitwise' flag is not set";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
    }

    // Warn if 'mon_osd_down_out_interval' is set to zero.
    // Having this option set to zero on the leader acts much like the
    // 'noout' flag.  It's hard to figure out what's going wrong with clusters
    // without the 'noout' flag set but acting like that just the same, so
    // we report a HEALTH_WARN in case this option is set to zero.
    // This is an ugly hack to get the warning out, but until we find a way
    // to spread global options throughout the mon cluster and have all mons
    // using a base set of the same options, we need to work around this sort
    // of things.
    // There's also the obvious drawback that if this is set on a single
    // monitor on a 3-monitor cluster, this warning will only be shown every
    // third monitor connection.
    if (g_conf->mon_warn_on_osd_down_out_interval_zero &&
	g_conf->mon_osd_down_out_interval == 0) {
      ostringstream ss;
      ss << "mon." << mon->name << " has mon_osd_down_out_interval set to 0";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      if (detail) {
	ss << "; this has the same effect as the 'noout' flag";
	detail->push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }

    // warn about upgrade flags that can be set but are not.
    // Only the newest applicable warning is emitted (else-if chain).
    if (g_conf->mon_debug_no_require_luminous) {
      // ignore these checks
    } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_LUMINOUS) &&
	       osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
      string msg = "all OSDs are running luminous or later but"
	" require_osd_release < luminous";
      summary.push_back(make_pair(HEALTH_WARN, msg));
      if (detail) {
	detail->push_back(make_pair(HEALTH_WARN, msg));
      }
    } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_KRAKEN) &&
	       osdmap.require_osd_release < CEPH_RELEASE_KRAKEN) {
      string msg = "all OSDs are running kraken or later but"
	" require_osd_release < kraken";
      summary.push_back(make_pair(HEALTH_WARN, msg));
      if (detail) {
	detail->push_back(make_pair(HEALTH_WARN, msg));
      }
    } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_JEWEL) &&
	       osdmap.require_osd_release < CEPH_RELEASE_JEWEL) {
      string msg = "all OSDs are running jewel or later but"
	" require_osd_release < jewel";
      summary.push_back(make_pair(HEALTH_WARN, msg));
      if (detail) {
	detail->push_back(make_pair(HEALTH_WARN, msg));
      }
    }

    // full pools
    for (auto it : osdmap.get_pools()) {
      const pg_pool_t &pool = it.second;
      if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
	const string& pool_name = osdmap.get_pool_name(it.first);
	stringstream ss;
	ss << "pool '" << pool_name << "' is full";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
	if (detail)
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }
  }
}
4182
4183void OSDMonitor::dump_info(Formatter *f)
4184{
4185 f->open_object_section("osdmap");
4186 osdmap.dump(f);
4187 f->close_section();
4188
4189 f->open_array_section("osd_metadata");
4190 for (int i=0; i<osdmap.get_max_osd(); ++i) {
4191 if (osdmap.exists(i)) {
4192 f->open_object_section("osd");
4193 f->dump_unsigned("id", i);
4194 dump_osd_metadata(i, f, NULL);
4195 f->close_section();
4196 }
4197 }
4198 f->close_section();
4199
4200 f->dump_unsigned("osdmap_first_committed", get_first_committed());
4201 f->dump_unsigned("osdmap_last_committed", get_last_committed());
4202
4203 f->open_object_section("crushmap");
4204 osdmap.crush->dump(f);
4205 f->close_section();
4206}
4207
namespace {
  // Every pool property understood by "osd pool get"; used to map the
  // requested variable name(s) onto the fields to report.
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE, CRASH_REPLAY_INTERVAL,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, AUID, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK };

  // Return the set difference first \ second, i.e. every choice in
  // 'first' that does not also appear in 'second'.
  std::set<osd_pool_get_choices>
    subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			       const std::set<osd_pool_get_choices>& second)
  {
    std::set<osd_pool_get_choices> remaining;
    for (const auto& choice : first) {
      if (second.count(choice) == 0) {
	remaining.insert(choice);
      }
    }
    return remaining;
  }
}
4239
4240
4241bool OSDMonitor::preprocess_command(MonOpRequestRef op)
4242{
4243 op->mark_osdmon_event(__func__);
4244 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
4245 int r = 0;
4246 bufferlist rdata;
4247 stringstream ss, ds;
4248
4249 map<string, cmd_vartype> cmdmap;
4250 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
4251 string rs = ss.str();
4252 mon->reply_command(op, -EINVAL, rs, get_last_committed());
4253 return true;
4254 }
4255
4256 MonSession *session = m->get_session();
4257 if (!session) {
4258 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
4259 return true;
4260 }
4261
4262 string prefix;
4263 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
4264
4265 string format;
4266 cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
4267 boost::scoped_ptr<Formatter> f(Formatter::create(format));
4268
4269 if (prefix == "osd stat") {
224ce89b 4270 osdmap.print_summary(f.get(), ds, "");
7c673cae
FG
4271 if (f)
4272 f->flush(rdata);
4273 else
4274 rdata.append(ds);
4275 }
4276 else if (prefix == "osd perf" ||
4277 prefix == "osd blocked-by") {
31f18b77
FG
4278 r = mon->pgservice->process_pg_command(prefix, cmdmap,
4279 osdmap, f.get(), &ss, &rdata);
7c673cae
FG
4280 }
4281 else if (prefix == "osd dump" ||
4282 prefix == "osd tree" ||
4283 prefix == "osd ls" ||
4284 prefix == "osd getmap" ||
31f18b77
FG
4285 prefix == "osd getcrushmap" ||
4286 prefix == "osd ls-tree") {
7c673cae
FG
4287 string val;
4288
4289 epoch_t epoch = 0;
4290 int64_t epochnum;
4291 cmd_getval(g_ceph_context, cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
4292 epoch = epochnum;
4293
4294 bufferlist osdmap_bl;
4295 int err = get_version_full(epoch, osdmap_bl);
4296 if (err == -ENOENT) {
4297 r = -ENOENT;
4298 ss << "there is no map for epoch " << epoch;
4299 goto reply;
4300 }
4301 assert(err == 0);
4302 assert(osdmap_bl.length());
4303
4304 OSDMap *p;
4305 if (epoch == osdmap.get_epoch()) {
4306 p = &osdmap;
4307 } else {
4308 p = new OSDMap;
4309 p->decode(osdmap_bl);
4310 }
4311
224ce89b
WB
4312 auto sg = make_scope_guard([&] {
4313 if (p != &osdmap) {
4314 delete p;
4315 }
4316 });
4317
7c673cae
FG
4318 if (prefix == "osd dump") {
4319 stringstream ds;
4320 if (f) {
4321 f->open_object_section("osdmap");
4322 p->dump(f.get());
4323 f->close_section();
4324 f->flush(ds);
4325 } else {
4326 p->print(ds);
4327 }
4328 rdata.append(ds);
4329 if (!f)
4330 ds << " ";
4331 } else if (prefix == "osd ls") {
4332 if (f) {
4333 f->open_array_section("osds");
4334 for (int i = 0; i < osdmap.get_max_osd(); i++) {
4335 if (osdmap.exists(i)) {
4336 f->dump_int("osd", i);
4337 }
4338 }
4339 f->close_section();
4340 f->flush(ds);
4341 } else {
4342 bool first = true;
4343 for (int i = 0; i < osdmap.get_max_osd(); i++) {
4344 if (osdmap.exists(i)) {
4345 if (!first)
4346 ds << "\n";
4347 first = false;
4348 ds << i;
4349 }
4350 }
4351 }
4352 rdata.append(ds);
4353 } else if (prefix == "osd tree") {
31f18b77
FG
4354 vector<string> states;
4355 cmd_getval(g_ceph_context, cmdmap, "states", states);
4356 unsigned filter = 0;
4357 for (auto& s : states) {
4358 if (s == "up") {
4359 filter |= OSDMap::DUMP_UP;
4360 } else if (s == "down") {
4361 filter |= OSDMap::DUMP_DOWN;
4362 } else if (s == "in") {
4363 filter |= OSDMap::DUMP_IN;
4364 } else if (s == "out") {
4365 filter |= OSDMap::DUMP_OUT;
c07f9fc5
FG
4366 } else if (s == "destroyed") {
4367 filter |= OSDMap::DUMP_DESTROYED;
31f18b77
FG
4368 } else {
4369 ss << "unrecognized state '" << s << "'";
4370 r = -EINVAL;
4371 goto reply;
4372 }
4373 }
4374 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
c07f9fc5
FG
4375 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
4376 ss << "cannot specify both 'in' and 'out'";
4377 r = -EINVAL;
4378 goto reply;
4379 }
4380 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
4381 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
4382 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
4383 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
4384 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
4385 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
4386 ss << "can specify only one of 'up', 'down' and 'destroyed'";
31f18b77
FG
4387 r = -EINVAL;
4388 goto reply;
4389 }
7c673cae
FG
4390 if (f) {
4391 f->open_object_section("tree");
31f18b77 4392 p->print_tree(f.get(), NULL, filter);
7c673cae
FG
4393 f->close_section();
4394 f->flush(ds);
4395 } else {
31f18b77 4396 p->print_tree(NULL, &ds, filter);
7c673cae
FG
4397 }
4398 rdata.append(ds);
4399 } else if (prefix == "osd getmap") {
4400 rdata.append(osdmap_bl);
4401 ss << "got osdmap epoch " << p->get_epoch();
4402 } else if (prefix == "osd getcrushmap") {
4403 p->crush->encode(rdata, mon->get_quorum_con_features());
31f18b77
FG
4404 ss << p->get_crush_version();
4405 } else if (prefix == "osd ls-tree") {
4406 string bucket_name;
4407 cmd_getval(g_ceph_context, cmdmap, "name", bucket_name);
4408 set<int> osds;
4409 r = p->get_osds_by_bucket_name(bucket_name, &osds);
4410 if (r == -ENOENT) {
4411 ss << "\"" << bucket_name << "\" does not exist";
4412 goto reply;
4413 } else if (r < 0) {
4414 ss << "can not parse bucket name:\"" << bucket_name << "\"";
4415 goto reply;
4416 }
4417
4418 if (f) {
4419 f->open_array_section("osds");
4420 for (auto &i : osds) {
4421 if (osdmap.exists(i)) {
4422 f->dump_int("osd", i);
4423 }
4424 }
4425 f->close_section();
4426 f->flush(ds);
4427 } else {
4428 bool first = true;
4429 for (auto &i : osds) {
4430 if (osdmap.exists(i)) {
4431 if (!first)
4432 ds << "\n";
4433 first = false;
4434 ds << i;
4435 }
4436 }
4437 }
4438
4439 rdata.append(ds);
7c673cae 4440 }
7c673cae
FG
4441 } else if (prefix == "osd df") {
4442 string method;
4443 cmd_getval(g_ceph_context, cmdmap, "output_method", method);
31f18b77
FG
4444 print_osd_utilization(osdmap, mon->pgservice, ds,
4445 f.get(), method == "tree");
7c673cae
FG
4446 rdata.append(ds);
4447 } else if (prefix == "osd getmaxosd") {
4448 if (f) {
4449 f->open_object_section("getmaxosd");
4450 f->dump_unsigned("epoch", osdmap.get_epoch());
4451 f->dump_int("max_osd", osdmap.get_max_osd());
4452 f->close_section();
4453 f->flush(rdata);
4454 } else {
4455 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
4456 rdata.append(ds);
4457 }
4458 } else if (prefix == "osd utilization") {
4459 string out;
4460 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
4461 if (f)
4462 f->flush(rdata);
4463 else
4464 rdata.append(out);
4465 r = 0;
4466 goto reply;
4467 } else if (prefix == "osd find") {
4468 int64_t osd;
4469 if (!cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
4470 ss << "unable to parse osd id value '"
4471 << cmd_vartype_stringify(cmdmap["id"]) << "'";
4472 r = -EINVAL;
4473 goto reply;
4474 }
4475 if (!osdmap.exists(osd)) {
4476 ss << "osd." << osd << " does not exist";
4477 r = -ENOENT;
4478 goto reply;
4479 }
4480 string format;
4481 cmd_getval(g_ceph_context, cmdmap, "format", format);
4482 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4483 f->open_object_section("osd_location");
4484 f->dump_int("osd", osd);
4485 f->dump_stream("ip") << osdmap.get_addr(osd);
4486 f->open_object_section("crush_location");
4487 map<string,string> loc = osdmap.crush->get_full_location(osd);
4488 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
4489 f->dump_string(p->first.c_str(), p->second);
4490 f->close_section();
4491 f->close_section();
4492 f->flush(rdata);
4493 } else if (prefix == "osd metadata") {
4494 int64_t osd = -1;
4495 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
4496 !cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
4497 ss << "unable to parse osd id value '"
4498 << cmd_vartype_stringify(cmdmap["id"]) << "'";
4499 r = -EINVAL;
4500 goto reply;
4501 }
4502 if (osd >= 0 && !osdmap.exists(osd)) {
4503 ss << "osd." << osd << " does not exist";
4504 r = -ENOENT;
4505 goto reply;
4506 }
4507 string format;
4508 cmd_getval(g_ceph_context, cmdmap, "format", format);
4509 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4510 if (osd >= 0) {
4511 f->open_object_section("osd_metadata");
4512 f->dump_unsigned("id", osd);
4513 r = dump_osd_metadata(osd, f.get(), &ss);
4514 if (r < 0)
4515 goto reply;
4516 f->close_section();
4517 } else {
4518 r = 0;
4519 f->open_array_section("osd_metadata");
4520 for (int i=0; i<osdmap.get_max_osd(); ++i) {
4521 if (osdmap.exists(i)) {
4522 f->open_object_section("osd");
4523 f->dump_unsigned("id", i);
4524 r = dump_osd_metadata(i, f.get(), NULL);
4525 if (r == -EINVAL || r == -ENOENT) {
4526 // Drop error, continue to get other daemons' metadata
4527 dout(4) << "No metadata for osd." << i << dendl;
4528 r = 0;
4529 } else if (r < 0) {
4530 // Unexpected error
4531 goto reply;
4532 }
4533 f->close_section();
4534 }
4535 }
4536 f->close_section();
4537 }
4538 f->flush(rdata);
31f18b77
FG
4539 } else if (prefix == "osd versions") {
4540 if (!f)
4541 f.reset(Formatter::create("json-pretty"));
4542 count_metadata("ceph_version", f.get());
4543 f->flush(rdata);
4544 r = 0;
4545 } else if (prefix == "osd count-metadata") {
4546 if (!f)
4547 f.reset(Formatter::create("json-pretty"));
4548 string field;
4549 cmd_getval(g_ceph_context, cmdmap, "property", field);
4550 count_metadata(field, f.get());
4551 f->flush(rdata);
4552 r = 0;
7c673cae
FG
4553 } else if (prefix == "osd map") {
4554 string poolstr, objstr, namespacestr;
4555 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
4556 cmd_getval(g_ceph_context, cmdmap, "object", objstr);
4557 cmd_getval(g_ceph_context, cmdmap, "nspace", namespacestr);
4558
4559 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
4560 if (pool < 0) {
4561 ss << "pool " << poolstr << " does not exist";
4562 r = -ENOENT;
4563 goto reply;
4564 }
4565 object_locator_t oloc(pool, namespacestr);
4566 object_t oid(objstr);
4567 pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
4568 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
4569 vector<int> up, acting;
4570 int up_p, acting_p;
4571 osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
4572
4573 string fullobjname;
4574 if (!namespacestr.empty())
4575 fullobjname = namespacestr + string("/") + oid.name;
4576 else
4577 fullobjname = oid.name;
4578 if (f) {
4579 f->open_object_section("osd_map");
4580 f->dump_unsigned("epoch", osdmap.get_epoch());
4581 f->dump_string("pool", poolstr);
4582 f->dump_int("pool_id", pool);
4583 f->dump_stream("objname") << fullobjname;
4584 f->dump_stream("raw_pgid") << pgid;
4585 f->dump_stream("pgid") << mpgid;
4586 f->open_array_section("up");
4587 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
4588 f->dump_int("osd", *p);
4589 f->close_section();
4590 f->dump_int("up_primary", up_p);
4591 f->open_array_section("acting");
4592 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
4593 f->dump_int("osd", *p);
4594 f->close_section();
4595 f->dump_int("acting_primary", acting_p);
4596 f->close_section(); // osd_map
4597 f->flush(rdata);
4598 } else {
4599 ds << "osdmap e" << osdmap.get_epoch()
4600 << " pool '" << poolstr << "' (" << pool << ")"
4601 << " object '" << fullobjname << "' ->"
4602 << " pg " << pgid << " (" << mpgid << ")"
4603 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
4604 << pg_vector_string(acting) << ", p" << acting_p << ")";
4605 rdata.append(ds);
4606 }
4607
4608 } else if (prefix == "pg map") {
4609 pg_t pgid;
4610 string pgidstr;
4611 cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr);
4612 if (!pgid.parse(pgidstr.c_str())) {
4613 ss << "invalid pgid '" << pgidstr << "'";
4614 r = -EINVAL;
4615 goto reply;
4616 }
4617 vector<int> up, acting;
4618 if (!osdmap.have_pg_pool(pgid.pool())) {
4619 ss << "pg '" << pgidstr << "' does not exist";
4620 r = -ENOENT;
4621 goto reply;
4622 }
4623 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
4624 osdmap.pg_to_up_acting_osds(pgid, up, acting);
4625 if (f) {
4626 f->open_object_section("pg_map");
4627 f->dump_unsigned("epoch", osdmap.get_epoch());
4628 f->dump_stream("raw_pgid") << pgid;
4629 f->dump_stream("pgid") << mpgid;
4630 f->open_array_section("up");
4631 for (auto osd : up) {
4632 f->dump_int("up_osd", osd);
4633 }
4634 f->close_section();
4635 f->open_array_section("acting");
4636 for (auto osd : acting) {
4637 f->dump_int("acting_osd", osd);
4638 }
4639 f->close_section();
4640 f->close_section();
4641 f->flush(rdata);
4642 } else {
4643 ds << "osdmap e" << osdmap.get_epoch()
4644 << " pg " << pgid << " (" << mpgid << ")"
4645 << " -> up " << up << " acting " << acting;
4646 rdata.append(ds);
4647 }
4648 goto reply;
4649
224ce89b
WB
4650 } else if (prefix == "osd scrub" ||
4651 prefix == "osd deep-scrub" ||
4652 prefix == "osd repair") {
7c673cae
FG
4653 string whostr;
4654 cmd_getval(g_ceph_context, cmdmap, "who", whostr);
4655 vector<string> pvec;
4656 get_str_vec(prefix, pvec);
4657
224ce89b 4658 if (whostr == "*" || whostr == "all" || whostr == "any") {
7c673cae
FG
4659 ss << "osds ";
4660 int c = 0;
4661 for (int i = 0; i < osdmap.get_max_osd(); i++)
4662 if (osdmap.is_up(i)) {
4663 ss << (c++ ? "," : "") << i;
4664 mon->try_send_message(new MOSDScrub(osdmap.get_fsid(),
4665 pvec.back() == "repair",
4666 pvec.back() == "deep-scrub"),
4667 osdmap.get_inst(i));
4668 }
4669 r = 0;
4670 ss << " instructed to " << pvec.back();
4671 } else {
4672 long osd = parse_osd_id(whostr.c_str(), &ss);
4673 if (osd < 0) {
4674 r = -EINVAL;
4675 } else if (osdmap.is_up(osd)) {
4676 mon->try_send_message(new MOSDScrub(osdmap.get_fsid(),
4677 pvec.back() == "repair",
4678 pvec.back() == "deep-scrub"),
4679 osdmap.get_inst(osd));
4680 ss << "osd." << osd << " instructed to " << pvec.back();
4681 } else {
4682 ss << "osd." << osd << " is not up";
4683 r = -EAGAIN;
4684 }
4685 }
4686 } else if (prefix == "osd lspools") {
4687 int64_t auid;
4688 cmd_getval(g_ceph_context, cmdmap, "auid", auid, int64_t(0));
4689 if (f)
4690 f->open_array_section("pools");
4691 for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
4692 p != osdmap.pools.end();
4693 ++p) {
4694 if (!auid || p->second.auid == (uint64_t)auid) {
4695 if (f) {
4696 f->open_object_section("pool");
4697 f->dump_int("poolnum", p->first);
4698 f->dump_string("poolname", osdmap.pool_name[p->first]);
4699 f->close_section();
4700 } else {
4701 ds << p->first << ' ' << osdmap.pool_name[p->first] << ',';
4702 }
4703 }
4704 }
4705 if (f) {
4706 f->close_section();
4707 f->flush(ds);
4708 }
4709 rdata.append(ds);
4710 } else if (prefix == "osd blacklist ls") {
4711 if (f)
4712 f->open_array_section("blacklist");
4713
4714 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
4715 p != osdmap.blacklist.end();
4716 ++p) {
4717 if (f) {
4718 f->open_object_section("entry");
4719 f->dump_stream("addr") << p->first;
4720 f->dump_stream("until") << p->second;
4721 f->close_section();
4722 } else {
4723 stringstream ss;
4724 string s;
4725 ss << p->first << " " << p->second;
4726 getline(ss, s);
4727 s += "\n";
4728 rdata.append(s);
4729 }
4730 }
4731 if (f) {
4732 f->close_section();
4733 f->flush(rdata);
4734 }
4735 ss << "listed " << osdmap.blacklist.size() << " entries";
4736
4737 } else if (prefix == "osd pool ls") {
4738 string detail;
4739 cmd_getval(g_ceph_context, cmdmap, "detail", detail);
4740 if (!f && detail == "detail") {
4741 ostringstream ss;
4742 osdmap.print_pools(ss);
4743 rdata.append(ss.str());
4744 } else {
4745 if (f)
4746 f->open_array_section("pools");
4747 for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
4748 it != osdmap.get_pools().end();
4749 ++it) {
4750 if (f) {
4751 if (detail == "detail") {
4752 f->open_object_section("pool");
4753 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
4754 it->second.dump(f.get());
4755 f->close_section();
4756 } else {
4757 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
4758 }
4759 } else {
4760 rdata.append(osdmap.get_pool_name(it->first) + "\n");
4761 }
4762 }
4763 if (f) {
4764 f->close_section();
4765 f->flush(rdata);
4766 }
4767 }
4768
4769 } else if (prefix == "osd crush get-tunable") {
4770 string tunable;
4771 cmd_getval(g_ceph_context, cmdmap, "tunable", tunable);
4772 ostringstream rss;
4773 if (f)
4774 f->open_object_section("tunable");
4775 if (tunable == "straw_calc_version") {
4776 if (f)
4777 f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
4778 else
4779 rss << osdmap.crush->get_straw_calc_version() << "\n";
4780 } else {
4781 r = -EINVAL;
4782 goto reply;
4783 }
4784 if (f) {
4785 f->close_section();
4786 f->flush(rdata);
4787 } else {
4788 rdata.append(rss.str());
4789 }
4790 r = 0;
4791
4792 } else if (prefix == "osd pool get") {
4793 string poolstr;
4794 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
4795 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
4796 if (pool < 0) {
4797 ss << "unrecognized pool '" << poolstr << "'";
4798 r = -ENOENT;
4799 goto reply;
4800 }
4801
4802 const pg_pool_t *p = osdmap.get_pg_pool(pool);
4803 string var;
4804 cmd_getval(g_ceph_context, cmdmap, "var", var);
4805
4806 typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
4807 const choices_map_t ALL_CHOICES = {
4808 {"size", SIZE},
4809 {"min_size", MIN_SIZE},
4810 {"crash_replay_interval", CRASH_REPLAY_INTERVAL},
4811 {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
28e407b8
AA
4812 {"crush_rule", CRUSH_RULE}, {"hashpspool", HASHPSPOOL},
4813 {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
7c673cae
FG
4814 {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
4815 {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
4816 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
4817 {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
4818 {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
4819 {"use_gmt_hitset", USE_GMT_HITSET},
4820 {"auid", AUID}, {"target_max_objects", TARGET_MAX_OBJECTS},
4821 {"target_max_bytes", TARGET_MAX_BYTES},
4822 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
4823 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
4824 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
4825 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
4826 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
4827 {"erasure_code_profile", ERASURE_CODE_PROFILE},
4828 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
4829 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
4830 {"fast_read", FAST_READ},
4831 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
4832 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
4833 {"scrub_min_interval", SCRUB_MIN_INTERVAL},
4834 {"scrub_max_interval", SCRUB_MAX_INTERVAL},
4835 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
4836 {"recovery_priority", RECOVERY_PRIORITY},
4837 {"recovery_op_priority", RECOVERY_OP_PRIORITY},
4838 {"scrub_priority", SCRUB_PRIORITY},
4839 {"compression_mode", COMPRESSION_MODE},
4840 {"compression_algorithm", COMPRESSION_ALGORITHM},
4841 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
4842 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
4843 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
4844 {"csum_type", CSUM_TYPE},
4845 {"csum_max_block", CSUM_MAX_BLOCK},
4846 {"csum_min_block", CSUM_MIN_BLOCK},
4847 };
4848
4849 typedef std::set<osd_pool_get_choices> choices_set_t;
4850
4851 const choices_set_t ONLY_TIER_CHOICES = {
4852 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
4853 TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
4854 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
4855 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
4856 MIN_READ_RECENCY_FOR_PROMOTE,
c07f9fc5 4857 MIN_WRITE_RECENCY_FOR_PROMOTE,
7c673cae
FG
4858 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
4859 };
4860 const choices_set_t ONLY_ERASURE_CHOICES = {
28e407b8 4861 EC_OVERWRITES, ERASURE_CODE_PROFILE
7c673cae
FG
4862 };
4863
4864 choices_set_t selected_choices;
4865 if (var == "all") {
4866 for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
4867 it != ALL_CHOICES.end(); ++it) {
4868 selected_choices.insert(it->second);
4869 }
4870
4871 if(!p->is_tier()) {
4872 selected_choices = subtract_second_from_first(selected_choices,
4873 ONLY_TIER_CHOICES);
4874 }
4875
4876 if(!p->is_erasure()) {
4877 selected_choices = subtract_second_from_first(selected_choices,
4878 ONLY_ERASURE_CHOICES);
4879 }
4880 } else /* var != "all" */ {
4881 choices_map_t::const_iterator found = ALL_CHOICES.find(var);
4882 osd_pool_get_choices selected = found->second;
4883
4884 if (!p->is_tier() &&
4885 ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
4886 ss << "pool '" << poolstr
4887 << "' is not a tier pool: variable not applicable";
4888 r = -EACCES;
4889 goto reply;
4890 }
4891
4892 if (!p->is_erasure() &&
4893 ONLY_ERASURE_CHOICES.find(selected)
4894 != ONLY_ERASURE_CHOICES.end()) {
4895 ss << "pool '" << poolstr
4896 << "' is not a erasure pool: variable not applicable";
4897 r = -EACCES;
4898 goto reply;
4899 }
4900
94b18763
FG
4901 if (pool_opts_t::is_opt_name(var) &&
4902 !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
4903 ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
4904 r = -ENOENT;
4905 goto reply;
4906 }
4907
7c673cae
FG
4908 selected_choices.insert(selected);
4909 }
4910
4911 if (f) {
94b18763
FG
4912 f->open_object_section("pool");
4913 f->dump_string("pool", poolstr);
4914 f->dump_int("pool_id", pool);
7c673cae
FG
4915 for(choices_set_t::const_iterator it = selected_choices.begin();
4916 it != selected_choices.end(); ++it) {
4917 choices_map_t::const_iterator i;
c07f9fc5
FG
4918 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
4919 if (i->second == *it) {
4920 break;
4921 }
4922 }
4923 assert(i != ALL_CHOICES.end());
7c673cae
FG
4924 switch(*it) {
4925 case PG_NUM:
4926 f->dump_int("pg_num", p->get_pg_num());
4927 break;
4928 case PGP_NUM:
4929 f->dump_int("pgp_num", p->get_pgp_num());
4930 break;
4931 case AUID:
4932 f->dump_int("auid", p->get_auid());
4933 break;
4934 case SIZE:
4935 f->dump_int("size", p->get_size());
4936 break;
4937 case MIN_SIZE:
4938 f->dump_int("min_size", p->get_min_size());
4939 break;
4940 case CRASH_REPLAY_INTERVAL:
4941 f->dump_int("crash_replay_interval",
4942 p->get_crash_replay_interval());
4943 break;
4944 case CRUSH_RULE:
31f18b77 4945 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
7c673cae 4946 f->dump_string("crush_rule", osdmap.crush->get_rule_name(
31f18b77 4947 p->get_crush_rule()));
7c673cae 4948 } else {
31f18b77 4949 f->dump_string("crush_rule", stringify(p->get_crush_rule()));
7c673cae
FG
4950 }
4951 break;
28e407b8
AA
4952 case EC_OVERWRITES:
4953 f->dump_bool("allow_ec_overwrites",
4954 p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
4955 break;
7c673cae
FG
4956 case HASHPSPOOL:
4957 case NODELETE:
4958 case NOPGCHANGE:
4959 case NOSIZECHANGE:
4960 case WRITE_FADVISE_DONTNEED:
4961 case NOSCRUB:
4962 case NODEEP_SCRUB:
94b18763
FG
4963 f->dump_bool(i->first.c_str(),
4964 p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
7c673cae
FG
4965 break;
4966 case HIT_SET_PERIOD:
4967 f->dump_int("hit_set_period", p->hit_set_period);
4968 break;
4969 case HIT_SET_COUNT:
4970 f->dump_int("hit_set_count", p->hit_set_count);
4971 break;
4972 case HIT_SET_TYPE:
4973 f->dump_string("hit_set_type",
4974 HitSet::get_type_name(p->hit_set_params.get_type()));
4975 break;
4976 case HIT_SET_FPP:
4977 {
4978 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
4979 BloomHitSet::Params *bloomp =
4980 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
4981 f->dump_float("hit_set_fpp", bloomp->get_fpp());
4982 } else if(var != "all") {
4983 f->close_section();
4984 ss << "hit set is not of type Bloom; " <<
4985 "invalid to get a false positive rate!";
4986 r = -EINVAL;
4987 goto reply;
4988 }
4989 }
4990 break;
4991 case USE_GMT_HITSET:
4992 f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
4993 break;
4994 case TARGET_MAX_OBJECTS:
4995 f->dump_unsigned("target_max_objects", p->target_max_objects);
4996 break;
4997 case TARGET_MAX_BYTES:
4998 f->dump_unsigned("target_max_bytes", p->target_max_bytes);
4999 break;
5000 case CACHE_TARGET_DIRTY_RATIO:
5001 f->dump_unsigned("cache_target_dirty_ratio_micro",
5002 p->cache_target_dirty_ratio_micro);
5003 f->dump_float("cache_target_dirty_ratio",
5004 ((float)p->cache_target_dirty_ratio_micro/1000000));
5005 break;
5006 case CACHE_TARGET_DIRTY_HIGH_RATIO:
5007 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
5008 p->cache_target_dirty_high_ratio_micro);
5009 f->dump_float("cache_target_dirty_high_ratio",
5010 ((float)p->cache_target_dirty_high_ratio_micro/1000000));
5011 break;
5012 case CACHE_TARGET_FULL_RATIO:
5013 f->dump_unsigned("cache_target_full_ratio_micro",
5014 p->cache_target_full_ratio_micro);
5015 f->dump_float("cache_target_full_ratio",
5016 ((float)p->cache_target_full_ratio_micro/1000000));
5017 break;
5018 case CACHE_MIN_FLUSH_AGE:
5019 f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
5020 break;
5021 case CACHE_MIN_EVICT_AGE:
5022 f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
5023 break;
5024 case ERASURE_CODE_PROFILE:
5025 f->dump_string("erasure_code_profile", p->erasure_code_profile);
5026 break;
5027 case MIN_READ_RECENCY_FOR_PROMOTE:
5028 f->dump_int("min_read_recency_for_promote",
5029 p->min_read_recency_for_promote);
5030 break;
5031 case MIN_WRITE_RECENCY_FOR_PROMOTE:
5032 f->dump_int("min_write_recency_for_promote",
5033 p->min_write_recency_for_promote);
5034 break;
5035 case FAST_READ:
5036 f->dump_int("fast_read", p->fast_read);
5037 break;
5038 case HIT_SET_GRADE_DECAY_RATE:
5039 f->dump_int("hit_set_grade_decay_rate",
5040 p->hit_set_grade_decay_rate);
5041 break;
5042 case HIT_SET_SEARCH_LAST_N:
5043 f->dump_int("hit_set_search_last_n",
5044 p->hit_set_search_last_n);
5045 break;
5046 case SCRUB_MIN_INTERVAL:
5047 case SCRUB_MAX_INTERVAL:
5048 case DEEP_SCRUB_INTERVAL:
5049 case RECOVERY_PRIORITY:
5050 case RECOVERY_OP_PRIORITY:
5051 case SCRUB_PRIORITY:
5052 case COMPRESSION_MODE:
5053 case COMPRESSION_ALGORITHM:
5054 case COMPRESSION_REQUIRED_RATIO:
5055 case COMPRESSION_MAX_BLOB_SIZE:
5056 case COMPRESSION_MIN_BLOB_SIZE:
5057 case CSUM_TYPE:
5058 case CSUM_MAX_BLOCK:
5059 case CSUM_MIN_BLOCK:
c07f9fc5
FG
5060 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
5061 if (p->opts.is_set(key)) {
c07f9fc5
FG
5062 if(*it == CSUM_TYPE) {
5063 int val;
5064 p->opts.get(pool_opts_t::CSUM_TYPE, &val);
5065 f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
5066 } else {
5067 p->opts.dump(i->first, f.get());
5068 }
94b18763 5069 }
7c673cae
FG
5070 break;
5071 }
7c673cae 5072 }
94b18763
FG
5073 f->close_section();
5074 f->flush(rdata);
7c673cae
FG
5075 } else /* !f */ {
5076 for(choices_set_t::const_iterator it = selected_choices.begin();
5077 it != selected_choices.end(); ++it) {
5078 choices_map_t::const_iterator i;
5079 switch(*it) {
5080 case PG_NUM:
5081 ss << "pg_num: " << p->get_pg_num() << "\n";
5082 break;
5083 case PGP_NUM:
5084 ss << "pgp_num: " << p->get_pgp_num() << "\n";
5085 break;
5086 case AUID:
5087 ss << "auid: " << p->get_auid() << "\n";
5088 break;
5089 case SIZE:
5090 ss << "size: " << p->get_size() << "\n";
5091 break;
5092 case MIN_SIZE:
5093 ss << "min_size: " << p->get_min_size() << "\n";
5094 break;
5095 case CRASH_REPLAY_INTERVAL:
5096 ss << "crash_replay_interval: " <<
5097 p->get_crash_replay_interval() << "\n";
5098 break;
5099 case CRUSH_RULE:
31f18b77 5100 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
7c673cae 5101 ss << "crush_rule: " << osdmap.crush->get_rule_name(
31f18b77 5102 p->get_crush_rule()) << "\n";
7c673cae 5103 } else {
31f18b77 5104 ss << "crush_rule: " << p->get_crush_rule() << "\n";
7c673cae
FG
5105 }
5106 break;
7c673cae
FG
5107 case HIT_SET_PERIOD:
5108 ss << "hit_set_period: " << p->hit_set_period << "\n";
5109 break;
5110 case HIT_SET_COUNT:
5111 ss << "hit_set_count: " << p->hit_set_count << "\n";
5112 break;
5113 case HIT_SET_TYPE:
5114 ss << "hit_set_type: " <<
5115 HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
5116 break;
5117 case HIT_SET_FPP:
5118 {
5119 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
5120 BloomHitSet::Params *bloomp =
5121 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
5122 ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
5123 } else if(var != "all") {
5124 ss << "hit set is not of type Bloom; " <<
5125 "invalid to get a false positive rate!";
5126 r = -EINVAL;
5127 goto reply;
5128 }
5129 }
5130 break;
5131 case USE_GMT_HITSET:
5132 ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
5133 break;
5134 case TARGET_MAX_OBJECTS:
5135 ss << "target_max_objects: " << p->target_max_objects << "\n";
5136 break;
5137 case TARGET_MAX_BYTES:
5138 ss << "target_max_bytes: " << p->target_max_bytes << "\n";
5139 break;
5140 case CACHE_TARGET_DIRTY_RATIO:
5141 ss << "cache_target_dirty_ratio: "
5142 << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
5143 break;
5144 case CACHE_TARGET_DIRTY_HIGH_RATIO:
5145 ss << "cache_target_dirty_high_ratio: "
5146 << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
5147 break;
5148 case CACHE_TARGET_FULL_RATIO:
5149 ss << "cache_target_full_ratio: "
5150 << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
5151 break;
5152 case CACHE_MIN_FLUSH_AGE:
5153 ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
5154 break;
5155 case CACHE_MIN_EVICT_AGE:
5156 ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
5157 break;
5158 case ERASURE_CODE_PROFILE:
5159 ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
5160 break;
5161 case MIN_READ_RECENCY_FOR_PROMOTE:
5162 ss << "min_read_recency_for_promote: " <<
5163 p->min_read_recency_for_promote << "\n";
5164 break;
5165 case HIT_SET_GRADE_DECAY_RATE:
5166 ss << "hit_set_grade_decay_rate: " <<
5167 p->hit_set_grade_decay_rate << "\n";
5168 break;
5169 case HIT_SET_SEARCH_LAST_N:
5170 ss << "hit_set_search_last_n: " <<
5171 p->hit_set_search_last_n << "\n";
5172 break;
28e407b8
AA
5173 case EC_OVERWRITES:
5174 ss << "allow_ec_overwrites: " <<
5175 (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
5176 "\n";
5177 break;
7c673cae
FG
5178 case HASHPSPOOL:
5179 case NODELETE:
5180 case NOPGCHANGE:
5181 case NOSIZECHANGE:
5182 case WRITE_FADVISE_DONTNEED:
5183 case NOSCRUB:
5184 case NODEEP_SCRUB:
5185 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
5186 if (i->second == *it)
5187 break;
5188 }
5189 assert(i != ALL_CHOICES.end());
5190 ss << i->first << ": " <<
5191 (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
5192 "true" : "false") << "\n";
5193 break;
5194 case MIN_WRITE_RECENCY_FOR_PROMOTE:
5195 ss << "min_write_recency_for_promote: " <<
5196 p->min_write_recency_for_promote << "\n";
5197 break;
5198 case FAST_READ:
5199 ss << "fast_read: " << p->fast_read << "\n";
5200 break;
5201 case SCRUB_MIN_INTERVAL:
5202 case SCRUB_MAX_INTERVAL:
5203 case DEEP_SCRUB_INTERVAL:
5204 case RECOVERY_PRIORITY:
5205 case RECOVERY_OP_PRIORITY:
5206 case SCRUB_PRIORITY:
5207 case COMPRESSION_MODE:
5208 case COMPRESSION_ALGORITHM:
5209 case COMPRESSION_REQUIRED_RATIO:
5210 case COMPRESSION_MAX_BLOB_SIZE:
5211 case COMPRESSION_MIN_BLOB_SIZE:
5212 case CSUM_TYPE:
5213 case CSUM_MAX_BLOCK:
5214 case CSUM_MIN_BLOCK:
5215 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
5216 if (i->second == *it)
5217 break;
5218 }
5219 assert(i != ALL_CHOICES.end());
5220 {
5221 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
5222 if (p->opts.is_set(key)) {
5223 if(key == pool_opts_t::CSUM_TYPE) {
5224 int val;
5225 p->opts.get(key, &val);
5226 ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
5227 } else {
5228 ss << i->first << ": " << p->opts.get(key) << "\n";
5229 }
5230 }
5231 }
5232 break;
5233 }
5234 rdata.append(ss.str());
5235 ss.str("");
5236 }
5237 }
5238 r = 0;
5239 } else if (prefix == "osd pool stats") {
31f18b77
FG
5240 r = mon->pgservice->process_pg_command(prefix, cmdmap,
5241 osdmap, f.get(), &ss, &rdata);
7c673cae
FG
5242 } else if (prefix == "osd pool get-quota") {
5243 string pool_name;
5244 cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
5245
5246 int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
5247 if (poolid < 0) {
5248 assert(poolid == -ENOENT);
5249 ss << "unrecognized pool '" << pool_name << "'";
5250 r = -ENOENT;
5251 goto reply;
5252 }
5253 const pg_pool_t *p = osdmap.get_pg_pool(poolid);
5254
5255 if (f) {
5256 f->open_object_section("pool_quotas");
5257 f->dump_string("pool_name", pool_name);
5258 f->dump_unsigned("pool_id", poolid);
5259 f->dump_unsigned("quota_max_objects", p->quota_max_objects);
5260 f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
5261 f->close_section();
5262 f->flush(rdata);
5263 } else {
5264 stringstream rs;
5265 rs << "quotas for pool '" << pool_name << "':\n"
5266 << " max objects: ";
5267 if (p->quota_max_objects == 0)
5268 rs << "N/A";
5269 else
5270 rs << si_t(p->quota_max_objects) << " objects";
5271 rs << "\n"
5272 << " max bytes : ";
5273 if (p->quota_max_bytes == 0)
5274 rs << "N/A";
5275 else
5276 rs << si_t(p->quota_max_bytes) << "B";
5277 rdata.append(rs.str());
5278 }
5279 rdata.append("\n");
5280 r = 0;
5281 } else if (prefix == "osd crush rule list" ||
5282 prefix == "osd crush rule ls") {
c07f9fc5
FG
5283 if (f) {
5284 f->open_array_section("rules");
5285 osdmap.crush->list_rules(f.get());
5286 f->close_section();
5287 f->flush(rdata);
5288 } else {
5289 ostringstream ss;
5290 osdmap.crush->list_rules(&ss);
5291 rdata.append(ss.str());
5292 }
b5b8bbf5
FG
5293 } else if (prefix == "osd crush rule ls-by-class") {
5294 string class_name;
5295 cmd_getval(g_ceph_context, cmdmap, "class", class_name);
5296 if (class_name.empty()) {
5297 ss << "no class specified";
5298 r = -EINVAL;
5299 goto reply;
5300 }
5301 set<int> rules;
5302 r = osdmap.crush->get_rules_by_class(class_name, &rules);
5303 if (r < 0) {
5304 ss << "failed to get rules by class '" << class_name << "'";
5305 goto reply;
5306 }
5307 if (f) {
5308 f->open_array_section("rules");
5309 for (auto &rule: rules) {
5310 f->dump_string("name", osdmap.crush->get_rule_name(rule));
5311 }
5312 f->close_section();
5313 f->flush(rdata);
5314 } else {
5315 ostringstream rs;
5316 for (auto &rule: rules) {
5317 rs << osdmap.crush->get_rule_name(rule) << "\n";
5318 }
5319 rdata.append(rs.str());
5320 }
7c673cae
FG
5321 } else if (prefix == "osd crush rule dump") {
5322 string name;
5323 cmd_getval(g_ceph_context, cmdmap, "name", name);
5324 string format;
5325 cmd_getval(g_ceph_context, cmdmap, "format", format);
5326 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5327 if (name == "") {
5328 f->open_array_section("rules");
5329 osdmap.crush->dump_rules(f.get());
5330 f->close_section();
5331 } else {
5332 int ruleno = osdmap.crush->get_rule_id(name);
5333 if (ruleno < 0) {
31f18b77 5334 ss << "unknown crush rule '" << name << "'";
7c673cae
FG
5335 r = ruleno;
5336 goto reply;
5337 }
5338 osdmap.crush->dump_rule(ruleno, f.get());
5339 }
5340 ostringstream rs;
5341 f->flush(rs);
5342 rs << "\n";
5343 rdata.append(rs.str());
5344 } else if (prefix == "osd crush dump") {
5345 string format;
5346 cmd_getval(g_ceph_context, cmdmap, "format", format);
5347 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5348 f->open_object_section("crush_map");
5349 osdmap.crush->dump(f.get());
5350 f->close_section();
5351 ostringstream rs;
5352 f->flush(rs);
5353 rs << "\n";
5354 rdata.append(rs.str());
5355 } else if (prefix == "osd crush show-tunables") {
5356 string format;
5357 cmd_getval(g_ceph_context, cmdmap, "format", format);
5358 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5359 f->open_object_section("crush_map_tunables");
5360 osdmap.crush->dump_tunables(f.get());
5361 f->close_section();
5362 ostringstream rs;
5363 f->flush(rs);
5364 rs << "\n";
5365 rdata.append(rs.str());
5366 } else if (prefix == "osd crush tree") {
c07f9fc5
FG
5367 string shadow;
5368 cmd_getval(g_ceph_context, cmdmap, "shadow", shadow);
5369 bool show_shadow = shadow == "--show-shadow";
5370 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5371 if (f) {
5372 osdmap.crush->dump_tree(nullptr,
5373 f.get(),
5374 osdmap.get_pool_names(),
5375 show_shadow);
5376 f->flush(rdata);
5377 } else {
5378 ostringstream ss;
5379 osdmap.crush->dump_tree(&ss,
5380 nullptr,
5381 osdmap.get_pool_names(),
5382 show_shadow);
5383 rdata.append(ss.str());
5384 }
d2e6a577
FG
5385 } else if (prefix == "osd crush ls") {
5386 string name;
5387 if (!cmd_getval(g_ceph_context, cmdmap, "node", name)) {
5388 ss << "no node specified";
5389 r = -EINVAL;
5390 goto reply;
5391 }
5392 if (!osdmap.crush->name_exists(name)) {
5393 ss << "node '" << name << "' does not exist";
5394 r = -ENOENT;
5395 goto reply;
5396 }
5397 int id = osdmap.crush->get_item_id(name);
5398 list<int> result;
5399 if (id >= 0) {
5400 result.push_back(id);
5401 } else {
5402 int num = osdmap.crush->get_bucket_size(id);
5403 for (int i = 0; i < num; ++i) {
5404 result.push_back(osdmap.crush->get_bucket_item(id, i));
5405 }
5406 }
5407 if (f) {
5408 f->open_array_section("items");
5409 for (auto i : result) {
5410 f->dump_string("item", osdmap.crush->get_item_name(i));
5411 }
5412 f->close_section();
5413 f->flush(rdata);
5414 } else {
5415 ostringstream ss;
5416 for (auto i : result) {
5417 ss << osdmap.crush->get_item_name(i) << "\n";
5418 }
5419 rdata.append(ss.str());
5420 }
5421 r = 0;
7c673cae
FG
5422 } else if (prefix == "osd crush class ls") {
5423 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5424 f->open_array_section("crush_classes");
5425 for (auto i : osdmap.crush->class_name)
5426 f->dump_string("class", i.second);
5427 f->close_section();
5428 f->flush(rdata);
224ce89b
WB
5429 } else if (prefix == "osd crush class ls-osd") {
5430 string name;
5431 cmd_getval(g_ceph_context, cmdmap, "class", name);
224ce89b
WB
5432 set<int> osds;
5433 osdmap.crush->get_devices_by_class(name, &osds);
b5b8bbf5
FG
5434 if (f) {
5435 f->open_array_section("osds");
5436 for (auto &osd: osds)
5437 f->dump_int("osd", osd);
5438 f->close_section();
5439 f->flush(rdata);
5440 } else {
5441 bool first = true;
5442 for (auto &osd : osds) {
5443 if (!first)
5444 ds << "\n";
5445 first = false;
5446 ds << osd;
5447 }
5448 rdata.append(ds);
5449 }
7c673cae
FG
5450 } else if (prefix == "osd erasure-code-profile ls") {
5451 const auto &profiles = osdmap.get_erasure_code_profiles();
5452 if (f)
5453 f->open_array_section("erasure-code-profiles");
5454 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
5455 if (f)
5456 f->dump_string("profile", i->first.c_str());
5457 else
5458 rdata.append(i->first + "\n");
5459 }
5460 if (f) {
5461 f->close_section();
5462 ostringstream rs;
5463 f->flush(rs);
5464 rs << "\n";
5465 rdata.append(rs.str());
5466 }
c07f9fc5
FG
5467 } else if (prefix == "osd crush weight-set ls") {
5468 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5469 if (f) {
5470 f->open_array_section("weight_sets");
5471 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
5472 f->dump_string("pool", "(compat)");
5473 }
5474 for (auto& i : osdmap.crush->choose_args) {
5475 if (i.first >= 0) {
5476 f->dump_string("pool", osdmap.get_pool_name(i.first));
5477 }
5478 }
5479 f->close_section();
5480 f->flush(rdata);
5481 } else {
5482 ostringstream rs;
5483 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
5484 rs << "(compat)\n";
5485 }
5486 for (auto& i : osdmap.crush->choose_args) {
5487 if (i.first >= 0) {
5488 rs << osdmap.get_pool_name(i.first) << "\n";
5489 }
5490 }
5491 rdata.append(rs.str());
5492 }
5493 } else if (prefix == "osd crush weight-set dump") {
5494 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
5495 "json-pretty"));
5496 osdmap.crush->dump_choose_args(f.get());
5497 f->flush(rdata);
7c673cae
FG
5498 } else if (prefix == "osd erasure-code-profile get") {
5499 string name;
5500 cmd_getval(g_ceph_context, cmdmap, "name", name);
5501 if (!osdmap.has_erasure_code_profile(name)) {
5502 ss << "unknown erasure code profile '" << name << "'";
5503 r = -ENOENT;
5504 goto reply;
5505 }
5506 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
5507 if (f)
5508 f->open_object_section("profile");
5509 for (map<string,string>::const_iterator i = profile.begin();
5510 i != profile.end();
5511 ++i) {
5512 if (f)
5513 f->dump_string(i->first.c_str(), i->second.c_str());
5514 else
5515 rdata.append(i->first + "=" + i->second + "\n");
5516 }
5517 if (f) {
5518 f->close_section();
5519 ostringstream rs;
5520 f->flush(rs);
5521 rs << "\n";
5522 rdata.append(rs.str());
5523 }
181888fb
FG
5524 } else if (prefix == "osd pool application get") {
5525 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
5526 "json-pretty"));
5527 string pool_name;
5528 cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
5529 string app;
5530 cmd_getval(g_ceph_context, cmdmap, "app", app);
5531 string key;
5532 cmd_getval(g_ceph_context, cmdmap, "key", key);
5533
5534 if (pool_name.empty()) {
5535 // all
5536 f->open_object_section("pools");
5537 for (const auto &pool : osdmap.pools) {
5538 std::string name("<unknown>");
5539 const auto &pni = osdmap.pool_name.find(pool.first);
5540 if (pni != osdmap.pool_name.end())
5541 name = pni->second;
5542 f->open_object_section(name.c_str());
5543 for (auto &app_pair : pool.second.application_metadata) {
5544 f->open_object_section(app_pair.first.c_str());
5545 for (auto &kv_pair : app_pair.second) {
5546 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
5547 }
5548 f->close_section();
5549 }
5550 f->close_section(); // name
5551 }
5552 f->close_section(); // pools
5553 f->flush(rdata);
5554 } else {
5555 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
5556 if (pool < 0) {
5557 ss << "unrecognized pool '" << pool_name << "'";
5558 r = -ENOENT;
5559 goto reply;
5560 }
5561 auto p = osdmap.get_pg_pool(pool);
5562 // filter by pool
5563 if (app.empty()) {
5564 f->open_object_section(pool_name.c_str());
5565 for (auto &app_pair : p->application_metadata) {
5566 f->open_object_section(app_pair.first.c_str());
5567 for (auto &kv_pair : app_pair.second) {
5568 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
5569 }
5570 f->close_section(); // application
5571 }
5572 f->close_section(); // pool_name
5573 f->flush(rdata);
5574 goto reply;
5575 }
5576
5577 auto app_it = p->application_metadata.find(app);
5578 if (app_it == p->application_metadata.end()) {
5579 ss << "pool '" << pool_name << "' has no application '" << app << "'";
5580 r = -ENOENT;
5581 goto reply;
5582 }
5583 // filter by pool + app
5584 if (key.empty()) {
5585 f->open_object_section(app_it->first.c_str());
5586 for (auto &kv_pair : app_it->second) {
5587 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
5588 }
5589 f->close_section(); // application
5590 f->flush(rdata);
5591 goto reply;
5592 }
5593 // filter by pool + app + key
5594 auto key_it = app_it->second.find(key);
5595 if (key_it == app_it->second.end()) {
5596 ss << "application '" << app << "' on pool '" << pool_name
5597 << "' does not have key '" << key << "'";
5598 r = -ENOENT;
5599 goto reply;
5600 }
5601 ss << key_it->second << "\n";
5602 rdata.append(ss.str());
5603 ss.str("");
5604 }
7c673cae
FG
5605 } else {
5606 // try prepare update
5607 return false;
5608 }
5609
5610 reply:
5611 string rs;
5612 getline(ss, rs);
5613 mon->reply_command(op, r, rs, rdata, get_last_committed());
5614 return true;
5615}
5616
3efd9988
FG
5617void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
5618{
5619 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
5620 osdmap.get_pg_pool(pool_id));
5621 assert(pool);
5622 pool->set_flag(flags);
5623}
5624
5625void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
7c673cae 5626{
3efd9988
FG
5627 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
5628 osdmap.get_pg_pool(pool_id));
5629 assert(pool);
5630 pool->unset_flag(flags);
7c673cae
FG
5631}
5632
// Reconcile every pool's quota-related FULL flags with its current usage
// statistics from the pg service.
//
// Returns true if any pool flag was changed in pending_inc (i.e. the
// caller should propose a new map epoch), false otherwise.
bool OSDMonitor::update_pools_status()
{
  // stats are unavailable until the pg service is readable
  if (!mon->pgservice->is_readable())
    return false;

  bool ret = false;

  auto& pools = osdmap.get_pools();
  for (auto it = pools.begin(); it != pools.end(); ++it) {
    // skip pools we have no usage stats for yet
    const pool_stat_t *pstat = mon->pgservice->get_pool_stat(it->first);
    if (!pstat)
      continue;
    const object_stat_sum_t& sum = pstat->stats.sum;
    const pg_pool_t &pool = it->second;
    const string& pool_name = osdmap.get_pool_name(it->first);

    // a pool is "full" once either configured quota (bytes or objects,
    // 0 meaning unlimited) has been reached
    bool pool_is_full =
      (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
      (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);

    if (pool.has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
      // pool is currently flagged full-by-quota; clear the flags once it
      // drops back under quota
      if (pool_is_full)
	continue;

      mon->clog->info() << "pool '" << pool_name
                       << "' no longer out of quota; removing NO_QUOTA flag";
      // below we cancel FLAG_FULL too, we'll set it again in
      // OSDMonitor::encode_pending if it still fails the osd-full checking.
      clear_pool_flags(it->first,
                       pg_pool_t::FLAG_FULL_NO_QUOTA | pg_pool_t::FLAG_FULL);
      ret = true;
    } else {
      // pool not flagged yet; flag it if it has now exceeded a quota
      if (!pool_is_full)
	continue;

      if (pool.quota_max_bytes > 0 &&
          (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
        mon->clog->warn() << "pool '" << pool_name << "' is full"
                         << " (reached quota's max_bytes: "
                         << si_t(pool.quota_max_bytes) << ")";
      }
      if (pool.quota_max_objects > 0 &&
	  (uint64_t)sum.num_objects >= pool.quota_max_objects) {
        mon->clog->warn() << "pool '" << pool_name << "' is full"
                         << " (reached quota's max_objects: "
                         << pool.quota_max_objects << ")";
      }
      // set both FLAG_FULL_NO_QUOTA and FLAG_FULL
      // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
      // since FLAG_FULL should always take precedence
      set_pool_flags(it->first,
		     pg_pool_t::FLAG_FULL_NO_QUOTA | pg_pool_t::FLAG_FULL);
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_NEARFULL |
		       pg_pool_t::FLAG_BACKFILLFULL);
      ret = true;
    }
  }
  return ret;
}
5693
7c673cae
FG
5694int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
5695{
5696 op->mark_osdmon_event(__func__);
5697 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
5698 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
5699 MonSession *session = m->get_session();
5700 if (!session)
5701 return -EPERM;
5702 string erasure_code_profile;
5703 stringstream ss;
31f18b77 5704 string rule_name;
94b18763 5705 int ret = 0;
7c673cae 5706 if (m->auid)
94b18763 5707 ret = prepare_new_pool(m->name, m->auid, m->crush_rule, rule_name,
7c673cae
FG
5708 0, 0,
5709 erasure_code_profile,
5710 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, &ss);
5711 else
94b18763 5712 ret = prepare_new_pool(m->name, session->auid, m->crush_rule, rule_name,
7c673cae
FG
5713 0, 0,
5714 erasure_code_profile,
5715 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, &ss);
94b18763
FG
5716
5717 if (ret < 0) {
5718 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
5719 }
5720 return ret;
7c673cae
FG
5721}
5722
5723int OSDMonitor::crush_rename_bucket(const string& srcname,
5724 const string& dstname,
5725 ostream *ss)
5726{
5727 int ret;
5728 //
5729 // Avoid creating a pending crush if it does not already exists and
5730 // the rename would fail.
5731 //
5732 if (!_have_pending_crush()) {
5733 ret = _get_stable_crush().can_rename_bucket(srcname,
5734 dstname,
5735 ss);
5736 if (ret)
5737 return ret;
5738 }
5739
5740 CrushWrapper newcrush;
5741 _get_pending_crush(newcrush);
5742
5743 ret = newcrush.rename_bucket(srcname,
5744 dstname,
5745 ss);
5746 if (ret)
5747 return ret;
5748
5749 pending_inc.crush.clear();
5750 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
5751 *ss << "renamed bucket " << srcname << " into " << dstname;
5752 return 0;
5753}
5754
5755void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
5756{
5757 string replacement = "";
5758
5759 if (plugin == "jerasure_generic" ||
5760 plugin == "jerasure_sse3" ||
5761 plugin == "jerasure_sse4" ||
5762 plugin == "jerasure_neon") {
5763 replacement = "jerasure";
5764 } else if (plugin == "shec_generic" ||
5765 plugin == "shec_sse3" ||
5766 plugin == "shec_sse4" ||
5767 plugin == "shec_neon") {
5768 replacement = "shec";
5769 }
5770
5771 if (replacement != "") {
5772 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
5773 << plugin << " that has been deprecated. Please use "
5774 << replacement << " instead." << dendl;
5775 }
5776}
5777
5778int OSDMonitor::normalize_profile(const string& profilename,
5779 ErasureCodeProfile &profile,
5780 bool force,
5781 ostream *ss)
5782{
5783 ErasureCodeInterfaceRef erasure_code;
5784 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
5785 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
5786 check_legacy_ec_plugin(plugin->second, profilename);
5787 int err = instance.factory(plugin->second,
5788 g_conf->get_val<std::string>("erasure_code_dir"),
5789 profile, &erasure_code, ss);
5790 if (err) {
5791 return err;
5792 }
5793
5794 err = erasure_code->init(profile, ss);
5795 if (err) {
5796 return err;
5797 }
5798
5799 auto it = profile.find("stripe_unit");
5800 if (it != profile.end()) {
5801 string err_str;
5802 uint32_t stripe_unit = strict_si_cast<uint32_t>(it->second.c_str(), &err_str);
5803 if (!err_str.empty()) {
5804 *ss << "could not parse stripe_unit '" << it->second
5805 << "': " << err_str << std::endl;
5806 return -EINVAL;
5807 }
5808 uint32_t data_chunks = erasure_code->get_data_chunk_count();
5809 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
5810 if (chunk_size != stripe_unit) {
5811 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
5812 << "alignment. Would be padded to " << chunk_size
5813 << std::endl;
5814 return -EINVAL;
5815 }
5816 if ((stripe_unit % 4096) != 0 && !force) {
5817 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
5818 << "use --force to override this check" << std::endl;
5819 return -EINVAL;
5820 }
5821 }
5822 return 0;
5823}
5824
// Create (or find) a CRUSH rule for an erasure-coded pool.
//
// Return values are meaningful to the caller's retry logic:
//   -EEXIST  : rule already committed; *rule is set and usable now
//   -EALREADY: rule exists only in the pending crush; wait for proposal
//    0       : rule newly created and staged in pending_inc; wait for proposal
//   <0       : other error (e.g. plugin load failure)
int OSDMonitor::crush_rule_create_erasure(const string &name,
					  const string &profile,
					  int *rule,
					  ostream *ss)
{
  // already in the committed map?
  int ruleid = osdmap.crush->get_rule_id(name);
  if (ruleid != -ENOENT) {
    *rule = osdmap.crush->get_rule_mask_ruleset(ruleid);
    return -EEXIST;
  }

  CrushWrapper newcrush;
  _get_pending_crush(newcrush);

  // already staged in the pending map?
  ruleid = newcrush.get_rule_id(name);
  if (ruleid != -ENOENT) {
    *rule = newcrush.get_rule_mask_ruleset(ruleid);
    return -EALREADY;
  } else {
    // let the erasure-code plugin build the rule appropriate for its layout
    ErasureCodeInterfaceRef erasure_code;
    int err = get_erasure_code(profile, &erasure_code, ss);
    if (err) {
      *ss << "failed to load plugin using profile " << profile << std::endl;
      return err;
    }

    err = erasure_code->create_rule(name, newcrush, ss);
    erasure_code.reset();
    if (err < 0)
      return err;
    // on success create_rule() returns the new rule id
    *rule = err;
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    return 0;
  }
}
5861
5862int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
5863 ErasureCodeInterfaceRef *erasure_code,
5864 ostream *ss) const
5865{
5866 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
5867 return -EAGAIN;
5868 ErasureCodeProfile profile =
5869 osdmap.get_erasure_code_profile(erasure_code_profile);
5870 ErasureCodeProfile::const_iterator plugin =
5871 profile.find("plugin");
5872 if (plugin == profile.end()) {
5873 *ss << "cannot determine the erasure code plugin"
5874 << " because there is no 'plugin' entry in the erasure_code_profile "
5875 << profile << std::endl;
5876 return -EINVAL;
5877 }
5878 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
5879 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
5880 return instance.factory(plugin->second,
5881 g_conf->get_val<std::string>("erasure_code_dir"),
5882 profile, erasure_code, ss);
5883}
5884
5885int OSDMonitor::check_cluster_features(uint64_t features,
5886 stringstream &ss)
5887{
5888 stringstream unsupported_ss;
5889 int unsupported_count = 0;
5890 if ((mon->get_quorum_con_features() & features) != features) {
5891 unsupported_ss << "the monitor cluster";
5892 ++unsupported_count;
5893 }
5894
5895 set<int32_t> up_osds;
5896 osdmap.get_up_osds(up_osds);
5897 for (set<int32_t>::iterator it = up_osds.begin();
5898 it != up_osds.end(); ++it) {
5899 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
5900 if ((xi.features & features) != features) {
5901 if (unsupported_count > 0)
5902 unsupported_ss << ", ";
5903 unsupported_ss << "osd." << *it;
5904 unsupported_count ++;
5905 }
5906 }
5907
5908 if (unsupported_count > 0) {
5909 ss << "features " << features << " unsupported by: "
5910 << unsupported_ss.str();
5911 return -ENOTSUP;
5912 }
5913
5914 // check pending osd state, too!
5915 for (map<int32_t,osd_xinfo_t>::const_iterator p =
5916 pending_inc.new_xinfo.begin();
5917 p != pending_inc.new_xinfo.end(); ++p) {
5918 const osd_xinfo_t &xi = p->second;
5919 if ((xi.features & features) != features) {
5920 dout(10) << __func__ << " pending osd." << p->first
5921 << " features are insufficient; retry" << dendl;
5922 return -EAGAIN;
5923 }
5924 }
5925
5926 return 0;
5927}
5928
// Check whether applying a candidate crush map would require features
// beyond what clients (require_min_compat_client) or cluster daemons
// support.  Builds a throwaway map with the new crush applied and examines
// its feature requirements.  Returns true if the crush map is acceptable.
bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
                                                 stringstream& ss)
{
  // simulate the result: copy pending_inc, swap in the candidate crush,
  // and apply it to a deep copy of the current osdmap
  OSDMap::Incremental new_pending = pending_inc;
  ::encode(*newcrush, new_pending.crush, mon->get_quorum_con_features());
  OSDMap newmap;
  newmap.deepish_copy_from(osdmap);
  newmap.apply_incremental(new_pending);

  // client compat
  if (newmap.require_min_compat_client > 0) {
    auto mv = newmap.get_min_compat_client();
    if (mv > newmap.require_min_compat_client) {
      ss << "new crush map requires client version " << ceph_release_name(mv)
	 << " but require_min_compat_client is "
	 << ceph_release_name(newmap.require_min_compat_client);
      return false;
    }
  }

  // osd compat
  uint64_t features =
    newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
    newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
  stringstream features_ss;
  int r = check_cluster_features(features, features_ss);
  if (r) {
    ss << "Could not change CRUSH: " << features_ss.str();
    return false;
  }

  return true;
}
5962
5963bool OSDMonitor::erasure_code_profile_in_use(
5964 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
5965 const string &profile,
5966 ostream *ss)
5967{
5968 bool found = false;
5969 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
5970 p != pools.end();
5971 ++p) {
5972 if (p->second.erasure_code_profile == profile) {
5973 *ss << osdmap.pool_name[p->first] << " ";
5974 found = true;
5975 }
5976 }
5977 if (found) {
5978 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
5979 }
5980 return found;
5981}
5982
// Build an erasure-code profile map from user-supplied "key=value" strings
// layered over the configured default profile.
//
// Semantics: start from osd_pool_default_erasure_code_profile; apply each
// user pair on top.  If the user specifies a *different* plugin than the
// default, the defaults are discarded entirely and only the user's pairs
// are kept.  Legacy "ruleset-*" keys are either rewritten to "crush-*"
// (when mon_fixup_legacy_erasure_code_profiles is set) or rejected.
//
// @return 0 on success, negative errno on parse/validation failure.
int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
					   map<string,string> *erasure_code_profile_map,
					   ostream *ss)
{
  // seed the result with the configured default profile (JSON string)
  int r = get_json_str_map(g_conf->osd_pool_default_erasure_code_profile,
		           *ss,
		           erasure_code_profile_map);
  if (r)
    return r;
  assert((*erasure_code_profile_map).count("plugin"));
  string default_plugin = (*erasure_code_profile_map)["plugin"];
  map<string,string> user_map;
  for (vector<string>::const_iterator i = erasure_code_profile.begin();
       i != erasure_code_profile.end();
       ++i) {
    size_t equal = i->find('=');
    if (equal == string::npos) {
      // bare key with no '=': record it with an empty value
      user_map[*i] = string();
      (*erasure_code_profile_map)[*i] = string();
    } else {
      string key = i->substr(0, equal);
      equal++;
      const string value = i->substr(equal);
      // "ruleset-*" keys were renamed to "crush-*" in luminous
      if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
	  key.find("ruleset-") == 0) {
	if (g_conf->get_val<bool>("mon_fixup_legacy_erasure_code_profiles")) {
	  mon->clog->warn() << "erasure code profile property '" << key
			    << "' is no longer supported; try "
			    << "'crush-" << key.substr(8) << "' instead";
	  key = string("crush-") + key.substr(8);
	} else {
	  *ss << "property '" << key << "' is no longer supported; try "
	      << "'crush-" << key.substr(8) << "' instead";
	  return -EINVAL;
	}
      }
      user_map[key] = value;
      (*erasure_code_profile_map)[key] = value;
    }
  }

  // a different plugin invalidates the defaults; keep only the user's pairs
  if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
    (*erasure_code_profile_map) = user_map;

  return 0;
}
6029
6030int OSDMonitor::prepare_pool_size(const unsigned pool_type,
6031 const string &erasure_code_profile,
6032 unsigned *size, unsigned *min_size,
6033 ostream *ss)
6034{
6035 int err = 0;
6036 switch (pool_type) {
6037 case pg_pool_t::TYPE_REPLICATED:
6038 *size = g_conf->osd_pool_default_size;
6039 *min_size = g_conf->get_osd_pool_default_min_size();
6040 break;
6041 case pg_pool_t::TYPE_ERASURE:
6042 {
6043 ErasureCodeInterfaceRef erasure_code;
6044 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
6045 if (err == 0) {
6046 *size = erasure_code->get_chunk_count();
6047 *min_size = MIN(erasure_code->get_data_chunk_count() + 1, *size);
6048 }
6049 }
6050 break;
6051 default:
6052 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
6053 err = -EINVAL;
6054 break;
6055 }
6056 return err;
6057}
6058
6059int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
6060 const string &erasure_code_profile,
6061 uint32_t *stripe_width,
6062 ostream *ss)
6063{
6064 int err = 0;
6065 switch (pool_type) {
6066 case pg_pool_t::TYPE_REPLICATED:
6067 // ignored
6068 break;
6069 case pg_pool_t::TYPE_ERASURE:
6070 {
6071 ErasureCodeProfile profile =
6072 osdmap.get_erasure_code_profile(erasure_code_profile);
6073 ErasureCodeInterfaceRef erasure_code;
6074 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
6075 if (err)
6076 break;
6077 uint32_t data_chunks = erasure_code->get_data_chunk_count();
6078 uint32_t stripe_unit = g_conf->osd_pool_erasure_code_stripe_unit;
6079 auto it = profile.find("stripe_unit");
6080 if (it != profile.end()) {
6081 string err_str;
6082 stripe_unit = strict_si_cast<uint32_t>(it->second.c_str(), &err_str);
6083 assert(err_str.empty());
6084 }
6085 *stripe_width = data_chunks *
6086 erasure_code->get_chunk_size(stripe_unit * data_chunks);
6087 }
6088 break;
6089 default:
6090 *ss << "prepare_pool_stripe_width: "
6091 << pool_type << " is not a known pool type";
6092 err = -EINVAL;
6093 break;
6094 }
6095 return err;
6096}
6097
// Resolve the CRUSH rule to be used by a new pool.
//
// On input *crush_rule < 0 means "pick for me": replicated pools use the
// named rule (or the configured default when rule_name is empty), while
// erasure pools get a rule created from the erasure-code profile.  A
// non-negative *crush_rule is simply validated for existence.
//
// Returns 0 on success, -EAGAIN when a newly created/pending rule must be
// proposed first (caller should retry), or another negative errno.
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  // Use default rule
	  *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(g_ceph_context);
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
		<< "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	// remap crush_rule_create_erasure()'s result for our caller:
	//   pending/new rule -> -EAGAIN (wait for proposal)
	//   committed rule   -> 0 (usable immediately)
	switch (err) {
	case -EALREADY:
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
      break;
    }
  } else {
    // caller supplied an explicit rule; just verify it exists
    if (!osdmap.crush->ruleset_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
6159
31f18b77 6160int OSDMonitor::get_crush_rule(const string &rule_name,
224ce89b
WB
6161 int *crush_rule,
6162 ostream *ss)
7c673cae
FG
6163{
6164 int ret;
31f18b77 6165 ret = osdmap.crush->get_rule_id(rule_name);
7c673cae
FG
6166 if (ret != -ENOENT) {
6167 // found it, use it
31f18b77 6168 *crush_rule = ret;
7c673cae
FG
6169 } else {
6170 CrushWrapper newcrush;
6171 _get_pending_crush(newcrush);
6172
31f18b77 6173 ret = newcrush.get_rule_id(rule_name);
7c673cae
FG
6174 if (ret != -ENOENT) {
6175 // found it, wait for it to be proposed
31f18b77 6176 dout(20) << __func__ << ": rule " << rule_name
7c673cae
FG
6177 << " try again" << dendl;
6178 return -EAGAIN;
6179 } else {
224ce89b 6180 // Cannot find it , return error
31f18b77 6181 *ss << "specified rule " << rule_name << " doesn't exist";
7c673cae
FG
6182 return ret;
6183 }
6184 }
6185 return 0;
6186}
6187
3efd9988
FG
6188int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, ostream *ss)
6189{
6190 auto max_pgs_per_osd = g_conf->get_val<uint64_t>("mon_max_pg_per_osd");
6191 auto num_osds = std::max(osdmap.get_num_in_osds(), 3u); // assume min cluster size 3
6192 auto max_pgs = max_pgs_per_osd * num_osds;
6193 uint64_t projected = 0;
6194 if (pool < 0) {
6195 projected += pg_num * size;
6196 }
6197 for (const auto& i : osdmap.get_pools()) {
6198 if (i.first == pool) {
6199 projected += pg_num * size;
6200 } else {
6201 projected += i.second.get_pg_num() * i.second.get_size();
6202 }
6203 }
6204 if (projected > max_pgs) {
6205 if (pool >= 0) {
6206 *ss << "pool id " << pool;
6207 }
6208 *ss << " pg_num " << pg_num << " size " << size
6209 << " would mean " << projected
6210 << " total pgs, which exceeds max " << max_pgs
6211 << " (mon_max_pg_per_osd " << max_pgs_per_osd
6212 << " * num_in_osds " << num_osds << ")";
6213 return -ERANGE;
6214 }
6215 return 0;
6216}
6217
/**
 * Stage creation of a new pool in pending_inc.
 *
 * @param name The name of the new pool
 * @param auid The auid of the pool owner. Can be -1
 * @param crush_rule The crush rule to use. If <0, will use the system default
 * @param crush_rule_name The crush rule to use, if crush_rulset <0
 * @param pg_num The pg_num to use. If set to 0, will use the system default
 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
 * @param pool_type TYPE_ERASURE, or TYPE_REP
 * @param expected_num_objects expected number of objects on the pool
 * @param fast_read fast read type.
 * @param ss human readable error message, if any.
 *
 * @return 0 on success, negative errno on failure.
 */
int OSDMonitor::prepare_new_pool(string& name, uint64_t auid,
				 int crush_rule,
				 const string &crush_rule_name,
				 unsigned pg_num, unsigned pgp_num,
				 const string &erasure_code_profile,
                                 const unsigned pool_type,
                                 const uint64_t expected_num_objects,
                                 FastReadType fast_read,
				 ostream *ss)
{
  if (name.length() == 0)
    return -EINVAL;
  // fill in configured defaults for unset pg counts
  if (pg_num == 0)
    pg_num = g_conf->osd_pool_default_pg_num;
  if (pgp_num == 0)
    pgp_num = g_conf->osd_pool_default_pgp_num;
  if (pg_num > (unsigned)g_conf->mon_max_pool_pg_num) {
    *ss << "'pg_num' must be greater than 0 and less than or equal to "
        << g_conf->mon_max_pool_pg_num
        << " (you may adjust 'mon max pool pg num' for higher values)";
    return -ERANGE;
  }
  if (pgp_num > pg_num) {
    *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
        << ", which in this case is " << pg_num;
    return -ERANGE;
  }
  // fast_read is an erasure-coding-only feature
  if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
    *ss << "'fast_read' can only apply to erasure coding pool";
    return -EINVAL;
  }
  int r;
  r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
			      crush_rule_name, &crush_rule, ss);
  if (r) {
    dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
    return r;
  }
  if (g_conf->mon_osd_crush_smoke_test) {
    // smoke-test the chosen rule in a forked child before committing to it
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    ostringstream err;
    CrushTester tester(newcrush, err);
    tester.set_min_x(0);
    tester.set_max_x(50);
    tester.set_rule(crush_rule);
    auto start = ceph::coarse_mono_clock::now();
    r = tester.test_with_fork(g_conf->mon_lease);
    auto duration = ceph::coarse_mono_clock::now() - start;
    if (r < 0) {
      dout(10) << "tester.test_with_fork returns " << r
	       << ": " << err.str() << dendl;
      *ss << "crush test failed with " << r << ": " << err.str();
      return r;
    }
    dout(10) << __func__ << " crush smoke test duration: "
             << duration << dendl;
  }
  unsigned size, min_size;
  r = prepare_pool_size(pool_type, erasure_code_profile, &size, &min_size, ss);
  if (r) {
    dout(10) << "prepare_pool_size returns " << r << dendl;
    return r;
  }
  // pool == -1: a new pool, not yet in osdmap
  r = check_pg_num(-1, pg_num, size, ss);
  if (r) {
    dout(10) << "check_pg_num returns " << r << dendl;
    return r;
  }

  if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) {
    return -EINVAL;
  }

  uint32_t stripe_width = 0;
  r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
  if (r) {
    dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
    return r;
  }

  // resolve the effective fast_read flag (erasure pools only)
  bool fread = false;
  if (pool_type == pg_pool_t::TYPE_ERASURE) {
    switch (fast_read) {
      case FAST_READ_OFF:
        fread = false;
        break;
      case FAST_READ_ON:
        fread = true;
        break;
      case FAST_READ_DEFAULT:
        fread = g_conf->mon_osd_pool_ec_fast_read;
        break;
      default:
        *ss << "invalid fast_read setting: " << fast_read;
        return -EINVAL;
    }
  }

  // idempotency: if a pool of this name is already queued for creation in
  // pending_inc, report success without creating a duplicate
  for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
       p != pending_inc.new_pool_names.end();
       ++p) {
    if (p->second == name)
      return 0;
  }

  if (-1 == pending_inc.new_pool_max)
    pending_inc.new_pool_max = osdmap.pool_max;
  // allocate the next pool id and populate the pending pg_pool_t
  int64_t pool = ++pending_inc.new_pool_max;
  pg_pool_t empty;
  pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
  pi->type = pool_type;
  pi->fast_read = fread;
  pi->flags = g_conf->osd_pool_default_flags;
  if (g_conf->osd_pool_default_flag_hashpspool)
    pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
  if (g_conf->osd_pool_default_flag_nodelete)
    pi->set_flag(pg_pool_t::FLAG_NODELETE);
  if (g_conf->osd_pool_default_flag_nopgchange)
    pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
  if (g_conf->osd_pool_default_flag_nosizechange)
    pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
  // GMT hitsets require every up OSD to support the feature bit
  if (g_conf->osd_pool_use_gmt_hitset &&
      (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT))
    pi->use_gmt_hitset = true;
  else
    pi->use_gmt_hitset = false;

  pi->size = size;
  pi->min_size = min_size;
  pi->crush_rule = crush_rule;
  pi->expected_num_objects = expected_num_objects;
  pi->object_hash = CEPH_STR_HASH_RJENKINS;
  pi->set_pg_num(pg_num);
  pi->set_pgp_num(pgp_num);
  pi->last_change = pending_inc.epoch;
  pi->auid = auid;
  pi->erasure_code_profile = erasure_code_profile;
  pi->stripe_width = stripe_width;
  // cache-tier tunables, expressed in micro units (ratio * 1e6)
  pi->cache_target_dirty_ratio_micro =
    g_conf->osd_pool_default_cache_target_dirty_ratio * 1000000;
  pi->cache_target_dirty_high_ratio_micro =
    g_conf->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
  pi->cache_target_full_ratio_micro =
    g_conf->osd_pool_default_cache_target_full_ratio * 1000000;
  pi->cache_min_flush_age = g_conf->osd_pool_default_cache_min_flush_age;
  pi->cache_min_evict_age = g_conf->osd_pool_default_cache_min_evict_age;
  pending_inc.new_pool_names[pool] = name;
  return 0;
}
6383
6384bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
6385{
6386 op->mark_osdmon_event(__func__);
6387 ostringstream ss;
6388 if (pending_inc.new_flags < 0)
6389 pending_inc.new_flags = osdmap.get_flags();
6390 pending_inc.new_flags |= flag;
6391 ss << OSDMap::get_flag_string(flag) << " is set";
6392 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
6393 get_last_committed() + 1));
6394 return true;
6395}
6396
6397bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
6398{
6399 op->mark_osdmon_event(__func__);
6400 ostringstream ss;
6401 if (pending_inc.new_flags < 0)
6402 pending_inc.new_flags = osdmap.get_flags();
6403 pending_inc.new_flags &= ~flag;
6404 ss << OSDMap::get_flag_string(flag) << " is unset";
6405 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
6406 get_last_committed() + 1));
6407 return true;
6408}
6409
7c673cae
FG
6410int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
6411 stringstream& ss)
6412{
6413 string poolstr;
6414 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
6415 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
6416 if (pool < 0) {
6417 ss << "unrecognized pool '" << poolstr << "'";
6418 return -ENOENT;
6419 }
6420 string var;
6421 cmd_getval(g_ceph_context, cmdmap, "var", var);
6422
6423 pg_pool_t p = *osdmap.get_pg_pool(pool);
6424 if (pending_inc.new_pools.count(pool))
6425 p = pending_inc.new_pools[pool];
6426
6427 // accept val as a json string in the normal case (current
6428 // generation monitor). parse out int or float values from the
6429 // string as needed. however, if it is not a string, try to pull
6430 // out an int, in case an older monitor with an older json schema is
6431 // forwarding a request.
6432 string val;
6433 string interr, floaterr;
6434 int64_t n = 0;
6435 double f = 0;
6436 int64_t uf = 0; // micro-f
6437 if (!cmd_getval(g_ceph_context, cmdmap, "val", val)) {
6438 // wasn't a string; maybe an older mon forwarded json with an int?
6439 if (!cmd_getval(g_ceph_context, cmdmap, "val", n))
6440 return -EINVAL; // no value!
6441 } else {
6442 // we got a string. see if it contains an int.
6443 n = strict_strtoll(val.c_str(), 10, &interr);
6444 // or a float
6445 f = strict_strtod(val.c_str(), &floaterr);
6446 uf = llrintl(f * (double)1000000.0);
6447 }
6448
6449 if (!p.is_tier() &&
6450 (var == "hit_set_type" || var == "hit_set_period" ||
6451 var == "hit_set_count" || var == "hit_set_fpp" ||
6452 var == "target_max_objects" || var == "target_max_bytes" ||
6453 var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
6454 var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
6455 var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
6456 var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
6457 var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
6458 return -EACCES;
6459 }
6460
6461 if (var == "size") {
6462 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
6463 ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
6464 return -EPERM;
6465 }
6466 if (p.type == pg_pool_t::TYPE_ERASURE) {
6467 ss << "can not change the size of an erasure-coded pool";
6468 return -ENOTSUP;
6469 }
6470 if (interr.length()) {
6471 ss << "error parsing integer value '" << val << "': " << interr;
6472 return -EINVAL;
6473 }
6474 if (n <= 0 || n > 10) {
6475 ss << "pool size must be between 1 and 10";
6476 return -EINVAL;
6477 }
3efd9988
FG
6478 int r = check_pg_num(pool, p.get_pg_num(), n, &ss);
6479 if (r < 0) {
6480 return r;
6481 }
7c673cae
FG
6482 p.size = n;
6483 if (n < p.min_size)
6484 p.min_size = n;
6485 } else if (var == "min_size") {
6486 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
6487 ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
6488 return -EPERM;
6489 }
6490 if (interr.length()) {
6491 ss << "error parsing integer value '" << val << "': " << interr;
6492 return -EINVAL;
6493 }
6494
6495 if (p.type != pg_pool_t::TYPE_ERASURE) {
6496 if (n < 1 || n > p.size) {
6497 ss << "pool min_size must be between 1 and " << (int)p.size;
6498 return -EINVAL;
6499 }
6500 } else {
6501 ErasureCodeInterfaceRef erasure_code;
6502 int k;
6503 stringstream tmp;
6504 int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
6505 if (err == 0) {
6506 k = erasure_code->get_data_chunk_count();
6507 } else {
b32b8144 6508 ss << __func__ << " get_erasure_code failed: " << tmp.str();
7c673cae
FG
6509 return err;
6510 }
6511
6512 if (n < k || n > p.size) {
6513 ss << "pool min_size must be between " << k << " and " << (int)p.size;
6514 return -EINVAL;
6515 }
6516 }
6517 p.min_size = n;
6518 } else if (var == "auid") {
6519 if (interr.length()) {
6520 ss << "error parsing integer value '" << val << "': " << interr;
6521 return -EINVAL;
6522 }
6523 p.auid = n;
6524 } else if (var == "crash_replay_interval") {
6525 if (interr.length()) {
6526 ss << "error parsing integer value '" << val << "': " << interr;
6527 return -EINVAL;
6528 }
6529 p.crash_replay_interval = n;
6530 } else if (var == "pg_num") {
6531 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
6532 ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
6533 return -EPERM;
6534 }
6535 if (interr.length()) {
6536 ss << "error parsing integer value '" << val << "': " << interr;
6537 return -EINVAL;
6538 }
6539 if (n <= (int)p.get_pg_num()) {
6540 ss << "specified pg_num " << n << " <= current " << p.get_pg_num();
6541 if (n < (int)p.get_pg_num())
6542 return -EEXIST;
6543 return 0;
6544 }
c07f9fc5
FG
6545 if (n > (unsigned)g_conf->mon_max_pool_pg_num) {
6546 ss << "'pg_num' must be greater than 0 and less than or equal to "
6547 << g_conf->mon_max_pool_pg_num
6548 << " (you may adjust 'mon max pool pg num' for higher values)";
6549 return -ERANGE;
6550 }
3efd9988
FG
6551 int r = check_pg_num(pool, n, p.get_size(), &ss);
6552 if (r) {
6553 return r;
6554 }
7c673cae
FG
6555 string force;
6556 cmd_getval(g_ceph_context,cmdmap, "force", force);
6557 if (p.cache_mode != pg_pool_t::CACHEMODE_NONE &&
6558 force != "--yes-i-really-mean-it") {
6559 ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
6560 return -EPERM;
6561 }
6562 int expected_osds = MIN(p.get_pg_num(), osdmap.get_num_osds());
6563 int64_t new_pgs = n - p.get_pg_num();
6564 if (new_pgs > g_conf->mon_osd_max_split_count * expected_osds) {
6565 ss << "specified pg_num " << n << " is too large (creating "
6566 << new_pgs << " new PGs on ~" << expected_osds
6567 << " OSDs exceeds per-OSD max of " << g_conf->mon_osd_max_split_count
6568 << ')';
6569 return -E2BIG;
6570 }
6571 p.set_pg_num(n);
6572 // force pre-luminous clients to resend their ops, since they
6573 // don't understand that split PGs now form a new interval.
6574 p.last_force_op_resend_preluminous = pending_inc.epoch;
6575 } else if (var == "pgp_num") {
6576 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
6577 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
6578 return -EPERM;
6579 }
6580 if (interr.length()) {
6581 ss << "error parsing integer value '" << val << "': " << interr;
6582 return -EINVAL;
6583 }
6584 if (n <= 0) {
6585 ss << "specified pgp_num must > 0, but you set to " << n;
6586 return -EINVAL;
6587 }
6588 if (n > (int)p.get_pg_num()) {
6589 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
6590 return -EINVAL;
6591 }
6592 p.set_pgp_num(n);
6593 } else if (var == "crush_rule") {
6594 int id = osdmap.crush->get_rule_id(val);
6595 if (id == -ENOENT) {
6596 ss << "crush rule " << val << " does not exist";
6597 return -ENOENT;
6598 }
6599 if (id < 0) {
6600 ss << cpp_strerror(id);
6601 return -ENOENT;
6602 }
6603 if (!osdmap.crush->check_crush_rule(id, p.get_type(), p.get_size(), ss)) {
6604 return -EINVAL;
6605 }
31f18b77 6606 p.crush_rule = id;
7c673cae
FG
6607 } else if (var == "nodelete" || var == "nopgchange" ||
6608 var == "nosizechange" || var == "write_fadvise_dontneed" ||
6609 var == "noscrub" || var == "nodeep-scrub") {
6610 uint64_t flag = pg_pool_t::get_flag_by_name(var);
6611 // make sure we only compare against 'n' if we didn't receive a string
6612 if (val == "true" || (interr.empty() && n == 1)) {
6613 p.set_flag(flag);
6614 } else if (val == "false" || (interr.empty() && n == 0)) {
6615 p.unset_flag(flag);
6616 } else {
6617 ss << "expecting value 'true', 'false', '0', or '1'";
6618 return -EINVAL;
6619 }
6620 } else if (var == "hashpspool") {
6621 uint64_t flag = pg_pool_t::get_flag_by_name(var);
6622 string force;
6623 cmd_getval(g_ceph_context, cmdmap, "force", force);
6624 if (force != "--yes-i-really-mean-it") {
6625 ss << "are you SURE? this will remap all placement groups in this pool,"
6626 " this triggers large data movement,"
6627 " pass --yes-i-really-mean-it if you really do.";
6628 return -EPERM;
6629 }
6630 // make sure we only compare against 'n' if we didn't receive a string
6631 if (val == "true" || (interr.empty() && n == 1)) {
6632 p.set_flag(flag);
6633 } else if (val == "false" || (interr.empty() && n == 0)) {
6634 p.unset_flag(flag);
6635 } else {
6636 ss << "expecting value 'true', 'false', '0', or '1'";
6637 return -EINVAL;
6638 }
6639 } else if (var == "hit_set_type") {
6640 if (val == "none")
6641 p.hit_set_params = HitSet::Params();
6642 else {
6643 int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
6644 if (err)
6645 return err;
6646 if (val == "bloom") {
6647 BloomHitSet::Params *bsp = new BloomHitSet::Params;
6648 bsp->set_fpp(g_conf->osd_pool_default_hit_set_bloom_fpp);
6649 p.hit_set_params = HitSet::Params(bsp);
6650 } else if (val == "explicit_hash")
6651 p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
6652 else if (val == "explicit_object")
6653 p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
6654 else {
6655 ss << "unrecognized hit_set type '" << val << "'";
6656 return -EINVAL;
6657 }
6658 }
6659 } else if (var == "hit_set_period") {
6660 if (interr.length()) {
6661 ss << "error parsing integer value '" << val << "': " << interr;
6662 return -EINVAL;
6663 }
6664 p.hit_set_period = n;
6665 } else if (var == "hit_set_count") {
6666 if (interr.length()) {
6667 ss << "error parsing integer value '" << val << "': " << interr;
6668 return -EINVAL;
6669 }
6670 p.hit_set_count = n;
6671 } else if (var == "hit_set_fpp") {
6672 if (floaterr.length()) {
6673 ss << "error parsing floating point value '" << val << "': " << floaterr;
6674 return -EINVAL;
6675 }
6676 if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
6677 ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
6678 return -EINVAL;
6679 }
6680 BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
6681 bloomp->set_fpp(f);
6682 } else if (var == "use_gmt_hitset") {
6683 if (val == "true" || (interr.empty() && n == 1)) {
3efd9988
FG
6684 string force;
6685 cmd_getval(g_ceph_context, cmdmap, "force", force);
6686 if (!osdmap.get_num_up_osds() && force != "--yes-i-really-mean-it") {
6687 ss << "Not advisable to continue since no OSDs are up. Pass "
6688 << "--yes-i-really-mean-it if you really wish to continue.";
6689 return -EPERM;
6690 }
6691 if (!(osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT)
6692 && force != "--yes-i-really-mean-it") {
7c673cae
FG
6693 ss << "not all OSDs support GMT hit set.";
6694 return -EINVAL;
6695 }
6696 p.use_gmt_hitset = true;
6697 } else {
6698 ss << "expecting value 'true' or '1'";
6699 return -EINVAL;
6700 }
6701 } else if (var == "allow_ec_overwrites") {
6702 if (!p.is_erasure()) {
6703 ss << "ec overwrites can only be enabled for an erasure coded pool";
6704 return -EINVAL;
6705 }
224ce89b
WB
6706 stringstream err;
6707 if (!g_conf->mon_debug_no_require_bluestore_for_ec_overwrites &&
6708 !is_pool_currently_all_bluestore(pool, p, &err)) {
6709 ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
6710 return -EINVAL;
6711 }
7c673cae
FG
6712 if (val == "true" || (interr.empty() && n == 1)) {
6713 p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
6714 } else if (val == "false" || (interr.empty() && n == 0)) {
6715 ss << "ec overwrites cannot be disabled once enabled";
6716 return -EINVAL;
6717 } else {
6718 ss << "expecting value 'true', 'false', '0', or '1'";
6719 return -EINVAL;
6720 }
7c673cae
FG
6721 } else if (var == "target_max_objects") {
6722 if (interr.length()) {
6723 ss << "error parsing int '" << val << "': " << interr;
6724 return -EINVAL;
6725 }
6726 p.target_max_objects = n;
6727 } else if (var == "target_max_bytes") {
6728 if (interr.length()) {
6729 ss << "error parsing int '" << val << "': " << interr;
6730 return -EINVAL;
6731 }
6732 p.target_max_bytes = n;
6733 } else if (var == "cache_target_dirty_ratio") {
6734 if (floaterr.length()) {
6735 ss << "error parsing float '" << val << "': " << floaterr;
6736 return -EINVAL;
6737 }
6738 if (f < 0 || f > 1.0) {
6739 ss << "value must be in the range 0..1";
6740 return -ERANGE;
6741 }
6742 p.cache_target_dirty_ratio_micro = uf;
6743 } else if (var == "cache_target_dirty_high_ratio") {
6744 if (floaterr.length()) {
6745 ss << "error parsing float '" << val << "': " << floaterr;
6746 return -EINVAL;
6747 }
6748 if (f < 0 || f > 1.0) {
6749 ss << "value must be in the range 0..1";
6750 return -ERANGE;
6751 }
6752 p.cache_target_dirty_high_ratio_micro = uf;
6753 } else if (var == "cache_target_full_ratio") {
6754 if (floaterr.length()) {
6755 ss << "error parsing float '" << val << "': " << floaterr;
6756 return -EINVAL;
6757 }
6758 if (f < 0 || f > 1.0) {
6759 ss << "value must be in the range 0..1";
6760 return -ERANGE;
6761 }
6762 p.cache_target_full_ratio_micro = uf;
6763 } else if (var == "cache_min_flush_age") {
6764 if (interr.length()) {
6765 ss << "error parsing int '" << val << "': " << interr;
6766 return -EINVAL;
6767 }
6768 p.cache_min_flush_age = n;
6769 } else if (var == "cache_min_evict_age") {
6770 if (interr.length()) {
6771 ss << "error parsing int '" << val << "': " << interr;
6772 return -EINVAL;
6773 }
6774 p.cache_min_evict_age = n;
6775 } else if (var == "min_read_recency_for_promote") {
6776 if (interr.length()) {
6777 ss << "error parsing integer value '" << val << "': " << interr;
6778 return -EINVAL;
6779 }
6780 p.min_read_recency_for_promote = n;
6781 } else if (var == "hit_set_grade_decay_rate") {
6782 if (interr.length()) {
6783 ss << "error parsing integer value '" << val << "': " << interr;
6784 return -EINVAL;
6785 }
6786 if (n > 100 || n < 0) {
6787 ss << "value out of range,valid range is 0 - 100";
6788 return -EINVAL;
6789 }
6790 p.hit_set_grade_decay_rate = n;
6791 } else if (var == "hit_set_search_last_n") {
6792 if (interr.length()) {
6793 ss << "error parsing integer value '" << val << "': " << interr;
6794 return -EINVAL;
6795 }
6796 if (n > p.hit_set_count || n < 0) {
6797 ss << "value out of range,valid range is 0 - hit_set_count";
6798 return -EINVAL;
6799 }
6800 p.hit_set_search_last_n = n;
6801 } else if (var == "min_write_recency_for_promote") {
6802 if (interr.length()) {
6803 ss << "error parsing integer value '" << val << "': " << interr;
6804 return -EINVAL;
6805 }
6806 p.min_write_recency_for_promote = n;
6807 } else if (var == "fast_read") {
6808 if (p.is_replicated()) {
6809 ss << "fast read is not supported in replication pool";
6810 return -EINVAL;
6811 }
6812 if (val == "true" || (interr.empty() && n == 1)) {
6813 p.fast_read = true;
6814 } else if (val == "false" || (interr.empty() && n == 0)) {
6815 p.fast_read = false;
6816 } else {
6817 ss << "expecting value 'true', 'false', '0', or '1'";
6818 return -EINVAL;
6819 }
6820 } else if (pool_opts_t::is_opt_name(var)) {
224ce89b 6821 bool unset = val == "unset";
7c673cae 6822 if (var == "compression_mode") {
224ce89b
WB
6823 if (!unset) {
6824 auto cmode = Compressor::get_comp_mode_type(val);
6825 if (!cmode) {
6826 ss << "unrecognized compression mode '" << val << "'";
6827 return -EINVAL;
6828 }
7c673cae
FG
6829 }
6830 } else if (var == "compression_algorithm") {
224ce89b
WB
6831 if (!unset) {
6832 auto alg = Compressor::get_comp_alg_type(val);
6833 if (!alg) {
6834 ss << "unrecognized compression_algorithm '" << val << "'";
6835 return -EINVAL;
6836 }
7c673cae
FG
6837 }
6838 } else if (var == "compression_required_ratio") {
6839 if (floaterr.length()) {
6840 ss << "error parsing float value '" << val << "': " << floaterr;
6841 return -EINVAL;
6842 }
224ce89b 6843 if (f < 0 || f > 1) {
7c673cae 6844 ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
224ce89b 6845 return -EINVAL;
7c673cae
FG
6846 }
6847 } else if (var == "csum_type") {
224ce89b 6848 auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
7c673cae
FG
6849 if (t < 0 ) {
6850 ss << "unrecognized csum_type '" << val << "'";
224ce89b 6851 return -EINVAL;
7c673cae
FG
6852 }
6853 //preserve csum_type numeric value
6854 n = t;
6855 interr.clear();
6856 } else if (var == "compression_max_blob_size" ||
6857 var == "compression_min_blob_size" ||
6858 var == "csum_max_block" ||
6859 var == "csum_min_block") {
6860 if (interr.length()) {
6861 ss << "error parsing int value '" << val << "': " << interr;
6862 return -EINVAL;
6863 }
6864 }
6865
6866 pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
6867 switch (desc.type) {
6868 case pool_opts_t::STR:
224ce89b 6869 if (unset) {
7c673cae
FG
6870 p.opts.unset(desc.key);
6871 } else {
6872 p.opts.set(desc.key, static_cast<std::string>(val));
6873 }
6874 break;
6875 case pool_opts_t::INT:
6876 if (interr.length()) {
6877 ss << "error parsing integer value '" << val << "': " << interr;
6878 return -EINVAL;
6879 }
6880 if (n == 0) {
6881 p.opts.unset(desc.key);
6882 } else {
6883 p.opts.set(desc.key, static_cast<int>(n));
6884 }
6885 break;
6886 case pool_opts_t::DOUBLE:
6887 if (floaterr.length()) {
6888 ss << "error parsing floating point value '" << val << "': " << floaterr;
6889 return -EINVAL;
6890 }
6891 if (f == 0) {
6892 p.opts.unset(desc.key);
6893 } else {
6894 p.opts.set(desc.key, static_cast<double>(f));
6895 }
6896 break;
6897 default:
6898 assert(!"unknown type");
6899 }
6900 } else {
6901 ss << "unrecognized variable '" << var << "'";
6902 return -EINVAL;
6903 }
224ce89b
WB
6904 if (val != "unset") {
6905 ss << "set pool " << pool << " " << var << " to " << val;
6906 } else {
6907 ss << "unset pool " << pool << " " << var;
6908 }
7c673cae
FG
6909 p.last_change = pending_inc.epoch;
6910 pending_inc.new_pools[pool] = p;
6911 return 0;
6912}
6913
c07f9fc5
FG
6914int OSDMonitor::prepare_command_pool_application(const string &prefix,
6915 map<string,cmd_vartype> &cmdmap,
6916 stringstream& ss)
6917{
6918 string pool_name;
6919 cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
6920 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6921 if (pool < 0) {
6922 ss << "unrecognized pool '" << pool_name << "'";
6923 return -ENOENT;
6924 }
6925
6926 pg_pool_t p = *osdmap.get_pg_pool(pool);
6927 if (pending_inc.new_pools.count(pool)) {
6928 p = pending_inc.new_pools[pool];
6929 }
6930
6931 string app;
6932 cmd_getval(g_ceph_context, cmdmap, "app", app);
6933 bool app_exists = (p.application_metadata.count(app) > 0);
6934
6935 if (boost::algorithm::ends_with(prefix, "enable")) {
6936 if (app.empty()) {
6937 ss << "application name must be provided";
6938 return -EINVAL;
6939 }
6940
6941 if (p.is_tier()) {
6942 ss << "application must be enabled on base tier";
6943 return -EINVAL;
6944 }
6945
6946 string force;
6947 cmd_getval(g_ceph_context, cmdmap, "force", force);
6948
6949 if (!app_exists && !p.application_metadata.empty() &&
6950 force != "--yes-i-really-mean-it") {
6951 ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
6952 << "application; pass --yes-i-really-mean-it to proceed anyway";
6953 return -EPERM;
6954 }
6955
6956 if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
6957 ss << "too many enabled applications on pool '" << pool_name << "'; "
6958 << "max " << MAX_POOL_APPLICATIONS;
6959 return -EINVAL;
6960 }
6961
6962 if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
6963 ss << "application name '" << app << "' too long; max length "
6964 << MAX_POOL_APPLICATION_LENGTH;
6965 return -EINVAL;
6966 }
6967
6968 if (!app_exists) {
6969 p.application_metadata[app] = {};
6970 }
6971 ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
6972
6973 } else if (boost::algorithm::ends_with(prefix, "disable")) {
6974 string force;
6975 cmd_getval(g_ceph_context, cmdmap, "force", force);
6976
6977 if (force != "--yes-i-really-mean-it") {
6978 ss << "Are you SURE? Disabling an application within a pool might result "
6979 << "in loss of application functionality; pass "
6980 << "--yes-i-really-mean-it to proceed anyway";
6981 return -EPERM;
6982 }
6983
6984 if (!app_exists) {
6985 ss << "application '" << app << "' is not enabled on pool '" << pool_name
6986 << "'";
6987 return 0; // idempotent
6988 }
6989
6990 p.application_metadata.erase(app);
6991 ss << "disable application '" << app << "' on pool '" << pool_name << "'";
6992
6993 } else if (boost::algorithm::ends_with(prefix, "set")) {
6994 if (p.is_tier()) {
6995 ss << "application metadata must be set on base tier";
6996 return -EINVAL;
6997 }
6998
6999 if (!app_exists) {
7000 ss << "application '" << app << "' is not enabled on pool '" << pool_name
7001 << "'";
7002 return -ENOENT;
7003 }
7004
7005 string key;
7006 cmd_getval(g_ceph_context, cmdmap, "key", key);
7007
7008 if (key.empty()) {
7009 ss << "key must be provided";
7010 return -EINVAL;
7011 }
7012
7013 auto &app_keys = p.application_metadata[app];
7014 if (app_keys.count(key) == 0 &&
7015 app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
7016 ss << "too many keys set for application '" << app << "' on pool '"
7017 << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
7018 return -EINVAL;
7019 }
7020
7021 if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
7022 ss << "key '" << app << "' too long; max length "
7023 << MAX_POOL_APPLICATION_LENGTH;
7024 return -EINVAL;
7025 }
7026
7027 string value;
7028 cmd_getval(g_ceph_context, cmdmap, "value", value);
7029 if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
7030 ss << "value '" << value << "' too long; max length "
7031 << MAX_POOL_APPLICATION_LENGTH;
7032 return -EINVAL;
7033 }
7034
7035 p.application_metadata[app][key] = value;
7036 ss << "set application '" << app << "' key '" << key << "' to '"
7037 << value << "' on pool '" << pool_name << "'";
7038 } else if (boost::algorithm::ends_with(prefix, "rm")) {
7039 if (!app_exists) {
7040 ss << "application '" << app << "' is not enabled on pool '" << pool_name
7041 << "'";
7042 return -ENOENT;
7043 }
7044
7045 string key;
7046 cmd_getval(g_ceph_context, cmdmap, "key", key);
7047 auto it = p.application_metadata[app].find(key);
7048 if (it == p.application_metadata[app].end()) {
7049 ss << "application '" << app << "' on pool '" << pool_name
7050 << "' does not have key '" << key << "'";
7051 return 0; // idempotent
7052 }
7053
7054 p.application_metadata[app].erase(it);
7055 ss << "removed application '" << app << "' key '" << key << "' on pool '"
7056 << pool_name << "'";
7057 } else {
7058 assert(false);
7059 }
7060
7061 p.last_change = pending_inc.epoch;
7062 pending_inc.new_pools[pool] = p;
7063 return 0;
7064}
7065
31f18b77
FG
7066int OSDMonitor::_prepare_command_osd_crush_remove(
7067 CrushWrapper &newcrush,
7068 int32_t id,
7069 int32_t ancestor,
7070 bool has_ancestor,
7071 bool unlink_only)
7072{
7073 int err = 0;
7074
7075 if (has_ancestor) {
7076 err = newcrush.remove_item_under(g_ceph_context, id, ancestor,
7077 unlink_only);
7078 } else {
7079 err = newcrush.remove_item(g_ceph_context, id, unlink_only);
7080 }
7081 return err;
7082}
7083
// Stage an updated CRUSH map in the pending incremental: any previously
// staged map is discarded and `newcrush` is encoded in its place, using
// the feature bits common to the current monitor quorum so every peer
// can decode it.
void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
{
  pending_inc.crush.clear();
  newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
}
7089
7090int OSDMonitor::prepare_command_osd_crush_remove(
7091 CrushWrapper &newcrush,
7092 int32_t id,
7093 int32_t ancestor,
7094 bool has_ancestor,
7095 bool unlink_only)
7096{
7097 int err = _prepare_command_osd_crush_remove(
7098 newcrush, id, ancestor,
7099 has_ancestor, unlink_only);
7100
7101 if (err < 0)
7102 return err;
7103
7104 assert(err == 0);
7105 do_osd_crush_remove(newcrush);
7106
7107 return 0;
7108}
7109
7110int OSDMonitor::prepare_command_osd_remove(int32_t id)
7111{
7112 if (osdmap.is_up(id)) {
7113 return -EBUSY;
7114 }
7115
7116 pending_inc.new_state[id] = osdmap.get_state(id);
7117 pending_inc.new_uuid[id] = uuid_d();
7118 pending_metadata_rm.insert(id);
7119 pending_metadata.erase(id);
7120
7121 return 0;
7122}
7123
7124int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
7125{
7126 assert(existing_id);
7127 *existing_id = -1;
7128
7129 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
7130 if (!osdmap.exists(i) &&
7131 pending_inc.new_up_client.count(i) == 0 &&
7132 (pending_inc.new_state.count(i) == 0 ||
7133 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
7134 *existing_id = i;
7135 return -1;
7136 }
7137 }
7138
7139 if (pending_inc.new_max_osd < 0) {
7140 return osdmap.get_max_osd();
7141 }
7142 return pending_inc.new_max_osd;
7143}
7144
// Apply the creation of an osd to the pending incremental map, reusing
// an existing id when the uuid is already known.  Callers must have run
// validation (validate_osd_create) beforehand; invariant violations here
// are asserted, not reported.
//
// @param id            requested osd id, or -1 to have one chosen
// @param uuid          osd uuid; may be zero for legacy `osd create`
// @param device_class  if non-empty, CRUSH device class to assign
// @param new_id        out: the id actually used (always set on return)
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const string& device_class,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  assert(new_id);

  // We presume validation has been performed prior to calling this
  // function. We assert with prejudice.

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      // the uuid already maps to an osd; any requested id must agree
      assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
           << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    // reusing a free slot below max_osd; mark it out until it boots
    assert(existing_id < osdmap.get_max_osd());
    assert(allocated_id < 0);
    pending_inc.new_weight[existing_id] = CEPH_OSD_OUT;
    *new_id = existing_id;
  } else if (allocated_id >= 0) {
    assert(existing_id < 0);
    // raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    assert(*new_id == allocated_id);
  } else {
    assert(0 == "unexpected condition");
  }

out:
  if (device_class.size()) {
    // assign the requested device class in a working copy of the CRUSH
    // map, then stage the updated map in the pending incremental
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (newcrush.get_max_devices() < *new_id + 1) {
      newcrush.set_max_devices(*new_id + 1);
    }
    string name = string("osd.") + stringify(*new_id);
    if (!newcrush.item_exists(*new_id)) {
      newcrush.set_item_name(*new_id, name);
    }
    ostringstream ss;
    int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
    if (r < 0) {
      derr << __func__ << " failed to set " << name << " device_class "
           << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
           << dendl;
      // non-fatal... this might be a replay and we want to be idempotent.
    } else {
      dout(20) << __func__ << " set " << name << " device_class " << device_class
               << dendl;
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
  } else {
    dout(20) << __func__ << " no device_class" << dendl;
  }

  dout(10) << __func__ << " using id " << *new_id << dendl;
  // make sure max_osd covers the id we settled on
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  // mark the osd as existing and new in the pending map
  pending_inc.new_state[*new_id] |= CEPH_OSD_EXISTS | CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
7233
7234int OSDMonitor::validate_osd_create(
7235 const int32_t id,
7236 const uuid_d& uuid,
7237 const bool check_osd_exists,
7238 int32_t* existing_id,
7239 stringstream& ss)
7240{
7241
7242 dout(10) << __func__ << " id " << id << " uuid " << uuid
7243 << " check_osd_exists " << check_osd_exists << dendl;
7244
7245 assert(existing_id);
7246
7247 if (id < 0 && uuid.is_zero()) {
7248 // we have nothing to validate
7249 *existing_id = -1;
7250 return 0;
7251 } else if (uuid.is_zero()) {
7252 // we have an id but we will ignore it - because that's what
7253 // `osd create` does.
7254 return 0;
7255 }
7256
7257 /*
7258 * This function will be used to validate whether we are able to
7259 * create a new osd when the `uuid` is specified.
7260 *
7261 * It will be used by both `osd create` and `osd new`, as the checks
7262 * are basically the same when it pertains to osd id and uuid validation.
7263 * However, `osd create` presumes an `uuid` is optional, for legacy
7264 * reasons, while `osd new` requires the `uuid` to be provided. This
7265 * means that `osd create` will not be idempotent if an `uuid` is not
7266 * provided, but we will always guarantee the idempotency of `osd new`.
7267 */
7268
7269 assert(!uuid.is_zero());
7270 if (pending_inc.identify_osd(uuid) >= 0) {
7271 // osd is about to exist
7272 return -EAGAIN;
7273 }
7274
7275 int32_t i = osdmap.identify_osd(uuid);
7276 if (i >= 0) {
7277 // osd already exists
7278 if (id >= 0 && i != id) {
7279 ss << "uuid " << uuid << " already in use for different id " << i;
7280 return -EEXIST;
7281 }
7282 // return a positive errno to distinguish between a blocking error
7283 // and an error we consider to not be a problem (i.e., this would be
7284 // an idempotent operation).
7285 *existing_id = i;
7286 return EEXIST;
7287 }
7288 // i < 0
7289 if (id >= 0) {
7290 if (pending_inc.new_state.count(id)) {
7291 // osd is about to exist
7292 return -EAGAIN;
7293 }
7294 // we may not care if an osd exists if we are recreating a previously
7295 // destroyed osd.
7296 if (check_osd_exists && osdmap.exists(id)) {
7297 ss << "id " << id << " already in use and does not match uuid "
7298 << uuid;
7299 return -EINVAL;
7300 }
7301 }
7302 return 0;
7303}
7304
7305int OSDMonitor::prepare_command_osd_create(
7306 const int32_t id,
7307 const uuid_d& uuid,
7308 int32_t* existing_id,
7309 stringstream& ss)
7310{
7311 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
7312 assert(existing_id);
b5b8bbf5
FG
7313 if (osdmap.is_destroyed(id)) {
7314 ss << "ceph osd create has been deprecated. Please use ceph osd new "
7315 "instead.";
7316 return -EINVAL;
7317 }
31f18b77
FG
7318
7319 if (uuid.is_zero()) {
7320 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
7321 }
7322
7323 return validate_osd_create(id, uuid, true, existing_id, ss);
7324}
7325
// Handle `osd new`: create a brand new osd, or recreate a previously
// destroyed one, staging osdmap/auth/config-key updates as needed.
//
// @param op     the originating monitor op (paxos must be plugged)
// @param cmdmap parsed command arguments ("uuid" required, "id" optional)
// @param params secrets and options from `-i secrets.json`
//               (cephx_secret, cephx_lockbox_secret, dmcrypt_key,
//               crush_device_class)
// @param ss     human-readable output for the client
// @param f      optional formatter for structured output
// @return 0 on success, positive EEXIST for an idempotent no-op,
//         negative errno on failure
int OSDMonitor::prepare_command_osd_new(
    MonOpRequestRef op,
    const map<string,cmd_vartype>& cmdmap,
    const map<string,string>& params,
    stringstream &ss,
    Formatter *f)
{
  uuid_d uuid;
  string uuidstr;
  int64_t id = -1;

  assert(paxos->is_plugged());

  dout(10) << __func__ << " " << op << dendl;

  /* validate command. abort now if something's wrong. */

  /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
   *
   * If `id` is not specified, we will identify any existing osd based
   * on `uuid`. Operation will be idempotent iff secrets match.
   *
   * If `id` is specified, we will identify any existing osd based on
   * `uuid` and match against `id`. If they match, operation will be
   * idempotent iff secrets match.
   *
   * `-i secrets.json` will be optional. If supplied, will be used
   * to check for idempotency when `id` and `uuid` match.
   *
   * If `id` is not specified, and `uuid` does not exist, an id will
   * be found or allocated for the osd.
   *
   * If `id` is specified, and the osd has been previously marked
   * as destroyed, then the `id` will be reused.
   */
  if (!cmd_getval(g_ceph_context, cmdmap, "uuid", uuidstr)) {
    ss << "requires the OSD's UUID to be specified.";
    return -EINVAL;
  } else if (!uuid.parse(uuidstr.c_str())) {
    ss << "invalid UUID value '" << uuidstr << "'.";
    return -EINVAL;
  }

  if (cmd_getval(g_ceph_context, cmdmap, "id", id) &&
      (id < 0)) {
    ss << "invalid OSD id; must be greater or equal than zero.";
    return -EINVAL;
  }

  // are we running an `osd create`-like command, or recreating
  // a previously destroyed osd?

  bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));

  // we will care about `id` to assess whether osd is `destroyed`, or
  // to create a new osd.
  // we will need an `id` by the time we reach auth.

  int32_t existing_id = -1;
  int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
                                &existing_id, ss);

  bool may_be_idempotent = false;
  if (err == EEXIST) {
    // this is idempotent from the osdmon's point-of-view
    may_be_idempotent = true;
    assert(existing_id >= 0);
    id = existing_id;
  } else if (err < 0) {
    return err;
  }

  if (!may_be_idempotent) {
    // idempotency is out of the window. We are either creating a new
    // osd or recreating a destroyed osd.
    //
    // We now need to figure out if we have an `id` (and if it's valid),
    // of find an `id` if we don't have one.

    // NOTE: we need to consider the case where the `id` is specified for
    // `osd create`, and we must honor it. So this means checking if
    // the `id` is destroyed, and if so assume the destroy; otherwise,
    // check if it `exists` - in which case we complain about not being
    // `destroyed`. In the end, if nothing fails, we must allow the
    // creation, so that we are compatible with `create`.
    if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
      ss << "OSD " << id << " has not yet been destroyed";
      return -EINVAL;
    } else if (id < 0) {
      // find an `id`
      id = _allocate_osd_id(&existing_id);
      if (id < 0) {
        // a free slot below max_osd was found instead
        assert(existing_id >= 0);
        id = existing_id;
      }
      dout(10) << __func__ << " found id " << id << " to use" << dendl;
    } else if (id >= 0 && osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " recreating osd." << id << dendl;
    } else {
      dout(10) << __func__ << " creating new osd." << id << dendl;
    }
  } else {
    assert(id >= 0);
    assert(osdmap.exists(id));
  }

  // we are now able to either create a brand new osd or reuse an existing
  // osd that has been previously destroyed.

  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;

  if (may_be_idempotent && params.empty()) {
    // nothing to do, really.
    dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
    assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }

  string device_class;
  auto p = params.find("crush_device_class");
  if (p != params.end()) {
    device_class = p->second;
    dout(20) << __func__ << " device_class will be " << device_class << dendl;
  }
  string cephx_secret, lockbox_secret, dmcrypt_key;
  bool has_lockbox = false;
  bool has_secrets = params.count("cephx_secret")
    || params.count("cephx_lockbox_secret")
    || params.count("dmcrypt_key");

  ConfigKeyService *svc = nullptr;
  AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;

  if (has_secrets) {
    if (params.count("cephx_secret") == 0) {
      ss << "requires a cephx secret.";
      return -EINVAL;
    }
    cephx_secret = params.at("cephx_secret");

    bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
    bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);

    dout(10) << __func__ << " has lockbox " << has_lockbox_secret
             << " dmcrypt " << has_dmcrypt_key << dendl;

    // lockbox secret and dm-crypt key must come (or be absent) together
    if (has_lockbox_secret && has_dmcrypt_key) {
      has_lockbox = true;
      lockbox_secret = params.at("cephx_lockbox_secret");
      dmcrypt_key = params.at("dmcrypt_key");
    } else if (!has_lockbox_secret != !has_dmcrypt_key) {
      ss << "requires both a cephx lockbox secret and a dm-crypt key.";
      return -EINVAL;
    }

    dout(10) << __func__ << " validate secrets using osd id " << id << dendl;

    err = mon->authmon()->validate_osd_new(id, uuid,
                                           cephx_secret,
                                           lockbox_secret,
                                           cephx_entity,
                                           lockbox_entity,
                                           ss);
    if (err < 0) {
      return err;
    } else if (may_be_idempotent && err != EEXIST) {
      // for this to be idempotent, `id` should already be >= 0; no need
      // to use validate_id.
      assert(id >= 0);
      ss << "osd." << id << " exists but secrets do not match";
      return -EEXIST;
    }

    if (has_lockbox) {
      svc = (ConfigKeyService*)mon->config_key_service;
      err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
      if (err < 0) {
        return err;
      } else if (may_be_idempotent && err != EEXIST) {
        assert(id >= 0);
        ss << "osd." << id << " exists but dm-crypt key does not match.";
        return -EEXIST;
      }
    }
  }
  assert(!has_secrets || !cephx_secret.empty());
  assert(!has_lockbox || !lockbox_secret.empty());

  if (may_be_idempotent) {
    // we have nothing to do for either the osdmon or the authmon,
    // and we have no lockbox - so the config key service will not be
    // touched. This is therefore an idempotent operation, and we can
    // just return right away.
    dout(10) << __func__ << " idempotent -- no op." << dendl;
    assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }
  assert(!may_be_idempotent);

  // perform updates.
  if (has_secrets) {
    assert(!cephx_secret.empty());
    assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
           (!lockbox_secret.empty() && !dmcrypt_key.empty()));

    err = mon->authmon()->do_osd_new(cephx_entity,
                                     lockbox_entity,
                                     has_lockbox);
    assert(0 == err);

    if (has_lockbox) {
      assert(nullptr != svc);
      svc->do_osd_new(uuid, dmcrypt_key);
    }
  }

  if (is_recreate_destroyed) {
    assert(id >= 0);
    assert(osdmap.is_destroyed(id));
    pending_inc.new_weight[id] = CEPH_OSD_OUT;
    pending_inc.new_state[id] |= CEPH_OSD_DESTROYED | CEPH_OSD_NEW;
    if (osdmap.get_state(id) & CEPH_OSD_UP) {
      // due to http://tracker.ceph.com/issues/20751 some clusters may
      // have UP set for non-existent OSDs; make sure it is cleared
      // for a newly created osd.
      // NOTE(review): OR-ing UP into new_state appears to rely on the
      // incremental applying these bits as a toggle - confirm against
      // OSDMap::Incremental::apply before changing.
      pending_inc.new_state[id] |= CEPH_OSD_UP;
    }
    pending_inc.new_uuid[id] = uuid;
  } else {
    assert(id >= 0);
    int32_t new_id = -1;
    do_osd_create(id, uuid, device_class, &new_id);
    assert(new_id >= 0);
    assert(id == new_id);
  }

  if (f) {
    f->open_object_section("created_osd");
    f->dump_int("osdid", id);
    f->close_section();
  } else {
    ss << id;
  }

  return 0;
}
7587
7c673cae
FG
7588bool OSDMonitor::prepare_command(MonOpRequestRef op)
7589{
7590 op->mark_osdmon_event(__func__);
7591 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
7592 stringstream ss;
7593 map<string, cmd_vartype> cmdmap;
7594 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
7595 string rs = ss.str();
7596 mon->reply_command(op, -EINVAL, rs, get_last_committed());
7597 return true;
7598 }
7599
7600 MonSession *session = m->get_session();
7601 if (!session) {
7602 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
7603 return true;
7604 }
7605
7606 return prepare_command_impl(op, cmdmap);
7607}
7608
7609static int parse_reweights(CephContext *cct,
7610 const map<string,cmd_vartype> &cmdmap,
7611 const OSDMap& osdmap,
7612 map<int32_t, uint32_t>* weights)
7613{
7614 string weights_str;
7615 if (!cmd_getval(g_ceph_context, cmdmap, "weights", weights_str)) {
7616 return -EINVAL;
7617 }
7618 std::replace(begin(weights_str), end(weights_str), '\'', '"');
7619 json_spirit::mValue json_value;
7620 if (!json_spirit::read(weights_str, json_value)) {
7621 return -EINVAL;
7622 }
7623 if (json_value.type() != json_spirit::obj_type) {
7624 return -EINVAL;
7625 }
7626 const auto obj = json_value.get_obj();
7627 try {
7628 for (auto& osd_weight : obj) {
7629 auto osd_id = std::stoi(osd_weight.first);
7630 if (!osdmap.exists(osd_id)) {
7631 return -ENOENT;
7632 }
7633 if (osd_weight.second.type() != json_spirit::str_type) {
7634 return -EINVAL;
7635 }
7636 auto weight = std::stoul(osd_weight.second.get_str());
7637 weights->insert({osd_id, weight});
7638 }
7639 } catch (const std::logic_error& e) {
7640 return -EINVAL;
7641 }
7642 return 0;
7643}
7644
31f18b77
FG
7645int OSDMonitor::prepare_command_osd_destroy(
7646 int32_t id,
7647 stringstream& ss)
7648{
7649 assert(paxos->is_plugged());
7650
7651 // we check if the osd exists for the benefit of `osd purge`, which may
7652 // have previously removed the osd. If the osd does not exist, return
7653 // -ENOENT to convey this, and let the caller deal with it.
7654 //
7655 // we presume that all auth secrets and config keys were removed prior
7656 // to this command being called. if they exist by now, we also assume
7657 // they must have been created by some other command and do not pertain
7658 // to this non-existent osd.
7659 if (!osdmap.exists(id)) {
7660 dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
7661 return -ENOENT;
7662 }
7663
7664 uuid_d uuid = osdmap.get_uuid(id);
7665 dout(10) << __func__ << " destroying osd." << id
7666 << " uuid " << uuid << dendl;
7667
7668 // if it has been destroyed, we assume our work here is done.
7669 if (osdmap.is_destroyed(id)) {
7670 ss << "destroyed osd." << id;
7671 return 0;
7672 }
7673
7674 EntityName cephx_entity, lockbox_entity;
7675 bool idempotent_auth = false, idempotent_cks = false;
7676
7677 int err = mon->authmon()->validate_osd_destroy(id, uuid,
7678 cephx_entity,
7679 lockbox_entity,
7680 ss);
7681 if (err < 0) {
7682 if (err == -ENOENT) {
7683 idempotent_auth = true;
31f18b77
FG
7684 } else {
7685 return err;
7686 }
7687 }
7688
7689 ConfigKeyService *svc = (ConfigKeyService*)mon->config_key_service;
7690 err = svc->validate_osd_destroy(id, uuid);
7691 if (err < 0) {
7692 assert(err == -ENOENT);
7693 err = 0;
7694 idempotent_cks = true;
7695 }
7696
7697 if (!idempotent_auth) {
7698 err = mon->authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
7699 assert(0 == err);
7700 }
7701
7702 if (!idempotent_cks) {
7703 svc->do_osd_destroy(id, uuid);
7704 }
7705
7706 pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
7707 pending_inc.new_uuid[id] = uuid_d();
7708
7709 // we can only propose_pending() once per service, otherwise we'll be
7710 // defying PaxosService and all laws of nature. Therefore, as we may
7711 // be used during 'osd purge', let's keep the caller responsible for
7712 // proposing.
7713 assert(err == 0);
7714 return 0;
7715}
7716
7717int OSDMonitor::prepare_command_osd_purge(
7718 int32_t id,
7719 stringstream& ss)
7720{
7721 assert(paxos->is_plugged());
7722 dout(10) << __func__ << " purging osd." << id << dendl;
7723
7724 assert(!osdmap.is_up(id));
7725
7726 /*
7727 * This may look a bit weird, but this is what's going to happen:
7728 *
7729 * 1. we make sure that removing from crush works
7730 * 2. we call `prepare_command_osd_destroy()`. If it returns an
7731 * error, then we abort the whole operation, as no updates
7732 * have been made. However, we this function will have
7733 * side-effects, thus we need to make sure that all operations
7734 * performed henceforth will *always* succeed.
7735 * 3. we call `prepare_command_osd_remove()`. Although this
7736 * function can return an error, it currently only checks if the
7737 * osd is up - and we have made sure that it is not so, so there
7738 * is no conflict, and it is effectively an update.
7739 * 4. finally, we call `do_osd_crush_remove()`, which will perform
7740 * the crush update we delayed from before.
7741 */
7742
7743 CrushWrapper newcrush;
7744 _get_pending_crush(newcrush);
7745
7746 bool may_be_idempotent = false;
7747
7748 int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
7749 if (err == -ENOENT) {
7750 err = 0;
7751 may_be_idempotent = true;
7752 } else if (err < 0) {
7753 ss << "error removing osd." << id << " from crush";
7754 return err;
7755 }
7756
7757 // no point destroying the osd again if it has already been marked destroyed
7758 if (!osdmap.is_destroyed(id)) {
7759 err = prepare_command_osd_destroy(id, ss);
7760 if (err < 0) {
7761 if (err == -ENOENT) {
7762 err = 0;
7763 } else {
7764 return err;
7765 }
7766 } else {
7767 may_be_idempotent = false;
7768 }
7769 }
7770 assert(0 == err);
7771
7772 if (may_be_idempotent && !osdmap.exists(id)) {
7773 dout(10) << __func__ << " osd." << id << " does not exist and "
7774 << "we are idempotent." << dendl;
7775 return -ENOENT;
7776 }
7777
7778 err = prepare_command_osd_remove(id);
7779 // we should not be busy, as we should have made sure this id is not up.
7780 assert(0 == err);
7781
7782 do_osd_crush_remove(newcrush);
7783 return 0;
7784}
7785
7c673cae
FG
7786bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
7787 map<string,cmd_vartype> &cmdmap)
7788{
7789 op->mark_osdmon_event(__func__);
7790 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
7791 bool ret = false;
7792 stringstream ss;
7793 string rs;
7794 bufferlist rdata;
7795 int err = 0;
7796
7797 string format;
7798 cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
7799 boost::scoped_ptr<Formatter> f(Formatter::create(format));
7800
7801 string prefix;
7802 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
7803
7804 int64_t osdid;
7805 string name;
b32b8144
FG
7806 bool osdid_present = false;
7807 if (prefix != "osd pg-temp" &&
7808 prefix != "osd pg-upmap" &&
7809 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
7810 osdid_present = cmd_getval(g_ceph_context, cmdmap, "id", osdid);
7811 }
7c673cae
FG
7812 if (osdid_present) {
7813 ostringstream oss;
7814 oss << "osd." << osdid;
7815 name = oss.str();
7816 }
7817
7818 // Even if there's a pending state with changes that could affect
7819 // a command, considering that said state isn't yet committed, we
7820 // just don't care about those changes if the command currently being
7821 // handled acts as a no-op against the current committed state.
7822 // In a nutshell, we assume this command happens *before*.
7823 //
7824 // Let me make this clearer:
7825 //
7826 // - If we have only one client, and that client issues some
7827 // operation that would conflict with this operation but is
7828 // still on the pending state, then we would be sure that said
7829 // operation wouldn't have returned yet, so the client wouldn't
7830 // issue this operation (unless the client didn't wait for the
7831 // operation to finish, and that would be the client's own fault).
7832 //
7833 // - If we have more than one client, each client will observe
7834 // whatever is the state at the moment of the commit. So, if we
7835 // have two clients, one issuing an unlink and another issuing a
7836 // link, and if the link happens while the unlink is still on the
7837 // pending state, from the link's point-of-view this is a no-op.
7838 // If different clients are issuing conflicting operations and
7839 // they care about that, then the clients should make sure they
7840 // enforce some kind of concurrency mechanism -- from our
7841 // perspective that's what Douglas Adams would call an SEP.
7842 //
7843 // This should be used as a general guideline for most commands handled
7844 // in this function. Adapt as you see fit, but please bear in mind that
7845 // this is the expected behavior.
7846
7847
7848 if (prefix == "osd setcrushmap" ||
7849 (prefix == "osd crush set" && !osdid_present)) {
31f18b77
FG
7850 if (pending_inc.crush.length()) {
7851 dout(10) << __func__ << " waiting for pending crush update " << dendl;
7852 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
7853 return true;
7854 }
7c673cae
FG
7855 dout(10) << "prepare_command setting new crush map" << dendl;
7856 bufferlist data(m->get_data());
7857 CrushWrapper crush;
7858 try {
7859 bufferlist::iterator bl(data.begin());
7860 crush.decode(bl);
7861 }
7862 catch (const std::exception &e) {
7863 err = -EINVAL;
7864 ss << "Failed to parse crushmap: " << e.what();
7865 goto reply;
7866 }
31f18b77
FG
7867
7868 int64_t prior_version = 0;
7869 if (cmd_getval(g_ceph_context, cmdmap, "prior_version", prior_version)) {
7870 if (prior_version == osdmap.get_crush_version() - 1) {
7871 // see if we are a resend of the last update. this is imperfect
7872 // (multiple racing updaters may not both get reliable success)
7873 // but we expect crush updaters (via this interface) to be rare-ish.
7874 bufferlist current, proposed;
7875 osdmap.crush->encode(current, mon->get_quorum_con_features());
7876 crush.encode(proposed, mon->get_quorum_con_features());
7877 if (current.contents_equal(proposed)) {
7878 dout(10) << __func__
7879 << " proposed matches current and version equals previous"
7880 << dendl;
7881 err = 0;
7882 ss << osdmap.get_crush_version();
7883 goto reply;
7884 }
7885 }
7886 if (prior_version != osdmap.get_crush_version()) {
7887 err = -EPERM;
7888 ss << "prior_version " << prior_version << " != crush version "
7889 << osdmap.get_crush_version();
7890 goto reply;
7891 }
7892 }
7c673cae 7893
3efd9988 7894 if (crush.has_legacy_rule_ids()) {
31f18b77
FG
7895 err = -EINVAL;
7896 ss << "crush maps with ruleset != ruleid are no longer allowed";
7897 goto reply;
7898 }
7c673cae
FG
7899 if (!validate_crush_against_features(&crush, ss)) {
7900 err = -EINVAL;
7901 goto reply;
7902 }
31f18b77 7903
3efd9988
FG
7904 err = osdmap.validate_crush_rules(&crush, &ss);
7905 if (err < 0) {
7906 goto reply;
7c673cae
FG
7907 }
7908
224ce89b
WB
7909 if (g_conf->mon_osd_crush_smoke_test) {
7910 // sanity check: test some inputs to make sure this map isn't
7911 // totally broken
7912 dout(10) << " testing map" << dendl;
7913 stringstream ess;
7914 CrushTester tester(crush, ess);
b5b8bbf5 7915 tester.set_min_x(0);
224ce89b 7916 tester.set_max_x(50);
b5b8bbf5 7917 auto start = ceph::coarse_mono_clock::now();
224ce89b 7918 int r = tester.test_with_fork(g_conf->mon_lease);
b5b8bbf5 7919 auto duration = ceph::coarse_mono_clock::now() - start;
224ce89b
WB
7920 if (r < 0) {
7921 dout(10) << " tester.test_with_fork returns " << r
7922 << ": " << ess.str() << dendl;
7923 ss << "crush smoke test failed with " << r << ": " << ess.str();
7924 err = r;
7925 goto reply;
7926 }
b5b8bbf5
FG
7927 dout(10) << __func__ << " crush somke test duration: "
7928 << duration << ", result: " << ess.str() << dendl;
7c673cae
FG
7929 }
7930
7c673cae 7931 pending_inc.crush = data;
31f18b77 7932 ss << osdmap.get_crush_version() + 1;
7c673cae
FG
7933 goto update;
7934
3efd9988
FG
7935 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
7936 CrushWrapper newcrush;
7937 _get_pending_crush(newcrush);
7938 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
7939 int bid = -1 - b;
7940 if (newcrush.bucket_exists(bid) &&
7941 newcrush.get_bucket_alg(bid)) {
7942 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
7943 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
7944 }
7945 }
7946 if (!validate_crush_against_features(&newcrush, ss)) {
7947 err = -EINVAL;
7948 goto reply;
7949 }
7950 pending_inc.crush.clear();
7951 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7952 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7953 get_last_committed() + 1));
7954 return true;
7c673cae 7955 } else if (prefix == "osd crush set-device-class") {
224ce89b
WB
7956 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
7957 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
7958 << "luminous' before using crush device classes";
7959 err = -EPERM;
7c673cae
FG
7960 goto reply;
7961 }
7962
7963 string device_class;
7964 if (!cmd_getval(g_ceph_context, cmdmap, "class", device_class)) {
7965 err = -EINVAL; // no value!
7966 goto reply;
7967 }
7968
224ce89b
WB
7969 bool stop = false;
7970 vector<string> idvec;
7971 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
7c673cae
FG
7972 CrushWrapper newcrush;
7973 _get_pending_crush(newcrush);
224ce89b
WB
7974 set<int> updated;
7975 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
7976 set<int> osds;
7977 // wildcard?
7978 if (j == 0 &&
7979 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
7980 osdmap.get_all_osds(osds);
7981 stop = true;
7982 } else {
7983 // try traditional single osd way
7984 long osd = parse_osd_id(idvec[j].c_str(), &ss);
7985 if (osd < 0) {
7986 // ss has reason for failure
7987 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
7988 err = -EINVAL;
7989 continue;
7990 }
7991 osds.insert(osd);
7992 }
7c673cae 7993
224ce89b
WB
7994 for (auto &osd : osds) {
7995 if (!osdmap.exists(osd)) {
7996 ss << "osd." << osd << " does not exist. ";
7997 continue;
7998 }
7c673cae 7999
224ce89b
WB
8000 ostringstream oss;
8001 oss << "osd." << osd;
8002 string name = oss.str();
7c673cae 8003
3a9019d9
FG
8004 if (newcrush.get_max_devices() < osd + 1) {
8005 newcrush.set_max_devices(osd + 1);
8006 }
224ce89b
WB
8007 string action;
8008 if (newcrush.item_exists(osd)) {
8009 action = "updating";
8010 } else {
8011 action = "creating";
8012 newcrush.set_item_name(osd, name);
8013 }
7c673cae 8014
224ce89b
WB
8015 dout(5) << action << " crush item id " << osd << " name '" << name
8016 << "' device_class '" << device_class << "'"
8017 << dendl;
8018 err = newcrush.update_device_class(osd, device_class, name, &ss);
8019 if (err < 0) {
8020 goto reply;
8021 }
8022 if (err == 0 && !_have_pending_crush()) {
8023 if (!stop) {
8024 // for single osd only, wildcard makes too much noise
8025 ss << "set-device-class item id " << osd << " name '" << name
8026 << "' device_class '" << device_class << "': no change";
8027 }
8028 } else {
8029 updated.insert(osd);
8030 }
8031 }
7c673cae
FG
8032 }
8033
224ce89b
WB
8034 if (!updated.empty()) {
8035 pending_inc.crush.clear();
8036 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8037 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
8038 getline(ss, rs);
8039 wait_for_finished_proposal(op,
8040 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
8041 return true;
8042 }
7c673cae 8043
c07f9fc5
FG
8044 } else if (prefix == "osd crush rm-device-class") {
8045 bool stop = false;
8046 vector<string> idvec;
8047 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
8048 CrushWrapper newcrush;
8049 _get_pending_crush(newcrush);
8050 set<int> updated;
8051
8052 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
8053 set<int> osds;
8054
8055 // wildcard?
8056 if (j == 0 &&
8057 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
8058 osdmap.get_all_osds(osds);
8059 stop = true;
8060 } else {
8061 // try traditional single osd way
8062 long osd = parse_osd_id(idvec[j].c_str(), &ss);
8063 if (osd < 0) {
8064 // ss has reason for failure
8065 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
8066 err = -EINVAL;
8067 goto reply;
8068 }
8069 osds.insert(osd);
8070 }
8071
8072 for (auto &osd : osds) {
8073 if (!osdmap.exists(osd)) {
8074 ss << "osd." << osd << " does not exist. ";
8075 continue;
8076 }
8077
8078 auto class_name = newcrush.get_item_class(osd);
c07f9fc5
FG
8079 if (!class_name) {
8080 ss << "osd." << osd << " belongs to no class, ";
8081 continue;
8082 }
8083 // note that we do not verify if class_is_in_use here
8084 // in case the device is misclassified and user wants
8085 // to overridely reset...
8086
8087 err = newcrush.remove_device_class(g_ceph_context, osd, &ss);
8088 if (err < 0) {
8089 // ss has reason for failure
8090 goto reply;
8091 }
8092 updated.insert(osd);
8093 }
8094 }
8095
8096 if (!updated.empty()) {
8097 pending_inc.crush.clear();
8098 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8099 ss << "done removing class of osd(s): " << updated;
8100 getline(ss, rs);
8101 wait_for_finished_proposal(op,
8102 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
8103 return true;
8104 }
35e4c445
FG
8105 } else if (prefix == "osd crush class rename") {
8106 string srcname, dstname;
8107 if (!cmd_getval(g_ceph_context, cmdmap, "srcname", srcname)) {
8108 err = -EINVAL;
8109 goto reply;
8110 }
8111 if (!cmd_getval(g_ceph_context, cmdmap, "dstname", dstname)) {
8112 err = -EINVAL;
8113 goto reply;
8114 }
8115
8116 CrushWrapper newcrush;
8117 _get_pending_crush(newcrush);
181888fb
FG
8118 if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
8119 // suppose this is a replay and return success
8120 // so command is idempotent
8121 ss << "already renamed to '" << dstname << "'";
8122 err = 0;
35e4c445
FG
8123 goto reply;
8124 }
c07f9fc5 8125
35e4c445
FG
8126 err = newcrush.rename_class(srcname, dstname);
8127 if (err < 0) {
8128 ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
8129 << cpp_strerror(err);
8130 goto reply;
8131 }
8132
8133 pending_inc.crush.clear();
8134 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8135 ss << "rename class '" << srcname << "' to '" << dstname << "'";
8136 goto update;
7c673cae
FG
8137 } else if (prefix == "osd crush add-bucket") {
8138 // os crush add-bucket <name> <type>
8139 string name, typestr;
8140 cmd_getval(g_ceph_context, cmdmap, "name", name);
8141 cmd_getval(g_ceph_context, cmdmap, "type", typestr);
8142
8143 if (!_have_pending_crush() &&
8144 _get_stable_crush().name_exists(name)) {
8145 ss << "bucket '" << name << "' already exists";
8146 goto reply;
8147 }
8148
8149 CrushWrapper newcrush;
8150 _get_pending_crush(newcrush);
8151
8152 if (newcrush.name_exists(name)) {
8153 ss << "bucket '" << name << "' already exists";
8154 goto update;
8155 }
8156 int type = newcrush.get_type_id(typestr);
8157 if (type < 0) {
8158 ss << "type '" << typestr << "' does not exist";
8159 err = -EINVAL;
8160 goto reply;
8161 }
8162 if (type == 0) {
8163 ss << "type '" << typestr << "' is for devices, not buckets";
8164 err = -EINVAL;
8165 goto reply;
8166 }
8167 int bucketno;
8168 err = newcrush.add_bucket(0, 0,
8169 CRUSH_HASH_DEFAULT, type, 0, NULL,
8170 NULL, &bucketno);
8171 if (err < 0) {
8172 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
8173 goto reply;
8174 }
8175 err = newcrush.set_item_name(bucketno, name);
8176 if (err < 0) {
8177 ss << "error setting bucket name to '" << name << "'";
8178 goto reply;
8179 }
8180
8181 pending_inc.crush.clear();
8182 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8183 ss << "added bucket " << name << " type " << typestr
8184 << " to crush map";
8185 goto update;
8186 } else if (prefix == "osd crush rename-bucket") {
8187 string srcname, dstname;
8188 cmd_getval(g_ceph_context, cmdmap, "srcname", srcname);
8189 cmd_getval(g_ceph_context, cmdmap, "dstname", dstname);
8190
8191 err = crush_rename_bucket(srcname, dstname, &ss);
8192 if (err == -EALREADY) // equivalent to success for idempotency
8193 err = 0;
8194 if (err)
8195 goto reply;
8196 else
8197 goto update;
c07f9fc5
FG
8198 } else if (prefix == "osd crush weight-set create" ||
8199 prefix == "osd crush weight-set create-compat") {
8200 CrushWrapper newcrush;
8201 _get_pending_crush(newcrush);
8202 int64_t pool;
8203 int positions;
8204 if (newcrush.has_non_straw2_buckets()) {
8205 ss << "crush map contains one or more bucket(s) that are not straw2";
224ce89b
WB
8206 err = -EPERM;
8207 goto reply;
8208 }
c07f9fc5
FG
8209 if (prefix == "osd crush weight-set create") {
8210 if (osdmap.require_min_compat_client > 0 &&
8211 osdmap.require_min_compat_client < CEPH_RELEASE_LUMINOUS) {
8212 ss << "require_min_compat_client "
8213 << ceph_release_name(osdmap.require_min_compat_client)
8214 << " < luminous, which is required for per-pool weight-sets. "
8215 << "Try 'ceph osd set-require-min-compat-client luminous' "
8216 << "before using the new interface";
8217 err = -EPERM;
8218 goto reply;
8219 }
8220 string poolname, mode;
8221 cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
8222 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
8223 if (pool < 0) {
8224 ss << "pool '" << poolname << "' not found";
8225 err = -ENOENT;
8226 goto reply;
8227 }
8228 cmd_getval(g_ceph_context, cmdmap, "mode", mode);
8229 if (mode != "flat" && mode != "positional") {
8230 ss << "unrecognized weight-set mode '" << mode << "'";
8231 err = -EINVAL;
8232 goto reply;
8233 }
8234 positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
8235 } else {
8236 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
8237 positions = 1;
224ce89b 8238 }
c07f9fc5
FG
8239 newcrush.create_choose_args(pool, positions);
8240 pending_inc.crush.clear();
8241 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8242 goto update;
224ce89b 8243
c07f9fc5
FG
8244 } else if (prefix == "osd crush weight-set rm" ||
8245 prefix == "osd crush weight-set rm-compat") {
224ce89b
WB
8246 CrushWrapper newcrush;
8247 _get_pending_crush(newcrush);
c07f9fc5
FG
8248 int64_t pool;
8249 if (prefix == "osd crush weight-set rm") {
8250 string poolname;
8251 cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
8252 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
8253 if (pool < 0) {
8254 ss << "pool '" << poolname << "' not found";
8255 err = -ENOENT;
8256 goto reply;
8257 }
8258 } else {
8259 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
224ce89b 8260 }
c07f9fc5
FG
8261 newcrush.rm_choose_args(pool);
8262 pending_inc.crush.clear();
8263 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8264 goto update;
224ce89b 8265
c07f9fc5
FG
8266 } else if (prefix == "osd crush weight-set reweight" ||
8267 prefix == "osd crush weight-set reweight-compat") {
8268 string poolname, item;
8269 vector<double> weight;
8270 cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
8271 cmd_getval(g_ceph_context, cmdmap, "item", item);
8272 cmd_getval(g_ceph_context, cmdmap, "weight", weight);
8273 CrushWrapper newcrush;
8274 _get_pending_crush(newcrush);
8275 int64_t pool;
8276 if (prefix == "osd crush weight-set reweight") {
8277 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
8278 if (pool < 0) {
8279 ss << "pool '" << poolname << "' not found";
8280 err = -ENOENT;
8281 goto reply;
8282 }
8283 if (!newcrush.have_choose_args(pool)) {
8284 ss << "no weight-set for pool '" << poolname << "'";
8285 err = -ENOENT;
8286 goto reply;
8287 }
8288 auto arg_map = newcrush.choose_args_get(pool);
8289 int positions = newcrush.get_choose_args_positions(arg_map);
8290 if (weight.size() != (size_t)positions) {
8291 ss << "must specify exact " << positions << " weight values";
8292 err = -EINVAL;
8293 goto reply;
8294 }
8295 } else {
8296 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
8297 if (!newcrush.have_choose_args(pool)) {
8298 ss << "no backward-compatible weight-set";
8299 err = -ENOENT;
8300 goto reply;
8301 }
224ce89b 8302 }
c07f9fc5
FG
8303 if (!newcrush.name_exists(item)) {
8304 ss << "item '" << item << "' does not exist";
8305 err = -ENOENT;
224ce89b
WB
8306 goto reply;
8307 }
c07f9fc5
FG
8308 err = newcrush.choose_args_adjust_item_weightf(
8309 g_ceph_context,
8310 newcrush.choose_args_get(pool),
8311 newcrush.get_item_id(item),
8312 weight,
8313 &ss);
224ce89b 8314 if (err < 0) {
224ce89b
WB
8315 goto reply;
8316 }
c07f9fc5 8317 err = 0;
224ce89b
WB
8318 pending_inc.crush.clear();
8319 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
224ce89b 8320 goto update;
7c673cae
FG
8321 } else if (osdid_present &&
8322 (prefix == "osd crush set" || prefix == "osd crush add")) {
8323 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
8324 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
8325 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
8326
8327 if (!osdmap.exists(osdid)) {
8328 err = -ENOENT;
c07f9fc5 8329 ss << name << " does not exist. Create it before updating the crush map";
7c673cae
FG
8330 goto reply;
8331 }
8332
8333 double weight;
8334 if (!cmd_getval(g_ceph_context, cmdmap, "weight", weight)) {
8335 ss << "unable to parse weight value '"
8336 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
8337 err = -EINVAL;
8338 goto reply;
8339 }
8340
8341 string args;
8342 vector<string> argvec;
8343 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
8344 map<string,string> loc;
8345 CrushWrapper::parse_loc_map(argvec, &loc);
8346
8347 if (prefix == "osd crush set"
8348 && !_get_stable_crush().item_exists(osdid)) {
8349 err = -ENOENT;
8350 ss << "unable to set item id " << osdid << " name '" << name
8351 << "' weight " << weight << " at location " << loc
8352 << ": does not exist";
8353 goto reply;
8354 }
8355
8356 dout(5) << "adding/updating crush item id " << osdid << " name '"
8357 << name << "' weight " << weight << " at location "
8358 << loc << dendl;
8359 CrushWrapper newcrush;
8360 _get_pending_crush(newcrush);
8361
8362 string action;
8363 if (prefix == "osd crush set" ||
8364 newcrush.check_item_loc(g_ceph_context, osdid, loc, (int *)NULL)) {
8365 action = "set";
8366 err = newcrush.update_item(g_ceph_context, osdid, weight, name, loc);
8367 } else {
8368 action = "add";
8369 err = newcrush.insert_item(g_ceph_context, osdid, weight, name, loc);
8370 if (err == 0)
8371 err = 1;
8372 }
8373
8374 if (err < 0)
8375 goto reply;
8376
8377 if (err == 0 && !_have_pending_crush()) {
8378 ss << action << " item id " << osdid << " name '" << name << "' weight "
8379 << weight << " at location " << loc << ": no change";
8380 goto reply;
8381 }
8382
8383 pending_inc.crush.clear();
8384 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8385 ss << action << " item id " << osdid << " name '" << name << "' weight "
8386 << weight << " at location " << loc << " to crush map";
8387 getline(ss, rs);
8388 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8389 get_last_committed() + 1));
8390 return true;
8391
8392 } else if (prefix == "osd crush create-or-move") {
8393 do {
8394 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
8395 if (!osdmap.exists(osdid)) {
8396 err = -ENOENT;
8397 ss << name << " does not exist. create it before updating the crush map";
8398 goto reply;
8399 }
8400
8401 double weight;
8402 if (!cmd_getval(g_ceph_context, cmdmap, "weight", weight)) {
8403 ss << "unable to parse weight value '"
8404 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
8405 err = -EINVAL;
8406 goto reply;
8407 }
8408
8409 string args;
8410 vector<string> argvec;
8411 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
8412 map<string,string> loc;
8413 CrushWrapper::parse_loc_map(argvec, &loc);
8414
8415 dout(0) << "create-or-move crush item name '" << name << "' initial_weight " << weight
8416 << " at location " << loc << dendl;
8417
8418 CrushWrapper newcrush;
8419 _get_pending_crush(newcrush);
8420
8421 err = newcrush.create_or_move_item(g_ceph_context, osdid, weight, name, loc);
8422 if (err == 0) {
8423 ss << "create-or-move updated item name '" << name << "' weight " << weight
8424 << " at location " << loc << " to crush map";
8425 break;
8426 }
8427 if (err > 0) {
8428 pending_inc.crush.clear();
8429 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8430 ss << "create-or-move updating item name '" << name << "' weight " << weight
8431 << " at location " << loc << " to crush map";
8432 getline(ss, rs);
8433 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8434 get_last_committed() + 1));
8435 return true;
8436 }
8437 } while (false);
8438
8439 } else if (prefix == "osd crush move") {
8440 do {
8441 // osd crush move <name> <loc1> [<loc2> ...]
8442
8443 string args;
8444 vector<string> argvec;
8445 cmd_getval(g_ceph_context, cmdmap, "name", name);
8446 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
8447 map<string,string> loc;
8448 CrushWrapper::parse_loc_map(argvec, &loc);
8449
8450 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
8451 CrushWrapper newcrush;
8452 _get_pending_crush(newcrush);
8453
8454 if (!newcrush.name_exists(name)) {
8455 err = -ENOENT;
8456 ss << "item " << name << " does not exist";
8457 break;
8458 }
8459 int id = newcrush.get_item_id(name);
8460
8461 if (!newcrush.check_item_loc(g_ceph_context, id, loc, (int *)NULL)) {
8462 if (id >= 0) {
8463 err = newcrush.create_or_move_item(g_ceph_context, id, 0, name, loc);
8464 } else {
8465 err = newcrush.move_bucket(g_ceph_context, id, loc);
8466 }
8467 if (err >= 0) {
8468 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
8469 pending_inc.crush.clear();
8470 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8471 getline(ss, rs);
8472 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8473 get_last_committed() + 1));
8474 return true;
8475 }
8476 } else {
8477 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
8478 err = 0;
8479 }
8480 } while (false);
31f18b77
FG
8481 } else if (prefix == "osd crush swap-bucket") {
8482 string source, dest, force;
8483 cmd_getval(g_ceph_context, cmdmap, "source", source);
8484 cmd_getval(g_ceph_context, cmdmap, "dest", dest);
8485 cmd_getval(g_ceph_context, cmdmap, "force", force);
8486 CrushWrapper newcrush;
8487 _get_pending_crush(newcrush);
8488 if (!newcrush.name_exists(source)) {
8489 ss << "source item " << source << " does not exist";
8490 err = -ENOENT;
8491 goto reply;
8492 }
8493 if (!newcrush.name_exists(dest)) {
8494 ss << "dest item " << dest << " does not exist";
8495 err = -ENOENT;
8496 goto reply;
8497 }
8498 int sid = newcrush.get_item_id(source);
8499 int did = newcrush.get_item_id(dest);
8500 int sparent;
8501 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 &&
8502 force != "--yes-i-really-mean-it") {
8503 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
8504 err = -EPERM;
8505 goto reply;
8506 }
8507 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
8508 force != "--yes-i-really-mean-it") {
8509 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
8510 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
8511 << "; pass --yes-i-really-mean-it to proceed anyway";
8512 err = -EPERM;
8513 goto reply;
8514 }
8515 int r = newcrush.swap_bucket(g_ceph_context, sid, did);
8516 if (r < 0) {
8517 ss << "failed to swap bucket contents: " << cpp_strerror(r);
224ce89b 8518 err = r;
31f18b77
FG
8519 goto reply;
8520 }
8521 ss << "swapped bucket of " << source << " to " << dest;
8522 pending_inc.crush.clear();
8523 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8524 wait_for_finished_proposal(op,
8525 new Monitor::C_Command(mon, op, err, ss.str(),
8526 get_last_committed() + 1));
8527 return true;
8528 } else if (prefix == "osd crush link") {
8529 // osd crush link <name> <loc1> [<loc2> ...]
8530 string name;
8531 cmd_getval(g_ceph_context, cmdmap, "name", name);
8532 vector<string> argvec;
8533 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
8534 map<string,string> loc;
8535 CrushWrapper::parse_loc_map(argvec, &loc);
8536
8537 // Need an explicit check for name_exists because get_item_id returns
8538 // 0 on unfound.
8539 int id = osdmap.crush->get_item_id(name);
7c673cae
FG
8540 if (!osdmap.crush->name_exists(name)) {
8541 err = -ENOENT;
8542 ss << "item " << name << " does not exist";
8543 goto reply;
8544 } else {
8545 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
8546 }
8547 if (osdmap.crush->check_item_loc(g_ceph_context, id, loc, (int*) NULL)) {
8548 ss << "no need to move item id " << id << " name '" << name
8549 << "' to location " << loc << " in crush map";
8550 err = 0;
8551 goto reply;
8552 }
8553
8554 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
8555 CrushWrapper newcrush;
8556 _get_pending_crush(newcrush);
8557
8558 if (!newcrush.name_exists(name)) {
8559 err = -ENOENT;
8560 ss << "item " << name << " does not exist";
8561 goto reply;
8562 } else {
8563 int id = newcrush.get_item_id(name);
8564 if (!newcrush.check_item_loc(g_ceph_context, id, loc, (int *)NULL)) {
8565 err = newcrush.link_bucket(g_ceph_context, id, loc);
8566 if (err >= 0) {
8567 ss << "linked item id " << id << " name '" << name
8568 << "' to location " << loc << " in crush map";
8569 pending_inc.crush.clear();
8570 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8571 } else {
8572 ss << "cannot link item id " << id << " name '" << name
8573 << "' to location " << loc;
8574 goto reply;
8575 }
8576 } else {
8577 ss << "no need to move item id " << id << " name '" << name
8578 << "' to location " << loc << " in crush map";
8579 err = 0;
8580 }
8581 }
8582 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
8583 get_last_committed() + 1));
8584 return true;
8585 } else if (prefix == "osd crush rm" ||
8586 prefix == "osd crush remove" ||
8587 prefix == "osd crush unlink") {
8588 do {
8589 // osd crush rm <id> [ancestor]
8590 CrushWrapper newcrush;
8591 _get_pending_crush(newcrush);
8592
8593 string name;
8594 cmd_getval(g_ceph_context, cmdmap, "name", name);
8595
8596 if (!osdmap.crush->name_exists(name)) {
8597 err = 0;
8598 ss << "device '" << name << "' does not appear in the crush map";
8599 break;
8600 }
8601 if (!newcrush.name_exists(name)) {
8602 err = 0;
8603 ss << "device '" << name << "' does not appear in the crush map";
8604 getline(ss, rs);
8605 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8606 get_last_committed() + 1));
8607 return true;
8608 }
8609 int id = newcrush.get_item_id(name);
31f18b77
FG
8610 int ancestor = 0;
8611
7c673cae
FG
8612 bool unlink_only = prefix == "osd crush unlink";
8613 string ancestor_str;
8614 if (cmd_getval(g_ceph_context, cmdmap, "ancestor", ancestor_str)) {
8615 if (!newcrush.name_exists(ancestor_str)) {
8616 err = -ENOENT;
8617 ss << "ancestor item '" << ancestor_str
8618 << "' does not appear in the crush map";
8619 break;
8620 }
31f18b77 8621 ancestor = newcrush.get_item_id(ancestor_str);
7c673cae 8622 }
31f18b77
FG
8623
8624 err = prepare_command_osd_crush_remove(
8625 newcrush,
8626 id, ancestor,
8627 (ancestor < 0), unlink_only);
8628
7c673cae
FG
8629 if (err == -ENOENT) {
8630 ss << "item " << id << " does not appear in that position";
8631 err = 0;
8632 break;
8633 }
8634 if (err == 0) {
7c673cae
FG
8635 ss << "removed item id " << id << " name '" << name << "' from crush map";
8636 getline(ss, rs);
8637 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8638 get_last_committed() + 1));
8639 return true;
8640 }
8641 } while (false);
8642
8643 } else if (prefix == "osd crush reweight-all") {
7c673cae
FG
8644 CrushWrapper newcrush;
8645 _get_pending_crush(newcrush);
8646
8647 newcrush.reweight(g_ceph_context);
8648 pending_inc.crush.clear();
8649 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8650 ss << "reweighted crush hierarchy";
8651 getline(ss, rs);
8652 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8653 get_last_committed() + 1));
8654 return true;
8655 } else if (prefix == "osd crush reweight") {
8656 // osd crush reweight <name> <weight>
8657 CrushWrapper newcrush;
8658 _get_pending_crush(newcrush);
8659
8660 string name;
8661 cmd_getval(g_ceph_context, cmdmap, "name", name);
8662 if (!newcrush.name_exists(name)) {
8663 err = -ENOENT;
8664 ss << "device '" << name << "' does not appear in the crush map";
8665 goto reply;
8666 }
8667
8668 int id = newcrush.get_item_id(name);
8669 if (id < 0) {
8670 ss << "device '" << name << "' is not a leaf in the crush map";
8671 err = -EINVAL;
8672 goto reply;
8673 }
8674 double w;
8675 if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
8676 ss << "unable to parse weight value '"
8677 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
8678 err = -EINVAL;
8679 goto reply;
8680 }
8681
8682 err = newcrush.adjust_item_weightf(g_ceph_context, id, w);
8683 if (err < 0)
8684 goto reply;
8685 pending_inc.crush.clear();
8686 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8687 ss << "reweighted item id " << id << " name '" << name << "' to " << w
8688 << " in crush map";
8689 getline(ss, rs);
8690 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8691 get_last_committed() + 1));
8692 return true;
8693 } else if (prefix == "osd crush reweight-subtree") {
8694 // osd crush reweight <name> <weight>
8695 CrushWrapper newcrush;
8696 _get_pending_crush(newcrush);
8697
8698 string name;
8699 cmd_getval(g_ceph_context, cmdmap, "name", name);
8700 if (!newcrush.name_exists(name)) {
8701 err = -ENOENT;
8702 ss << "device '" << name << "' does not appear in the crush map";
8703 goto reply;
8704 }
8705
8706 int id = newcrush.get_item_id(name);
8707 if (id >= 0) {
8708 ss << "device '" << name << "' is not a subtree in the crush map";
8709 err = -EINVAL;
8710 goto reply;
8711 }
8712 double w;
8713 if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
8714 ss << "unable to parse weight value '"
8715 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
8716 err = -EINVAL;
8717 goto reply;
8718 }
8719
8720 err = newcrush.adjust_subtree_weightf(g_ceph_context, id, w);
8721 if (err < 0)
8722 goto reply;
8723 pending_inc.crush.clear();
8724 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8725 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
8726 << " in crush map";
8727 getline(ss, rs);
8728 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8729 get_last_committed() + 1));
8730 return true;
8731 } else if (prefix == "osd crush tunables") {
8732 CrushWrapper newcrush;
8733 _get_pending_crush(newcrush);
8734
8735 err = 0;
8736 string profile;
8737 cmd_getval(g_ceph_context, cmdmap, "profile", profile);
8738 if (profile == "legacy" || profile == "argonaut") {
8739 newcrush.set_tunables_legacy();
8740 } else if (profile == "bobtail") {
8741 newcrush.set_tunables_bobtail();
8742 } else if (profile == "firefly") {
8743 newcrush.set_tunables_firefly();
8744 } else if (profile == "hammer") {
8745 newcrush.set_tunables_hammer();
8746 } else if (profile == "jewel") {
8747 newcrush.set_tunables_jewel();
8748 } else if (profile == "optimal") {
8749 newcrush.set_tunables_optimal();
8750 } else if (profile == "default") {
8751 newcrush.set_tunables_default();
8752 } else {
8753 ss << "unrecognized profile '" << profile << "'";
8754 err = -EINVAL;
8755 goto reply;
8756 }
8757
8758 if (!validate_crush_against_features(&newcrush, ss)) {
8759 err = -EINVAL;
8760 goto reply;
8761 }
8762
8763 pending_inc.crush.clear();
8764 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8765 ss << "adjusted tunables profile to " << profile;
8766 getline(ss, rs);
8767 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8768 get_last_committed() + 1));
8769 return true;
8770 } else if (prefix == "osd crush set-tunable") {
8771 CrushWrapper newcrush;
8772 _get_pending_crush(newcrush);
8773
8774 err = 0;
8775 string tunable;
8776 cmd_getval(g_ceph_context, cmdmap, "tunable", tunable);
8777
8778 int64_t value = -1;
8779 if (!cmd_getval(g_ceph_context, cmdmap, "value", value)) {
8780 err = -EINVAL;
8781 ss << "failed to parse integer value " << cmd_vartype_stringify(cmdmap["value"]);
8782 goto reply;
8783 }
8784
8785 if (tunable == "straw_calc_version") {
224ce89b 8786 if (value != 0 && value != 1) {
7c673cae
FG
8787 ss << "value must be 0 or 1; got " << value;
8788 err = -EINVAL;
8789 goto reply;
8790 }
8791 newcrush.set_straw_calc_version(value);
8792 } else {
8793 ss << "unrecognized tunable '" << tunable << "'";
8794 err = -EINVAL;
8795 goto reply;
8796 }
8797
8798 if (!validate_crush_against_features(&newcrush, ss)) {
8799 err = -EINVAL;
8800 goto reply;
8801 }
8802
8803 pending_inc.crush.clear();
8804 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8805 ss << "adjusted tunable " << tunable << " to " << value;
8806 getline(ss, rs);
8807 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8808 get_last_committed() + 1));
8809 return true;
8810
8811 } else if (prefix == "osd crush rule create-simple") {
8812 string name, root, type, mode;
8813 cmd_getval(g_ceph_context, cmdmap, "name", name);
8814 cmd_getval(g_ceph_context, cmdmap, "root", root);
8815 cmd_getval(g_ceph_context, cmdmap, "type", type);
8816 cmd_getval(g_ceph_context, cmdmap, "mode", mode);
8817 if (mode == "")
8818 mode = "firstn";
8819
8820 if (osdmap.crush->rule_exists(name)) {
31f18b77
FG
8821 // The name is uniquely associated to a ruleid and the rule it contains
8822 // From the user point of view, the rule is more meaningfull.
8823 ss << "rule " << name << " already exists";
7c673cae
FG
8824 err = 0;
8825 goto reply;
8826 }
8827
8828 CrushWrapper newcrush;
8829 _get_pending_crush(newcrush);
8830
8831 if (newcrush.rule_exists(name)) {
31f18b77
FG
8832 // The name is uniquely associated to a ruleid and the rule it contains
8833 // From the user point of view, the rule is more meaningfull.
8834 ss << "rule " << name << " already exists";
7c673cae
FG
8835 err = 0;
8836 } else {
224ce89b 8837 int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
7c673cae
FG
8838 pg_pool_t::TYPE_REPLICATED, &ss);
8839 if (ruleno < 0) {
8840 err = ruleno;
8841 goto reply;
8842 }
8843
8844 pending_inc.crush.clear();
8845 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8846 }
8847 getline(ss, rs);
8848 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8849 get_last_committed() + 1));
8850 return true;
8851
224ce89b
WB
8852 } else if (prefix == "osd crush rule create-replicated") {
8853 string name, root, type, device_class;
8854 cmd_getval(g_ceph_context, cmdmap, "name", name);
8855 cmd_getval(g_ceph_context, cmdmap, "root", root);
8856 cmd_getval(g_ceph_context, cmdmap, "type", type);
8857 cmd_getval(g_ceph_context, cmdmap, "class", device_class);
8858
8859 if (!device_class.empty()) {
8860 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
8861 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
8862 << "luminous' before using crush device classes";
8863 err = -EPERM;
8864 goto reply;
8865 }
8866 }
8867
8868 if (osdmap.crush->rule_exists(name)) {
8869 // The name is uniquely associated to a ruleid and the rule it contains
8870 // From the user point of view, the rule is more meaningfull.
8871 ss << "rule " << name << " already exists";
8872 err = 0;
8873 goto reply;
8874 }
8875
8876 CrushWrapper newcrush;
8877 _get_pending_crush(newcrush);
8878
8879 if (newcrush.rule_exists(name)) {
8880 // The name is uniquely associated to a ruleid and the rule it contains
8881 // From the user point of view, the rule is more meaningfull.
8882 ss << "rule " << name << " already exists";
8883 err = 0;
8884 } else {
8885 int ruleno = newcrush.add_simple_rule(
8886 name, root, type, device_class,
8887 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
8888 if (ruleno < 0) {
8889 err = ruleno;
8890 goto reply;
8891 }
8892
8893 pending_inc.crush.clear();
8894 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8895 }
8896 getline(ss, rs);
8897 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8898 get_last_committed() + 1));
8899 return true;
8900
7c673cae
FG
8901 } else if (prefix == "osd erasure-code-profile rm") {
8902 string name;
8903 cmd_getval(g_ceph_context, cmdmap, "name", name);
8904
8905 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
8906 goto wait;
8907
8908 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
8909 err = -EBUSY;
8910 goto reply;
8911 }
8912
8913 if (osdmap.has_erasure_code_profile(name) ||
8914 pending_inc.new_erasure_code_profiles.count(name)) {
8915 if (osdmap.has_erasure_code_profile(name)) {
8916 pending_inc.old_erasure_code_profiles.push_back(name);
8917 } else {
8918 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
8919 pending_inc.new_erasure_code_profiles.erase(name);
8920 }
8921
8922 getline(ss, rs);
8923 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8924 get_last_committed() + 1));
8925 return true;
8926 } else {
8927 ss << "erasure-code-profile " << name << " does not exist";
8928 err = 0;
8929 goto reply;
8930 }
8931
8932 } else if (prefix == "osd erasure-code-profile set") {
8933 string name;
8934 cmd_getval(g_ceph_context, cmdmap, "name", name);
8935 vector<string> profile;
8936 cmd_getval(g_ceph_context, cmdmap, "profile", profile);
8937 bool force;
8938 if (profile.size() > 0 && profile.back() == "--force") {
8939 profile.pop_back();
8940 force = true;
8941 } else {
8942 force = false;
8943 }
8944 map<string,string> profile_map;
8945 err = parse_erasure_code_profile(profile, &profile_map, &ss);
8946 if (err)
8947 goto reply;
8948 if (profile_map.find("plugin") == profile_map.end()) {
8949 ss << "erasure-code-profile " << profile_map
8950 << " must contain a plugin entry" << std::endl;
8951 err = -EINVAL;
8952 goto reply;
8953 }
8954 string plugin = profile_map["plugin"];
8955
8956 if (pending_inc.has_erasure_code_profile(name)) {
8957 dout(20) << "erasure code profile " << name << " try again" << dendl;
8958 goto wait;
8959 } else {
8960 if (plugin == "isa" || plugin == "lrc") {
8961 err = check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2, ss);
8962 if (err == -EAGAIN)
8963 goto wait;
8964 if (err)
8965 goto reply;
8966 } else if (plugin == "shec") {
8967 err = check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3, ss);
8968 if (err == -EAGAIN)
8969 goto wait;
8970 if (err)
8971 goto reply;
8972 }
8973 err = normalize_profile(name, profile_map, force, &ss);
8974 if (err)
8975 goto reply;
8976
8977 if (osdmap.has_erasure_code_profile(name)) {
8978 ErasureCodeProfile existing_profile_map =
8979 osdmap.get_erasure_code_profile(name);
8980 err = normalize_profile(name, existing_profile_map, force, &ss);
8981 if (err)
8982 goto reply;
8983
8984 if (existing_profile_map == profile_map) {
8985 err = 0;
8986 goto reply;
8987 }
8988 if (!force) {
8989 err = -EPERM;
8990 ss << "will not override erasure code profile " << name
8991 << " because the existing profile "
8992 << existing_profile_map
8993 << " is different from the proposed profile "
8994 << profile_map;
8995 goto reply;
8996 }
8997 }
8998
8999 dout(20) << "erasure code profile set " << name << "="
9000 << profile_map << dendl;
9001 pending_inc.set_erasure_code_profile(name, profile_map);
9002 }
9003
9004 getline(ss, rs);
9005 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9006 get_last_committed() + 1));
9007 return true;
9008
9009 } else if (prefix == "osd crush rule create-erasure") {
9010 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
9011 if (err == -EAGAIN)
9012 goto wait;
9013 if (err)
9014 goto reply;
9015 string name, poolstr;
9016 cmd_getval(g_ceph_context, cmdmap, "name", name);
9017 string profile;
9018 cmd_getval(g_ceph_context, cmdmap, "profile", profile);
9019 if (profile == "")
9020 profile = "default";
9021 if (profile == "default") {
9022 if (!osdmap.has_erasure_code_profile(profile)) {
9023 if (pending_inc.has_erasure_code_profile(profile)) {
9024 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
9025 goto wait;
9026 }
9027
9028 map<string,string> profile_map;
9029 err = osdmap.get_erasure_code_profile_default(g_ceph_context,
9030 profile_map,
9031 &ss);
9032 if (err)
9033 goto reply;
9034 err = normalize_profile(name, profile_map, true, &ss);
9035 if (err)
9036 goto reply;
9037 dout(20) << "erasure code profile set " << profile << "="
9038 << profile_map << dendl;
9039 pending_inc.set_erasure_code_profile(profile, profile_map);
9040 goto wait;
9041 }
9042 }
9043
31f18b77
FG
9044 int rule;
9045 err = crush_rule_create_erasure(name, profile, &rule, &ss);
7c673cae
FG
9046 if (err < 0) {
9047 switch(err) {
9048 case -EEXIST: // return immediately
9049 ss << "rule " << name << " already exists";
9050 err = 0;
9051 goto reply;
9052 break;
9053 case -EALREADY: // wait for pending to be proposed
9054 ss << "rule " << name << " already exists";
9055 err = 0;
9056 break;
9057 default: // non recoverable error
9058 goto reply;
9059 break;
9060 }
9061 } else {
31f18b77 9062 ss << "created rule " << name << " at " << rule;
7c673cae
FG
9063 }
9064
9065 getline(ss, rs);
9066 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9067 get_last_committed() + 1));
9068 return true;
9069
9070 } else if (prefix == "osd crush rule rm") {
9071 string name;
9072 cmd_getval(g_ceph_context, cmdmap, "name", name);
9073
9074 if (!osdmap.crush->rule_exists(name)) {
9075 ss << "rule " << name << " does not exist";
9076 err = 0;
9077 goto reply;
9078 }
9079
9080 CrushWrapper newcrush;
9081 _get_pending_crush(newcrush);
9082
9083 if (!newcrush.rule_exists(name)) {
9084 ss << "rule " << name << " does not exist";
9085 err = 0;
9086 } else {
9087 int ruleno = newcrush.get_rule_id(name);
9088 assert(ruleno >= 0);
9089
9090 // make sure it is not in use.
9091 // FIXME: this is ok in some situations, but let's not bother with that
9092 // complexity now.
9093 int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
3efd9988 9094 if (osdmap.crush_rule_in_use(ruleset)) {
7c673cae
FG
9095 ss << "crush ruleset " << name << " " << ruleset << " is in use";
9096 err = -EBUSY;
9097 goto reply;
9098 }
9099
9100 err = newcrush.remove_rule(ruleno);
9101 if (err < 0) {
9102 goto reply;
9103 }
9104
9105 pending_inc.crush.clear();
9106 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9107 }
9108 getline(ss, rs);
9109 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9110 get_last_committed() + 1));
9111 return true;
9112
b5b8bbf5
FG
9113 } else if (prefix == "osd crush rule rename") {
9114 string srcname;
9115 string dstname;
9116 cmd_getval(g_ceph_context, cmdmap, "srcname", srcname);
9117 cmd_getval(g_ceph_context, cmdmap, "dstname", dstname);
9118 if (srcname.empty() || dstname.empty()) {
9119 ss << "must specify both source rule name and destination rule name";
9120 err = -EINVAL;
9121 goto reply;
9122 }
9123 if (srcname == dstname) {
9124 ss << "destination rule name is equal to source rule name";
9125 err = 0;
9126 goto reply;
9127 }
9128
9129 CrushWrapper newcrush;
9130 _get_pending_crush(newcrush);
181888fb
FG
9131 if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
9132 // srcname does not exist and dstname already exists
9133 // suppose this is a replay and return success
9134 // (so this command is idempotent)
9135 ss << "already renamed to '" << dstname << "'";
9136 err = 0;
9137 goto reply;
9138 }
9139
b5b8bbf5
FG
9140 err = newcrush.rename_rule(srcname, dstname, &ss);
9141 if (err < 0) {
9142 // ss has reason for failure
9143 goto reply;
9144 }
9145 pending_inc.crush.clear();
9146 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9147 getline(ss, rs);
9148 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9149 get_last_committed() + 1));
9150 return true;
9151
7c673cae
FG
9152 } else if (prefix == "osd setmaxosd") {
9153 int64_t newmax;
9154 if (!cmd_getval(g_ceph_context, cmdmap, "newmax", newmax)) {
9155 ss << "unable to parse 'newmax' value '"
9156 << cmd_vartype_stringify(cmdmap["newmax"]) << "'";
9157 err = -EINVAL;
9158 goto reply;
9159 }
9160
9161 if (newmax > g_conf->mon_max_osd) {
9162 err = -ERANGE;
9163 ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
9164 << g_conf->mon_max_osd << ")";
9165 goto reply;
9166 }
9167
9168 // Don't allow shrinking OSD number as this will cause data loss
9169 // and may cause kernel crashes.
9170 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
9171 if (newmax < osdmap.get_max_osd()) {
9172 // Check if the OSDs exist between current max and new value.
9173 // If there are any OSDs exist, then don't allow shrinking number
9174 // of OSDs.
9175 for (int i = newmax; i < osdmap.get_max_osd(); i++) {
9176 if (osdmap.exists(i)) {
9177 err = -EBUSY;
9178 ss << "cannot shrink max_osd to " << newmax
9179 << " because osd." << i << " (and possibly others) still in use";
9180 goto reply;
9181 }
9182 }
9183 }
9184
9185 pending_inc.new_max_osd = newmax;
9186 ss << "set new max_osd = " << pending_inc.new_max_osd;
9187 getline(ss, rs);
9188 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9189 get_last_committed() + 1));
9190 return true;
9191
9192 } else if (prefix == "osd set-full-ratio" ||
9193 prefix == "osd set-backfillfull-ratio" ||
9194 prefix == "osd set-nearfull-ratio") {
31f18b77 9195 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
224ce89b
WB
9196 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
9197 << "luminous' before using the new interface";
7c673cae
FG
9198 err = -EPERM;
9199 goto reply;
9200 }
9201 double n;
9202 if (!cmd_getval(g_ceph_context, cmdmap, "ratio", n)) {
9203 ss << "unable to parse 'ratio' value '"
224ce89b 9204 << cmd_vartype_stringify(cmdmap["ratio"]) << "'";
7c673cae
FG
9205 err = -EINVAL;
9206 goto reply;
9207 }
9208 if (prefix == "osd set-full-ratio")
9209 pending_inc.new_full_ratio = n;
9210 else if (prefix == "osd set-backfillfull-ratio")
9211 pending_inc.new_backfillfull_ratio = n;
9212 else if (prefix == "osd set-nearfull-ratio")
9213 pending_inc.new_nearfull_ratio = n;
9214 ss << prefix << " " << n;
9215 getline(ss, rs);
9216 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9217 get_last_committed() + 1));
9218 return true;
9219 } else if (prefix == "osd set-require-min-compat-client") {
31f18b77 9220 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
224ce89b
WB
9221 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
9222 << "luminous' before using the new interface";
7c673cae
FG
9223 err = -EPERM;
9224 goto reply;
9225 }
9226 string v;
9227 cmd_getval(g_ceph_context, cmdmap, "version", v);
31f18b77
FG
9228 int vno = ceph_release_from_name(v.c_str());
9229 if (vno <= 0) {
7c673cae
FG
9230 ss << "version " << v << " is not recognized";
9231 err = -EINVAL;
9232 goto reply;
9233 }
9234 OSDMap newmap;
9235 newmap.deepish_copy_from(osdmap);
9236 newmap.apply_incremental(pending_inc);
31f18b77
FG
9237 newmap.require_min_compat_client = vno;
9238 auto mvno = newmap.get_min_compat_client();
9239 if (vno < mvno) {
9240 ss << "osdmap current utilizes features that require "
9241 << ceph_release_name(mvno)
9242 << "; cannot set require_min_compat_client below that to "
9243 << ceph_release_name(vno);
7c673cae
FG
9244 err = -EPERM;
9245 goto reply;
9246 }
31f18b77
FG
9247 string sure;
9248 cmd_getval(g_ceph_context, cmdmap, "sure", sure);
9249 if (sure != "--yes-i-really-mean-it") {
9250 FeatureMap m;
9251 mon->get_combined_feature_map(&m);
9252 uint64_t features = ceph_release_features(vno);
9253 bool first = true;
9254 bool ok = true;
9255 for (int type : {
9256 CEPH_ENTITY_TYPE_CLIENT,
9257 CEPH_ENTITY_TYPE_MDS,
9258 CEPH_ENTITY_TYPE_MGR }) {
9259 auto p = m.m.find(type);
9260 if (p == m.m.end()) {
9261 continue;
9262 }
9263 for (auto& q : p->second) {
9264 uint64_t missing = ~q.first & features;
9265 if (missing) {
9266 if (first) {
9267 ss << "cannot set require_min_compat_client to " << v << ": ";
9268 } else {
9269 ss << "; ";
9270 }
9271 first = false;
9272 ss << q.second << " connected " << ceph_entity_type_name(type)
9273 << "(s) look like " << ceph_release_name(
9274 ceph_release_from_features(q.first))
9275 << " (missing 0x" << std::hex << missing << std::dec << ")";
9276 ok = false;
9277 }
9278 }
9279 }
9280 if (!ok) {
9281 ss << "; add --yes-i-really-mean-it to do it anyway";
9282 err = -EPERM;
9283 goto reply;
9284 }
9285 }
9286 ss << "set require_min_compat_client to " << ceph_release_name(vno);
9287 pending_inc.new_require_min_compat_client = vno;
7c673cae
FG
9288 getline(ss, rs);
9289 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9290 get_last_committed() + 1));
9291 return true;
9292 } else if (prefix == "osd pause") {
9293 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
9294
9295 } else if (prefix == "osd unpause") {
9296 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
9297
9298 } else if (prefix == "osd set") {
3efd9988
FG
9299 string sure;
9300 cmd_getval(g_ceph_context, cmdmap, "sure", sure);
7c673cae
FG
9301 string key;
9302 cmd_getval(g_ceph_context, cmdmap, "key", key);
9303 if (key == "full")
9304 return prepare_set_flag(op, CEPH_OSDMAP_FULL);
9305 else if (key == "pause")
9306 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
9307 else if (key == "noup")
9308 return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
9309 else if (key == "nodown")
9310 return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
9311 else if (key == "noout")
9312 return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
9313 else if (key == "noin")
9314 return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
9315 else if (key == "nobackfill")
9316 return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
9317 else if (key == "norebalance")
9318 return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
9319 else if (key == "norecover")
9320 return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
9321 else if (key == "noscrub")
9322 return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
9323 else if (key == "nodeep-scrub")
9324 return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
9325 else if (key == "notieragent")
9326 return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
9327 else if (key == "sortbitwise") {
3efd9988
FG
9328 if (!osdmap.get_num_up_osds() && sure != "--yes-i-really-mean-it") {
9329 ss << "Not advisable to continue since no OSDs are up. Pass "
9330 << "--yes-i-really-mean-it if you really wish to continue.";
9331 err = -EPERM;
9332 goto reply;
9333 }
9334 if ((osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)
9335 || sure == "--yes-i-really-mean-it") {
7c673cae
FG
9336 return prepare_set_flag(op, CEPH_OSDMAP_SORTBITWISE);
9337 } else {
9338 ss << "not all up OSDs have OSD_BITWISE_HOBJ_SORT feature";
9339 err = -EPERM;
31f18b77 9340 goto reply;
7c673cae 9341 }
c07f9fc5 9342 } else if (key == "recovery_deletes") {
3efd9988
FG
9343 if (!osdmap.get_num_up_osds() && sure != "--yes-i-really-mean-it") {
9344 ss << "Not advisable to continue since no OSDs are up. Pass "
9345 << "--yes-i-really-mean-it if you really wish to continue.";
9346 err = -EPERM;
9347 goto reply;
9348 }
9349 if (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_RECOVERY_DELETES)
9350 || sure == "--yes-i-really-mean-it") {
c07f9fc5
FG
9351 return prepare_set_flag(op, CEPH_OSDMAP_RECOVERY_DELETES);
9352 } else {
9353 ss << "not all up OSDs have OSD_RECOVERY_DELETES feature";
9354 err = -EPERM;
9355 goto reply;
9356 }
7c673cae 9357 } else if (key == "require_jewel_osds") {
3efd9988
FG
9358 if (!osdmap.get_num_up_osds() && sure != "--yes-i-really-mean-it") {
9359 ss << "Not advisable to continue since no OSDs are up. Pass "
9360 << "--yes-i-really-mean-it if you really wish to continue.";
9361 err = -EPERM;
9362 goto reply;
9363 }
7c673cae
FG
9364 if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
9365 ss << "the sortbitwise flag must be set before require_jewel_osds";
9366 err = -EPERM;
31f18b77
FG
9367 goto reply;
9368 } else if (osdmap.require_osd_release >= CEPH_RELEASE_JEWEL) {
9369 ss << "require_osd_release is already >= jewel";
9370 err = 0;
9371 goto reply;
3efd9988
FG
9372 } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_JEWEL)
9373 || sure == "--yes-i-really-mean-it") {
7c673cae
FG
9374 return prepare_set_flag(op, CEPH_OSDMAP_REQUIRE_JEWEL);
9375 } else {
9376 ss << "not all up OSDs have CEPH_FEATURE_SERVER_JEWEL feature";
9377 err = -EPERM;
9378 }
9379 } else if (key == "require_kraken_osds") {
3efd9988
FG
9380 if (!osdmap.get_num_up_osds() && sure != "--yes-i-really-mean-it") {
9381 ss << "Not advisable to continue since no OSDs are up. Pass "
9382 << "--yes-i-really-mean-it if you really wish to continue.";
9383 err = -EPERM;
9384 goto reply;
9385 }
7c673cae
FG
9386 if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
9387 ss << "the sortbitwise flag must be set before require_kraken_osds";
9388 err = -EPERM;
31f18b77
FG
9389 goto reply;
9390 } else if (osdmap.require_osd_release >= CEPH_RELEASE_KRAKEN) {
9391 ss << "require_osd_release is already >= kraken";
9392 err = 0;
9393 goto reply;
3efd9988
FG
9394 } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_KRAKEN)
9395 || sure == "--yes-i-really-mean-it") {
7c673cae
FG
9396 bool r = prepare_set_flag(op, CEPH_OSDMAP_REQUIRE_KRAKEN);
9397 // ensure JEWEL is also set
9398 pending_inc.new_flags |= CEPH_OSDMAP_REQUIRE_JEWEL;
9399 return r;
9400 } else {
9401 ss << "not all up OSDs have CEPH_FEATURE_SERVER_KRAKEN feature";
9402 err = -EPERM;
9403 }
7c673cae
FG
9404 } else {
9405 ss << "unrecognized flag '" << key << "'";
9406 err = -EINVAL;
9407 }
9408
9409 } else if (prefix == "osd unset") {
9410 string key;
9411 cmd_getval(g_ceph_context, cmdmap, "key", key);
9412 if (key == "full")
9413 return prepare_unset_flag(op, CEPH_OSDMAP_FULL);
9414 else if (key == "pause")
9415 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
9416 else if (key == "noup")
9417 return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
9418 else if (key == "nodown")
9419 return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
9420 else if (key == "noout")
9421 return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
9422 else if (key == "noin")
9423 return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
9424 else if (key == "nobackfill")
9425 return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
9426 else if (key == "norebalance")
9427 return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
9428 else if (key == "norecover")
9429 return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
9430 else if (key == "noscrub")
9431 return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
9432 else if (key == "nodeep-scrub")
9433 return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
9434 else if (key == "notieragent")
9435 return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
224ce89b 9436 else {
7c673cae
FG
9437 ss << "unrecognized flag '" << key << "'";
9438 err = -EINVAL;
9439 }
9440
31f18b77
FG
9441 } else if (prefix == "osd require-osd-release") {
9442 string release;
9443 cmd_getval(g_ceph_context, cmdmap, "release", release);
3efd9988
FG
9444 string sure;
9445 cmd_getval(g_ceph_context, cmdmap, "sure", sure);
31f18b77
FG
9446 if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
9447 ss << "the sortbitwise flag must be set first";
9448 err = -EPERM;
9449 goto reply;
9450 }
9451 int rel = ceph_release_from_name(release.c_str());
9452 if (rel <= 0) {
9453 ss << "unrecognized release " << release;
9454 err = -EINVAL;
9455 goto reply;
9456 }
9457 if (rel < CEPH_RELEASE_LUMINOUS) {
9458 ss << "use this command only for luminous and later";
9459 err = -EINVAL;
9460 goto reply;
9461 }
d2e6a577
FG
9462 if (rel == osdmap.require_osd_release) {
9463 // idempotent
9464 err = 0;
9465 goto reply;
9466 }
31f18b77
FG
9467 if (rel == CEPH_RELEASE_LUMINOUS) {
9468 if (!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_LUMINOUS)) {
9469 ss << "not all up OSDs have CEPH_FEATURE_SERVER_LUMINOUS feature";
9470 err = -EPERM;
9471 goto reply;
9472 }
9473 } else {
9474 ss << "not supported for this release yet";
9475 err = -EPERM;
9476 goto reply;
9477 }
9478 if (rel < osdmap.require_osd_release) {
9479 ss << "require_osd_release cannot be lowered once it has been set";
9480 err = -EPERM;
9481 goto reply;
9482 }
9483 pending_inc.new_require_osd_release = rel;
c07f9fc5
FG
9484 if (rel >= CEPH_RELEASE_LUMINOUS &&
9485 !osdmap.test_flag(CEPH_OSDMAP_RECOVERY_DELETES)) {
9486 return prepare_set_flag(op, CEPH_OSDMAP_RECOVERY_DELETES);
9487 }
31f18b77 9488 goto update;
7c673cae
FG
9489 } else if (prefix == "osd cluster_snap") {
9490 // ** DISABLE THIS FOR NOW **
9491 ss << "cluster snapshot currently disabled (broken implementation)";
9492 // ** DISABLE THIS FOR NOW **
9493
9494 } else if (prefix == "osd down" ||
9495 prefix == "osd out" ||
9496 prefix == "osd in" ||
9497 prefix == "osd rm") {
9498
9499 bool any = false;
31f18b77
FG
9500 bool stop = false;
9501 bool verbose = true;
7c673cae
FG
9502
9503 vector<string> idvec;
9504 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
31f18b77
FG
9505 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9506 set<int> osds;
9507
9508 // wildcard?
9509 if (j == 0 &&
9510 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9511 if (prefix == "osd in") {
9512 // touch out osds only
9513 osdmap.get_out_osds(osds);
9514 } else {
9515 osdmap.get_all_osds(osds);
9516 }
9517 stop = true;
9518 verbose = false; // so the output is less noisy.
9519 } else {
9520 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9521 if (osd < 0) {
9522 ss << "invalid osd id" << osd;
9523 err = -EINVAL;
9524 continue;
9525 } else if (!osdmap.exists(osd)) {
9526 ss << "osd." << osd << " does not exist. ";
9527 continue;
9528 }
9529
9530 osds.insert(osd);
7c673cae 9531 }
31f18b77
FG
9532
9533 for (auto &osd : osds) {
9534 if (prefix == "osd down") {
9535 if (osdmap.is_down(osd)) {
9536 if (verbose)
9537 ss << "osd." << osd << " is already down. ";
9538 } else {
9539 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
9540 ss << "marked down osd." << osd << ". ";
9541 any = true;
9542 }
9543 } else if (prefix == "osd out") {
9544 if (osdmap.is_out(osd)) {
9545 if (verbose)
9546 ss << "osd." << osd << " is already out. ";
9547 } else {
9548 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
9549 if (osdmap.osd_weight[osd]) {
9550 if (pending_inc.new_xinfo.count(osd) == 0) {
9551 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
9552 }
9553 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
7c673cae 9554 }
31f18b77 9555 ss << "marked out osd." << osd << ". ";
224ce89b
WB
9556 std::ostringstream msg;
9557 msg << "Client " << op->get_session()->entity_name
9558 << " marked osd." << osd << " out";
9559 if (osdmap.is_up(osd)) {
9560 msg << ", while it was still marked up";
9561 } else {
3efd9988
FG
9562 auto period = ceph_clock_now() - down_pending_out[osd];
9563 msg << ", after it was down for " << int(period.sec())
224ce89b
WB
9564 << " seconds";
9565 }
9566
9567 mon->clog->info() << msg.str();
31f18b77 9568 any = true;
7c673cae 9569 }
31f18b77
FG
9570 } else if (prefix == "osd in") {
9571 if (osdmap.is_in(osd)) {
9572 if (verbose)
9573 ss << "osd." << osd << " is already in. ";
9574 } else {
9575 if (osdmap.osd_xinfo[osd].old_weight > 0) {
9576 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
9577 if (pending_inc.new_xinfo.count(osd) == 0) {
9578 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
9579 }
9580 pending_inc.new_xinfo[osd].old_weight = 0;
9581 } else {
9582 pending_inc.new_weight[osd] = CEPH_OSD_IN;
7c673cae 9583 }
31f18b77
FG
9584 ss << "marked in osd." << osd << ". ";
9585 any = true;
9586 }
9587 } else if (prefix == "osd rm") {
9588 err = prepare_command_osd_remove(osd);
9589
9590 if (err == -EBUSY) {
9591 if (any)
9592 ss << ", ";
9593 ss << "osd." << osd << " is still up; must be down before removal. ";
7c673cae 9594 } else {
31f18b77
FG
9595 assert(err == 0);
9596 if (any) {
9597 ss << ", osd." << osd;
9598 } else {
9599 ss << "removed osd." << osd;
9600 }
9601 any = true;
7c673cae 9602 }
31f18b77
FG
9603 }
9604 }
9605 }
9606 if (any) {
9607 getline(ss, rs);
9608 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
9609 get_last_committed() + 1));
9610 return true;
9611 }
9612 } else if (prefix == "osd add-noup" ||
9613 prefix == "osd add-nodown" ||
9614 prefix == "osd add-noin" ||
9615 prefix == "osd add-noout") {
9616
9617 enum {
9618 OP_NOUP,
9619 OP_NODOWN,
9620 OP_NOIN,
9621 OP_NOOUT,
9622 } option;
9623
9624 if (prefix == "osd add-noup") {
9625 option = OP_NOUP;
9626 } else if (prefix == "osd add-nodown") {
9627 option = OP_NODOWN;
9628 } else if (prefix == "osd add-noin") {
9629 option = OP_NOIN;
9630 } else {
9631 option = OP_NOOUT;
9632 }
9633
9634 bool any = false;
9635 bool stop = false;
9636
9637 vector<string> idvec;
9638 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
9639 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9640
9641 set<int> osds;
9642
9643 // wildcard?
9644 if (j == 0 &&
9645 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9646 osdmap.get_all_osds(osds);
9647 stop = true;
9648 } else {
9649 // try traditional single osd way
9650
9651 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9652 if (osd < 0) {
9653 // ss has reason for failure
9654 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9655 err = -EINVAL;
9656 continue;
9657 }
9658
9659 osds.insert(osd);
9660 }
9661
9662 for (auto &osd : osds) {
9663
9664 if (!osdmap.exists(osd)) {
9665 ss << "osd." << osd << " does not exist. ";
9666 continue;
9667 }
9668
9669 switch (option) {
9670 case OP_NOUP:
9671 if (osdmap.is_up(osd)) {
9672 ss << "osd." << osd << " is already up. ";
9673 continue;
9674 }
9675
9676 if (osdmap.is_noup(osd)) {
9677 if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP))
9678 any = true;
7c673cae 9679 } else {
31f18b77
FG
9680 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
9681 any = true;
7c673cae 9682 }
31f18b77
FG
9683
9684 break;
9685
9686 case OP_NODOWN:
9687 if (osdmap.is_down(osd)) {
9688 ss << "osd." << osd << " is already down. ";
9689 continue;
9690 }
9691
9692 if (osdmap.is_nodown(osd)) {
9693 if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN))
9694 any = true;
9695 } else {
9696 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
9697 any = true;
9698 }
9699
9700 break;
9701
9702 case OP_NOIN:
9703 if (osdmap.is_in(osd)) {
9704 ss << "osd." << osd << " is already in. ";
9705 continue;
9706 }
9707
9708 if (osdmap.is_noin(osd)) {
9709 if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN))
9710 any = true;
9711 } else {
9712 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
9713 any = true;
9714 }
9715
9716 break;
9717
9718 case OP_NOOUT:
9719 if (osdmap.is_out(osd)) {
9720 ss << "osd." << osd << " is already out. ";
9721 continue;
9722 }
9723
9724 if (osdmap.is_noout(osd)) {
9725 if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT))
9726 any = true;
9727 } else {
9728 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
9729 any = true;
9730 }
9731
9732 break;
9733
9734 default:
9735 assert(0 == "invalid option");
9736 }
7c673cae
FG
9737 }
9738 }
31f18b77 9739
7c673cae
FG
9740 if (any) {
9741 getline(ss, rs);
9742 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
31f18b77
FG
9743 get_last_committed() + 1));
9744 return true;
9745 }
9746 } else if (prefix == "osd rm-noup" ||
9747 prefix == "osd rm-nodown" ||
9748 prefix == "osd rm-noin" ||
9749 prefix == "osd rm-noout") {
9750
9751 enum {
9752 OP_NOUP,
9753 OP_NODOWN,
9754 OP_NOIN,
9755 OP_NOOUT,
9756 } option;
9757
9758 if (prefix == "osd rm-noup") {
9759 option = OP_NOUP;
9760 } else if (prefix == "osd rm-nodown") {
9761 option = OP_NODOWN;
9762 } else if (prefix == "osd rm-noin") {
9763 option = OP_NOIN;
9764 } else {
9765 option = OP_NOOUT;
9766 }
9767
9768 bool any = false;
9769 bool stop = false;
9770
9771 vector<string> idvec;
9772 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
9773
9774 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9775
9776 vector<int> osds;
9777
9778 // wildcard?
9779 if (j == 0 &&
9780 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9781
9782 // touch previous noup/nodown/noin/noout osds only
9783 switch (option) {
9784 case OP_NOUP:
9785 osdmap.get_noup_osds(&osds);
9786 break;
9787 case OP_NODOWN:
9788 osdmap.get_nodown_osds(&osds);
9789 break;
9790 case OP_NOIN:
9791 osdmap.get_noin_osds(&osds);
9792 break;
9793 case OP_NOOUT:
9794 osdmap.get_noout_osds(&osds);
9795 break;
9796 default:
9797 assert(0 == "invalid option");
9798 }
9799
9800 // cancel any pending noup/nodown/noin/noout requests too
9801 vector<int> pending_state_osds;
9802 (void) pending_inc.get_pending_state_osds(&pending_state_osds);
9803 for (auto &p : pending_state_osds) {
9804
9805 switch (option) {
9806 case OP_NOUP:
9807 if (!osdmap.is_noup(p) &&
9808 pending_inc.pending_osd_state_clear(p, CEPH_OSD_NOUP)) {
9809 any = true;
9810 }
9811 break;
9812
9813 case OP_NODOWN:
9814 if (!osdmap.is_nodown(p) &&
9815 pending_inc.pending_osd_state_clear(p, CEPH_OSD_NODOWN)) {
9816 any = true;
9817 }
9818 break;
9819
9820 case OP_NOIN:
9821 if (!osdmap.is_noin(p) &&
9822 pending_inc.pending_osd_state_clear(p, CEPH_OSD_NOIN)) {
9823 any = true;
9824 }
9825 break;
9826
9827 case OP_NOOUT:
9828 if (!osdmap.is_noout(p) &&
9829 pending_inc.pending_osd_state_clear(p, CEPH_OSD_NOOUT)) {
9830 any = true;
9831 }
9832 break;
9833
9834 default:
9835 assert(0 == "invalid option");
9836 }
9837 }
9838
9839 stop = true;
9840 } else {
9841 // try traditional single osd way
9842
9843 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9844 if (osd < 0) {
9845 // ss has reason for failure
9846 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9847 err = -EINVAL;
9848 continue;
9849 }
9850
9851 osds.push_back(osd);
9852 }
9853
9854 for (auto &osd : osds) {
9855
9856 if (!osdmap.exists(osd)) {
9857 ss << "osd." << osd << " does not exist. ";
9858 continue;
9859 }
9860
9861 switch (option) {
9862 case OP_NOUP:
9863 if (osdmap.is_noup(osd)) {
9864 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
9865 any = true;
9866 } else if (pending_inc.pending_osd_state_clear(
9867 osd, CEPH_OSD_NOUP)) {
9868 any = true;
9869 }
9870 break;
9871
9872 case OP_NODOWN:
9873 if (osdmap.is_nodown(osd)) {
9874 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
9875 any = true;
9876 } else if (pending_inc.pending_osd_state_clear(
9877 osd, CEPH_OSD_NODOWN)) {
9878 any = true;
9879 }
9880 break;
9881
9882 case OP_NOIN:
9883 if (osdmap.is_noin(osd)) {
9884 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
9885 any = true;
9886 } else if (pending_inc.pending_osd_state_clear(
9887 osd, CEPH_OSD_NOIN)) {
9888 any = true;
9889 }
9890 break;
9891
9892 case OP_NOOUT:
9893 if (osdmap.is_noout(osd)) {
9894 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
9895 any = true;
9896 } else if (pending_inc.pending_osd_state_clear(
9897 osd, CEPH_OSD_NOOUT)) {
9898 any = true;
9899 }
9900 break;
9901
9902 default:
9903 assert(0 == "invalid option");
9904 }
9905 }
9906 }
9907
9908 if (any) {
9909 getline(ss, rs);
9910 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
9911 get_last_committed() + 1));
7c673cae
FG
9912 return true;
9913 }
9914 } else if (prefix == "osd pg-temp") {
9915 string pgidstr;
9916 if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
9917 ss << "unable to parse 'pgid' value '"
9918 << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
9919 err = -EINVAL;
9920 goto reply;
9921 }
9922 pg_t pgid;
9923 if (!pgid.parse(pgidstr.c_str())) {
9924 ss << "invalid pgid '" << pgidstr << "'";
9925 err = -EINVAL;
9926 goto reply;
9927 }
9928 if (!osdmap.pg_exists(pgid)) {
9929 ss << "pg " << pgid << " does not exist";
9930 err = -ENOENT;
9931 goto reply;
9932 }
9933 if (pending_inc.new_pg_temp.count(pgid)) {
9934 dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
9935 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9936 return true;
9937 }
9938
9939 vector<int64_t> id_vec;
9940 vector<int32_t> new_pg_temp;
9941 if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
9942 ss << "unable to parse 'id' value(s) '"
9943 << cmd_vartype_stringify(cmdmap["id"]) << "'";
9944 err = -EINVAL;
9945 goto reply;
9946 }
9947 for (auto osd : id_vec) {
9948 if (!osdmap.exists(osd)) {
9949 ss << "osd." << osd << " does not exist";
9950 err = -ENOENT;
9951 goto reply;
9952 }
9953 new_pg_temp.push_back(osd);
9954 }
9955
224ce89b
WB
9956 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
9957 if ((int)new_pg_temp.size() < pool_min_size) {
9958 ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
9959 << pool_min_size << ")";
9960 err = -EINVAL;
9961 goto reply;
9962 }
9963
9964 int pool_size = osdmap.get_pg_pool_size(pgid);
9965 if ((int)new_pg_temp.size() > pool_size) {
9966 ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
9967 << pool_size << ")";
9968 err = -EINVAL;
9969 goto reply;
9970 }
9971
7c673cae
FG
9972 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
9973 new_pg_temp.begin(), new_pg_temp.end());
9974 ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
9975 goto update;
9976 } else if (prefix == "osd primary-temp") {
9977 string pgidstr;
9978 if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
9979 ss << "unable to parse 'pgid' value '"
9980 << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
9981 err = -EINVAL;
9982 goto reply;
9983 }
9984 pg_t pgid;
9985 if (!pgid.parse(pgidstr.c_str())) {
9986 ss << "invalid pgid '" << pgidstr << "'";
9987 err = -EINVAL;
9988 goto reply;
9989 }
9990 if (!osdmap.pg_exists(pgid)) {
9991 ss << "pg " << pgid << " does not exist";
9992 err = -ENOENT;
9993 goto reply;
9994 }
9995
9996 int64_t osd;
9997 if (!cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
9998 ss << "unable to parse 'id' value '"
9999 << cmd_vartype_stringify(cmdmap["id"]) << "'";
10000 err = -EINVAL;
10001 goto reply;
10002 }
10003 if (osd != -1 && !osdmap.exists(osd)) {
10004 ss << "osd." << osd << " does not exist";
10005 err = -ENOENT;
10006 goto reply;
10007 }
10008
31f18b77
FG
10009 if (osdmap.require_min_compat_client > 0 &&
10010 osdmap.require_min_compat_client < CEPH_RELEASE_FIREFLY) {
10011 ss << "require_min_compat_client "
10012 << ceph_release_name(osdmap.require_min_compat_client)
7c673cae
FG
10013 << " < firefly, which is required for primary-temp";
10014 err = -EPERM;
10015 goto reply;
10016 } else if (!g_conf->mon_osd_allow_primary_temp) {
10017 ss << "you must enable 'mon osd allow primary temp = true' on the mons before you can set primary_temp mappings. note that this is for developers only: older clients/OSDs will break and there is no feature bit infrastructure in place.";
10018 err = -EPERM;
10019 goto reply;
10020 }
10021
10022 pending_inc.new_primary_temp[pgid] = osd;
10023 ss << "set " << pgid << " primary_temp mapping to " << osd;
10024 goto update;
224ce89b
WB
10025 } else if (prefix == "osd pg-upmap" ||
10026 prefix == "osd rm-pg-upmap" ||
10027 prefix == "osd pg-upmap-items" ||
10028 prefix == "osd rm-pg-upmap-items") {
31f18b77 10029 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
224ce89b
WB
10030 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
10031 << "luminous' before using the new interface";
7c673cae
FG
10032 err = -EPERM;
10033 goto reply;
10034 }
31f18b77
FG
10035 if (osdmap.require_min_compat_client < CEPH_RELEASE_LUMINOUS) {
10036 ss << "min_compat_client "
10037 << ceph_release_name(osdmap.require_min_compat_client)
224ce89b
WB
10038 << " < luminous, which is required for pg-upmap. "
10039 << "Try 'ceph osd set-require-min-compat-client luminous' "
10040 << "before using the new interface";
7c673cae
FG
10041 err = -EPERM;
10042 goto reply;
10043 }
10044 err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
10045 if (err == -EAGAIN)
10046 goto wait;
10047 if (err < 0)
10048 goto reply;
10049 string pgidstr;
10050 if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
10051 ss << "unable to parse 'pgid' value '"
10052 << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
10053 err = -EINVAL;
10054 goto reply;
10055 }
10056 pg_t pgid;
10057 if (!pgid.parse(pgidstr.c_str())) {
10058 ss << "invalid pgid '" << pgidstr << "'";
10059 err = -EINVAL;
10060 goto reply;
10061 }
10062 if (!osdmap.pg_exists(pgid)) {
10063 ss << "pg " << pgid << " does not exist";
10064 err = -ENOENT;
10065 goto reply;
10066 }
94b18763
FG
10067 if (pending_inc.old_pools.count(pgid.pool())) {
10068 ss << "pool of " << pgid << " is pending removal";
10069 err = -ENOENT;
10070 getline(ss, rs);
10071 wait_for_finished_proposal(op,
10072 new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
10073 return true;
10074 }
224ce89b
WB
10075
10076 enum {
10077 OP_PG_UPMAP,
10078 OP_RM_PG_UPMAP,
10079 OP_PG_UPMAP_ITEMS,
10080 OP_RM_PG_UPMAP_ITEMS,
10081 } option;
10082
10083 if (prefix == "osd pg-upmap") {
10084 option = OP_PG_UPMAP;
10085 } else if (prefix == "osd rm-pg-upmap") {
10086 option = OP_RM_PG_UPMAP;
10087 } else if (prefix == "osd pg-upmap-items") {
10088 option = OP_PG_UPMAP_ITEMS;
10089 } else {
10090 option = OP_RM_PG_UPMAP_ITEMS;
7c673cae 10091 }
224ce89b
WB
10092
10093 // check pending upmap changes
10094 switch (option) {
10095 case OP_PG_UPMAP: // fall through
10096 case OP_RM_PG_UPMAP:
10097 if (pending_inc.new_pg_upmap.count(pgid) ||
10098 pending_inc.old_pg_upmap.count(pgid)) {
10099 dout(10) << __func__ << " waiting for pending update on "
10100 << pgid << dendl;
10101 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10102 return true;
7c673cae 10103 }
224ce89b 10104 break;
7c673cae 10105
224ce89b
WB
10106 case OP_PG_UPMAP_ITEMS: // fall through
10107 case OP_RM_PG_UPMAP_ITEMS:
10108 if (pending_inc.new_pg_upmap_items.count(pgid) ||
10109 pending_inc.old_pg_upmap_items.count(pgid)) {
10110 dout(10) << __func__ << " waiting for pending update on "
10111 << pgid << dendl;
10112 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10113 return true;
10114 }
10115 break;
7c673cae 10116
224ce89b
WB
10117 default:
10118 assert(0 == "invalid option");
7c673cae 10119 }
224ce89b
WB
10120
10121 switch (option) {
10122 case OP_PG_UPMAP:
10123 {
10124 vector<int64_t> id_vec;
10125 if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
10126 ss << "unable to parse 'id' value(s) '"
10127 << cmd_vartype_stringify(cmdmap["id"]) << "'";
10128 err = -EINVAL;
10129 goto reply;
10130 }
10131
10132 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
10133 if ((int)id_vec.size() < pool_min_size) {
10134 ss << "num of osds (" << id_vec.size() <<") < pool min size ("
10135 << pool_min_size << ")";
10136 err = -EINVAL;
10137 goto reply;
10138 }
10139
10140 int pool_size = osdmap.get_pg_pool_size(pgid);
10141 if ((int)id_vec.size() > pool_size) {
10142 ss << "num of osds (" << id_vec.size() <<") > pool size ("
10143 << pool_size << ")";
10144 err = -EINVAL;
10145 goto reply;
10146 }
10147
10148 vector<int32_t> new_pg_upmap;
10149 for (auto osd : id_vec) {
10150 if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
10151 ss << "osd." << osd << " does not exist";
10152 err = -ENOENT;
10153 goto reply;
10154 }
10155 auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
10156 if (it != new_pg_upmap.end()) {
10157 ss << "osd." << osd << " already exists, ";
10158 continue;
10159 }
10160 new_pg_upmap.push_back(osd);
10161 }
10162
10163 if (new_pg_upmap.empty()) {
10164 ss << "no valid upmap items(pairs) is specified";
10165 err = -EINVAL;
10166 goto reply;
10167 }
10168
10169 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
10170 new_pg_upmap.begin(), new_pg_upmap.end());
10171 ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
7c673cae 10172 }
224ce89b
WB
10173 break;
10174
10175 case OP_RM_PG_UPMAP:
10176 {
10177 pending_inc.old_pg_upmap.insert(pgid);
10178 ss << "clear " << pgid << " pg_upmap mapping";
7c673cae 10179 }
224ce89b 10180 break;
7c673cae 10181
224ce89b
WB
10182 case OP_PG_UPMAP_ITEMS:
10183 {
10184 vector<int64_t> id_vec;
10185 if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
10186 ss << "unable to parse 'id' value(s) '"
10187 << cmd_vartype_stringify(cmdmap["id"]) << "'";
10188 err = -EINVAL;
10189 goto reply;
10190 }
10191
10192 if (id_vec.size() % 2) {
10193 ss << "you must specify pairs of osd ids to be remapped";
10194 err = -EINVAL;
10195 goto reply;
10196 }
10197
10198 int pool_size = osdmap.get_pg_pool_size(pgid);
10199 if ((int)(id_vec.size() / 2) > pool_size) {
10200 ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
10201 << pool_size << ")";
10202 err = -EINVAL;
10203 goto reply;
10204 }
10205
10206 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
10207 ostringstream items;
10208 items << "[";
10209 for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
10210 int from = *p++;
10211 int to = *p;
10212 if (from == to) {
10213 ss << "from osd." << from << " == to osd." << to << ", ";
10214 continue;
10215 }
10216 if (!osdmap.exists(from)) {
10217 ss << "osd." << from << " does not exist";
10218 err = -ENOENT;
10219 goto reply;
10220 }
10221 if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
10222 ss << "osd." << to << " does not exist";
10223 err = -ENOENT;
10224 goto reply;
10225 }
c07f9fc5
FG
10226 pair<int32_t,int32_t> entry = make_pair(from, to);
10227 auto it = std::find(new_pg_upmap_items.begin(),
10228 new_pg_upmap_items.end(), entry);
10229 if (it != new_pg_upmap_items.end()) {
10230 ss << "osd." << from << " -> osd." << to << " already exists, ";
10231 continue;
10232 }
10233 new_pg_upmap_items.push_back(entry);
224ce89b
WB
10234 items << from << "->" << to << ",";
10235 }
10236 string out(items.str());
10237 out.resize(out.size() - 1); // drop last ','
10238 out += "]";
10239
10240 if (new_pg_upmap_items.empty()) {
10241 ss << "no valid upmap items(pairs) is specified";
10242 err = -EINVAL;
10243 goto reply;
10244 }
10245
10246 pending_inc.new_pg_upmap_items[pgid] =
10247 mempool::osdmap::vector<pair<int32_t,int32_t>>(
10248 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
10249 ss << "set " << pgid << " pg_upmap_items mapping to " << out;
10250 }
10251 break;
10252
10253 case OP_RM_PG_UPMAP_ITEMS:
10254 {
10255 pending_inc.old_pg_upmap_items.insert(pgid);
10256 ss << "clear " << pgid << " pg_upmap_items mapping";
10257 }
10258 break;
10259
10260 default:
10261 assert(0 == "invalid option");
7c673cae
FG
10262 }
10263
7c673cae
FG
10264 goto update;
10265 } else if (prefix == "osd primary-affinity") {
10266 int64_t id;
10267 if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
10268 ss << "invalid osd id value '"
10269 << cmd_vartype_stringify(cmdmap["id"]) << "'";
10270 err = -EINVAL;
10271 goto reply;
10272 }
10273 double w;
10274 if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
10275 ss << "unable to parse 'weight' value '"
10276 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
10277 err = -EINVAL;
10278 goto reply;
10279 }
10280 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
10281 if (ww < 0L) {
10282 ss << "weight must be >= 0";
10283 err = -EINVAL;
10284 goto reply;
10285 }
31f18b77
FG
10286 if (osdmap.require_min_compat_client > 0 &&
10287 osdmap.require_min_compat_client < CEPH_RELEASE_FIREFLY) {
10288 ss << "require_min_compat_client "
10289 << ceph_release_name(osdmap.require_min_compat_client)
7c673cae
FG
10290 << " < firefly, which is required for primary-affinity";
10291 err = -EPERM;
10292 goto reply;
10293 } else if (!g_conf->mon_osd_allow_primary_affinity) {
10294 ss << "you must enable 'mon osd allow primary affinity = true' on the mons before you can adjust primary-affinity. note that older clients will no longer be able to communicate with the cluster.";
10295 err = -EPERM;
10296 goto reply;
10297 }
10298 err = check_cluster_features(CEPH_FEATURE_OSD_PRIMARY_AFFINITY, ss);
10299 if (err == -EAGAIN)
10300 goto wait;
10301 if (err < 0)
10302 goto reply;
10303 if (osdmap.exists(id)) {
10304 pending_inc.new_primary_affinity[id] = ww;
10305 ss << "set osd." << id << " primary-affinity to " << w << " (" << ios::hex << ww << ios::dec << ")";
10306 getline(ss, rs);
10307 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10308 get_last_committed() + 1));
10309 return true;
10310 } else {
10311 ss << "osd." << id << " does not exist";
10312 err = -ENOENT;
10313 goto reply;
10314 }
10315 } else if (prefix == "osd reweight") {
10316 int64_t id;
10317 if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
10318 ss << "unable to parse osd id value '"
10319 << cmd_vartype_stringify(cmdmap["id"]) << "'";
10320 err = -EINVAL;
10321 goto reply;
10322 }
10323 double w;
10324 if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
10325 ss << "unable to parse weight value '"
10326 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
10327 err = -EINVAL;
10328 goto reply;
10329 }
10330 long ww = (int)((double)CEPH_OSD_IN*w);
10331 if (ww < 0L) {
10332 ss << "weight must be >= 0";
10333 err = -EINVAL;
10334 goto reply;
10335 }
10336 if (osdmap.exists(id)) {
10337 pending_inc.new_weight[id] = ww;
10338 ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
10339 getline(ss, rs);
10340 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10341 get_last_committed() + 1));
10342 return true;
10343 } else {
10344 ss << "osd." << id << " does not exist";
10345 err = -ENOENT;
10346 goto reply;
10347 }
10348 } else if (prefix == "osd reweightn") {
10349 map<int32_t, uint32_t> weights;
10350 err = parse_reweights(g_ceph_context, cmdmap, osdmap, &weights);
10351 if (err) {
10352 ss << "unable to parse 'weights' value '"
10353 << cmd_vartype_stringify(cmdmap["weights"]) << "'";
10354 goto reply;
10355 }
10356 pending_inc.new_weight.insert(weights.begin(), weights.end());
10357 wait_for_finished_proposal(
10358 op,
10359 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
224ce89b 10360 return true;
7c673cae
FG
10361 } else if (prefix == "osd lost") {
10362 int64_t id;
10363 if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
10364 ss << "unable to parse osd id value '"
10365 << cmd_vartype_stringify(cmdmap["id"]) << "'";
10366 err = -EINVAL;
10367 goto reply;
10368 }
10369 string sure;
10370 if (!cmd_getval(g_ceph_context, cmdmap, "sure", sure) || sure != "--yes-i-really-mean-it") {
10371 ss << "are you SURE? this might mean real, permanent data loss. pass "
10372 "--yes-i-really-mean-it if you really do.";
10373 err = -EPERM;
10374 goto reply;
10375 } else if (!osdmap.exists(id)) {
10376 ss << "osd." << id << " does not exist";
10377 err = -ENOENT;
10378 goto reply;
10379 } else if (!osdmap.is_down(id)) {
10380 ss << "osd." << id << " is not down";
10381 err = -EBUSY;
10382 goto reply;
10383 } else {
10384 epoch_t e = osdmap.get_info(id).down_at;
10385 pending_inc.new_lost[id] = e;
10386 ss << "marked osd lost in epoch " << e;
10387 getline(ss, rs);
10388 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10389 get_last_committed() + 1));
10390 return true;
10391 }
10392
31f18b77
FG
10393 } else if (prefix == "osd destroy" || prefix == "osd purge") {
10394 /* Destroying an OSD means that we don't expect to further make use of
10395 * the OSDs data (which may even become unreadable after this operation),
10396 * and that we are okay with scrubbing all its cephx keys and config-key
10397 * data (which may include lockbox keys, thus rendering the osd's data
10398 * unreadable).
10399 *
10400 * The OSD will not be removed. Instead, we will mark it as destroyed,
10401 * such that a subsequent call to `create` will not reuse the osd id.
10402 * This will play into being able to recreate the OSD, at the same
10403 * crush location, with minimal data movement.
10404 */
10405
10406 // make sure authmon is writeable.
10407 if (!mon->authmon()->is_writeable()) {
10408 dout(10) << __func__ << " waiting for auth mon to be writeable for "
10409 << "osd destroy" << dendl;
10410 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
10411 return false;
10412 }
10413
10414 int64_t id;
10415 if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
10416 ss << "unable to parse osd id value '"
10417 << cmd_vartype_stringify(cmdmap["id"]) << "";
10418 err = -EINVAL;
10419 goto reply;
10420 }
10421
10422 bool is_destroy = (prefix == "osd destroy");
10423 if (!is_destroy) {
10424 assert("osd purge" == prefix);
10425 }
10426
10427 string sure;
10428 if (!cmd_getval(g_ceph_context, cmdmap, "sure", sure) ||
10429 sure != "--yes-i-really-mean-it") {
10430 ss << "Are you SURE? This will mean real, permanent data loss, as well "
10431 << "as cephx and lockbox keys. Pass --yes-i-really-mean-it if you "
10432 << "really do.";
10433 err = -EPERM;
10434 goto reply;
d2e6a577 10435 } else if (!osdmap.exists(id)) {
31f18b77 10436 ss << "osd." << id << " does not exist";
d2e6a577 10437 err = 0; // idempotent
31f18b77
FG
10438 goto reply;
10439 } else if (osdmap.is_up(id)) {
10440 ss << "osd." << id << " is not `down`.";
10441 err = -EBUSY;
10442 goto reply;
10443 } else if (is_destroy && osdmap.is_destroyed(id)) {
10444 ss << "destroyed osd." << id;
10445 err = 0;
10446 goto reply;
10447 }
10448
10449 bool goto_reply = false;
10450
10451 paxos->plug();
10452 if (is_destroy) {
10453 err = prepare_command_osd_destroy(id, ss);
10454 // we checked above that it should exist.
10455 assert(err != -ENOENT);
10456 } else {
10457 err = prepare_command_osd_purge(id, ss);
10458 if (err == -ENOENT) {
10459 err = 0;
10460 ss << "osd." << id << " does not exist.";
10461 goto_reply = true;
10462 }
10463 }
10464 paxos->unplug();
10465
10466 if (err < 0 || goto_reply) {
10467 goto reply;
10468 }
10469
10470 if (is_destroy) {
10471 ss << "destroyed osd." << id;
10472 } else {
10473 ss << "purged osd." << id;
10474 }
10475
10476 getline(ss, rs);
10477 wait_for_finished_proposal(op,
10478 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
10479 force_immediate_propose();
10480 return true;
10481
10482 } else if (prefix == "osd new") {
10483
10484 // make sure authmon is writeable.
10485 if (!mon->authmon()->is_writeable()) {
10486 dout(10) << __func__ << " waiting for auth mon to be writeable for "
224ce89b 10487 << "osd new" << dendl;
31f18b77
FG
10488 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
10489 return false;
10490 }
10491
3a9019d9 10492 map<string,string> param_map;
31f18b77
FG
10493
10494 bufferlist bl = m->get_data();
3a9019d9
FG
10495 string param_json = bl.to_str();
10496 dout(20) << __func__ << " osd new json = " << param_json << dendl;
31f18b77 10497
3a9019d9 10498 err = get_json_str_map(param_json, ss, &param_map);
31f18b77
FG
10499 if (err < 0)
10500 goto reply;
10501
3a9019d9 10502 dout(20) << __func__ << " osd new params " << param_map << dendl;
31f18b77
FG
10503
10504 paxos->plug();
3a9019d9 10505 err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
31f18b77
FG
10506 paxos->unplug();
10507
10508 if (err < 0) {
10509 goto reply;
10510 }
10511
10512 if (f) {
10513 f->flush(rdata);
10514 } else {
10515 rdata.append(ss);
10516 }
10517
10518 if (err == EEXIST) {
10519 // idempotent operation
10520 err = 0;
10521 goto reply;
10522 }
10523
10524 wait_for_finished_proposal(op,
10525 new Monitor::C_Command(mon, op, 0, rs, rdata,
10526 get_last_committed() + 1));
10527 force_immediate_propose();
10528 return true;
10529
7c673cae 10530 } else if (prefix == "osd create") {
7c673cae
FG
10531
10532 // optional id provided?
31f18b77
FG
10533 int64_t id = -1, cmd_id = -1;
10534 if (cmd_getval(g_ceph_context, cmdmap, "id", cmd_id)) {
10535 if (cmd_id < 0) {
10536 ss << "invalid osd id value '" << cmd_id << "'";
7c673cae
FG
10537 err = -EINVAL;
10538 goto reply;
10539 }
31f18b77 10540 dout(10) << " osd create got id " << cmd_id << dendl;
7c673cae
FG
10541 }
10542
7c673cae
FG
10543 uuid_d uuid;
10544 string uuidstr;
10545 if (cmd_getval(g_ceph_context, cmdmap, "uuid", uuidstr)) {
10546 if (!uuid.parse(uuidstr.c_str())) {
31f18b77
FG
10547 ss << "invalid uuid value '" << uuidstr << "'";
10548 err = -EINVAL;
10549 goto reply;
7c673cae 10550 }
31f18b77
FG
10551 // we only care about the id if we also have the uuid, to
10552 // ensure the operation's idempotency.
10553 id = cmd_id;
7c673cae
FG
10554 }
10555
31f18b77
FG
10556 int32_t new_id = -1;
10557 err = prepare_command_osd_create(id, uuid, &new_id, ss);
10558 if (err < 0) {
10559 if (err == -EAGAIN) {
10560 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10561 return true;
10562 }
10563 // a check has failed; reply to the user.
10564 goto reply;
10565
10566 } else if (err == EEXIST) {
10567 // this is an idempotent operation; we can go ahead and reply.
10568 if (f) {
10569 f->open_object_section("created_osd");
10570 f->dump_int("osdid", new_id);
10571 f->close_section();
10572 f->flush(rdata);
10573 } else {
10574 ss << new_id;
10575 rdata.append(ss);
7c673cae 10576 }
31f18b77
FG
10577 err = 0;
10578 goto reply;
7c673cae
FG
10579 }
10580
3a9019d9
FG
10581 string empty_device_class;
10582 do_osd_create(id, uuid, empty_device_class, &new_id);
31f18b77 10583
7c673cae
FG
10584 if (f) {
10585 f->open_object_section("created_osd");
31f18b77 10586 f->dump_int("osdid", new_id);
7c673cae
FG
10587 f->close_section();
10588 f->flush(rdata);
10589 } else {
31f18b77 10590 ss << new_id;
7c673cae
FG
10591 rdata.append(ss);
10592 }
31f18b77
FG
10593 wait_for_finished_proposal(op,
10594 new Monitor::C_Command(mon, op, 0, rs, rdata,
10595 get_last_committed() + 1));
7c673cae
FG
10596 return true;
10597
10598 } else if (prefix == "osd blacklist clear") {
10599 pending_inc.new_blacklist.clear();
10600 std::list<std::pair<entity_addr_t,utime_t > > blacklist;
10601 osdmap.get_blacklist(&blacklist);
10602 for (const auto &entry : blacklist) {
10603 pending_inc.old_blacklist.push_back(entry.first);
10604 }
10605 ss << " removed all blacklist entries";
10606 getline(ss, rs);
10607 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10608 get_last_committed() + 1));
10609 return true;
10610 } else if (prefix == "osd blacklist") {
10611 string addrstr;
10612 cmd_getval(g_ceph_context, cmdmap, "addr", addrstr);
10613 entity_addr_t addr;
10614 if (!addr.parse(addrstr.c_str(), 0)) {
10615 ss << "unable to parse address " << addrstr;
10616 err = -EINVAL;
10617 goto reply;
10618 }
10619 else {
10620 string blacklistop;
10621 cmd_getval(g_ceph_context, cmdmap, "blacklistop", blacklistop);
10622 if (blacklistop == "add") {
10623 utime_t expires = ceph_clock_now();
10624 double d;
10625 // default one hour
224ce89b
WB
10626 cmd_getval(g_ceph_context, cmdmap, "expire", d,
10627 g_conf->mon_osd_blacklist_default_expire);
7c673cae
FG
10628 expires += d;
10629
10630 pending_inc.new_blacklist[addr] = expires;
224ce89b
WB
10631
10632 {
10633 // cancel any pending un-blacklisting request too
10634 auto it = std::find(pending_inc.old_blacklist.begin(),
10635 pending_inc.old_blacklist.end(), addr);
10636 if (it != pending_inc.old_blacklist.end()) {
10637 pending_inc.old_blacklist.erase(it);
10638 }
10639 }
10640
7c673cae
FG
10641 ss << "blacklisting " << addr << " until " << expires << " (" << d << " sec)";
10642 getline(ss, rs);
10643 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10644 get_last_committed() + 1));
10645 return true;
10646 } else if (blacklistop == "rm") {
10647 if (osdmap.is_blacklisted(addr) ||
10648 pending_inc.new_blacklist.count(addr)) {
10649 if (osdmap.is_blacklisted(addr))
10650 pending_inc.old_blacklist.push_back(addr);
10651 else
10652 pending_inc.new_blacklist.erase(addr);
10653 ss << "un-blacklisting " << addr;
10654 getline(ss, rs);
10655 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10656 get_last_committed() + 1));
10657 return true;
10658 }
10659 ss << addr << " isn't blacklisted";
10660 err = 0;
10661 goto reply;
10662 }
10663 }
10664 } else if (prefix == "osd pool mksnap") {
10665 string poolstr;
10666 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10667 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
10668 if (pool < 0) {
10669 ss << "unrecognized pool '" << poolstr << "'";
10670 err = -ENOENT;
10671 goto reply;
10672 }
10673 string snapname;
10674 cmd_getval(g_ceph_context, cmdmap, "snap", snapname);
10675 const pg_pool_t *p = osdmap.get_pg_pool(pool);
10676 if (p->is_unmanaged_snaps_mode()) {
10677 ss << "pool " << poolstr << " is in unmanaged snaps mode";
10678 err = -EINVAL;
10679 goto reply;
10680 } else if (p->snap_exists(snapname.c_str())) {
10681 ss << "pool " << poolstr << " snap " << snapname << " already exists";
10682 err = 0;
10683 goto reply;
10684 } else if (p->is_tier()) {
10685 ss << "pool " << poolstr << " is a cache tier";
10686 err = -EINVAL;
10687 goto reply;
10688 }
10689 pg_pool_t *pp = 0;
10690 if (pending_inc.new_pools.count(pool))
10691 pp = &pending_inc.new_pools[pool];
10692 if (!pp) {
10693 pp = &pending_inc.new_pools[pool];
10694 *pp = *p;
10695 }
10696 if (pp->snap_exists(snapname.c_str())) {
10697 ss << "pool " << poolstr << " snap " << snapname << " already exists";
10698 } else {
10699 pp->add_snap(snapname.c_str(), ceph_clock_now());
10700 pp->set_snap_epoch(pending_inc.epoch);
10701 ss << "created pool " << poolstr << " snap " << snapname;
10702 }
10703 getline(ss, rs);
10704 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10705 get_last_committed() + 1));
10706 return true;
10707 } else if (prefix == "osd pool rmsnap") {
10708 string poolstr;
10709 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10710 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
10711 if (pool < 0) {
10712 ss << "unrecognized pool '" << poolstr << "'";
10713 err = -ENOENT;
10714 goto reply;
10715 }
10716 string snapname;
10717 cmd_getval(g_ceph_context, cmdmap, "snap", snapname);
10718 const pg_pool_t *p = osdmap.get_pg_pool(pool);
10719 if (p->is_unmanaged_snaps_mode()) {
10720 ss << "pool " << poolstr << " is in unmanaged snaps mode";
10721 err = -EINVAL;
10722 goto reply;
10723 } else if (!p->snap_exists(snapname.c_str())) {
10724 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
10725 err = 0;
10726 goto reply;
10727 }
10728 pg_pool_t *pp = 0;
10729 if (pending_inc.new_pools.count(pool))
10730 pp = &pending_inc.new_pools[pool];
10731 if (!pp) {
10732 pp = &pending_inc.new_pools[pool];
10733 *pp = *p;
10734 }
10735 snapid_t sn = pp->snap_exists(snapname.c_str());
10736 if (sn) {
10737 pp->remove_snap(sn);
10738 pp->set_snap_epoch(pending_inc.epoch);
10739 ss << "removed pool " << poolstr << " snap " << snapname;
10740 } else {
10741 ss << "already removed pool " << poolstr << " snap " << snapname;
10742 }
10743 getline(ss, rs);
10744 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10745 get_last_committed() + 1));
10746 return true;
10747 } else if (prefix == "osd pool create") {
10748 int64_t pg_num;
10749 int64_t pgp_num;
10750 cmd_getval(g_ceph_context, cmdmap, "pg_num", pg_num, int64_t(0));
10751 cmd_getval(g_ceph_context, cmdmap, "pgp_num", pgp_num, pg_num);
10752
10753 string pool_type_str;
10754 cmd_getval(g_ceph_context, cmdmap, "pool_type", pool_type_str);
10755 if (pool_type_str.empty())
224ce89b 10756 pool_type_str = g_conf->osd_pool_default_type;
7c673cae
FG
10757
10758 string poolstr;
10759 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10760 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10761 if (pool_id >= 0) {
10762 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10763 if (pool_type_str != p->get_type_name()) {
10764 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
10765 err = -EINVAL;
10766 } else {
10767 ss << "pool '" << poolstr << "' already exists";
10768 err = 0;
10769 }
10770 goto reply;
10771 }
10772
10773 int pool_type;
10774 if (pool_type_str == "replicated") {
10775 pool_type = pg_pool_t::TYPE_REPLICATED;
10776 } else if (pool_type_str == "erasure") {
10777 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2 |
10778 CEPH_FEATURE_OSD_ERASURE_CODES,
10779 ss);
10780 if (err == -EAGAIN)
10781 goto wait;
10782 if (err)
10783 goto reply;
10784 pool_type = pg_pool_t::TYPE_ERASURE;
10785 } else {
10786 ss << "unknown pool type '" << pool_type_str << "'";
10787 err = -EINVAL;
10788 goto reply;
10789 }
10790
31f18b77 10791 bool implicit_rule_creation = false;
94b18763 10792 int64_t expected_num_objects = 0;
31f18b77
FG
10793 string rule_name;
10794 cmd_getval(g_ceph_context, cmdmap, "rule", rule_name);
7c673cae
FG
10795 string erasure_code_profile;
10796 cmd_getval(g_ceph_context, cmdmap, "erasure_code_profile", erasure_code_profile);
10797
10798 if (pool_type == pg_pool_t::TYPE_ERASURE) {
10799 if (erasure_code_profile == "")
10800 erasure_code_profile = "default";
10801 //handle the erasure code profile
10802 if (erasure_code_profile == "default") {
10803 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
10804 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
10805 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
10806 goto wait;
10807 }
10808
10809 map<string,string> profile_map;
10810 err = osdmap.get_erasure_code_profile_default(g_ceph_context,
10811 profile_map,
10812 &ss);
10813 if (err)
10814 goto reply;
10815 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
10816 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
10817 goto wait;
10818 }
10819 }
31f18b77
FG
10820 if (rule_name == "") {
10821 implicit_rule_creation = true;
7c673cae 10822 if (erasure_code_profile == "default") {
31f18b77 10823 rule_name = "erasure-code";
7c673cae 10824 } else {
31f18b77 10825 dout(1) << "implicitly use rule named after the pool: "
7c673cae 10826 << poolstr << dendl;
31f18b77 10827 rule_name = poolstr;
7c673cae
FG
10828 }
10829 }
94b18763
FG
10830 cmd_getval(g_ceph_context, cmdmap, "expected_num_objects",
10831 expected_num_objects, int64_t(0));
7c673cae 10832 } else {
31f18b77 10833 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
94b18763
FG
10834 // and put expected_num_objects to rule field
10835 if (erasure_code_profile != "") { // cmd is from CLI
10836 if (rule_name != "") {
10837 string interr;
10838 expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
10839 if (interr.length()) {
10840 ss << "error parsing integer value '" << rule_name << "': " << interr;
10841 err = -EINVAL;
10842 goto reply;
10843 }
10844 }
10845 rule_name = erasure_code_profile;
10846 } else { // cmd is well-formed
10847 cmd_getval(g_ceph_context, cmdmap, "expected_num_objects",
10848 expected_num_objects, int64_t(0));
10849 }
7c673cae
FG
10850 }
10851
31f18b77
FG
10852 if (!implicit_rule_creation && rule_name != "") {
10853 int rule;
10854 err = get_crush_rule(rule_name, &rule, &ss);
7c673cae
FG
10855 if (err == -EAGAIN) {
10856 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10857 return true;
10858 }
10859 if (err)
10860 goto reply;
10861 }
10862
7c673cae
FG
10863 if (expected_num_objects < 0) {
10864 ss << "'expected_num_objects' must be non-negative";
10865 err = -EINVAL;
10866 goto reply;
10867 }
10868
10869 int64_t fast_read_param;
10870 cmd_getval(g_ceph_context, cmdmap, "fast_read", fast_read_param, int64_t(-1));
10871 FastReadType fast_read = FAST_READ_DEFAULT;
10872 if (fast_read_param == 0)
10873 fast_read = FAST_READ_OFF;
10874 else if (fast_read_param > 0)
10875 fast_read = FAST_READ_ON;
10876
10877 err = prepare_new_pool(poolstr, 0, // auid=0 for admin created pool
10878 -1, // default crush rule
31f18b77 10879 rule_name,
7c673cae
FG
10880 pg_num, pgp_num,
10881 erasure_code_profile, pool_type,
10882 (uint64_t)expected_num_objects,
10883 fast_read,
10884 &ss);
10885 if (err < 0) {
10886 switch(err) {
10887 case -EEXIST:
10888 ss << "pool '" << poolstr << "' already exists";
10889 break;
10890 case -EAGAIN:
10891 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10892 return true;
10893 case -ERANGE:
10894 goto reply;
10895 default:
10896 goto reply;
10897 break;
10898 }
10899 } else {
10900 ss << "pool '" << poolstr << "' created";
10901 }
10902 getline(ss, rs);
10903 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10904 get_last_committed() + 1));
10905 return true;
10906
10907 } else if (prefix == "osd pool delete" ||
10908 prefix == "osd pool rm") {
10909 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
10910 string poolstr, poolstr2, sure;
10911 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10912 cmd_getval(g_ceph_context, cmdmap, "pool2", poolstr2);
10913 cmd_getval(g_ceph_context, cmdmap, "sure", sure);
10914 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
10915 if (pool < 0) {
10916 ss << "pool '" << poolstr << "' does not exist";
10917 err = 0;
10918 goto reply;
10919 }
10920
10921 bool force_no_fake = sure == "--yes-i-really-really-mean-it-not-faking";
10922 if (poolstr2 != poolstr ||
10923 (sure != "--yes-i-really-really-mean-it" && !force_no_fake)) {
10924 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
10925 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
10926 << "followed by --yes-i-really-really-mean-it.";
10927 err = -EPERM;
10928 goto reply;
10929 }
10930 err = _prepare_remove_pool(pool, &ss, force_no_fake);
10931 if (err == -EAGAIN) {
10932 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10933 return true;
10934 }
10935 if (err < 0)
10936 goto reply;
10937 goto update;
10938 } else if (prefix == "osd pool rename") {
10939 string srcpoolstr, destpoolstr;
10940 cmd_getval(g_ceph_context, cmdmap, "srcpool", srcpoolstr);
10941 cmd_getval(g_ceph_context, cmdmap, "destpool", destpoolstr);
10942 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
10943 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
10944
10945 if (pool_src < 0) {
10946 if (pool_dst >= 0) {
10947 // src pool doesn't exist, dst pool does exist: to ensure idempotency
10948 // of operations, assume this rename succeeded, as it is not changing
10949 // the current state. Make sure we output something understandable
10950 // for whoever is issuing the command, if they are paying attention,
10951 // in case it was not intentional; or to avoid a "wtf?" and a bug
10952 // report in case it was intentional, while expecting a failure.
10953 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
10954 << destpoolstr << "' does -- assuming successful rename";
10955 err = 0;
10956 } else {
10957 ss << "unrecognized pool '" << srcpoolstr << "'";
10958 err = -ENOENT;
10959 }
10960 goto reply;
10961 } else if (pool_dst >= 0) {
10962 // source pool exists and so does the destination pool
10963 ss << "pool '" << destpoolstr << "' already exists";
10964 err = -EEXIST;
10965 goto reply;
10966 }
10967
10968 int ret = _prepare_rename_pool(pool_src, destpoolstr);
10969 if (ret == 0) {
10970 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
10971 } else {
10972 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
10973 << cpp_strerror(ret);
10974 }
10975 getline(ss, rs);
10976 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
10977 get_last_committed() + 1));
10978 return true;
10979
10980 } else if (prefix == "osd pool set") {
10981 err = prepare_command_pool_set(cmdmap, ss);
10982 if (err == -EAGAIN)
10983 goto wait;
10984 if (err < 0)
10985 goto reply;
10986
10987 getline(ss, rs);
10988 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10989 get_last_committed() + 1));
10990 return true;
10991 } else if (prefix == "osd tier add") {
10992 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
10993 if (err == -EAGAIN)
10994 goto wait;
10995 if (err)
10996 goto reply;
10997 string poolstr;
10998 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10999 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11000 if (pool_id < 0) {
11001 ss << "unrecognized pool '" << poolstr << "'";
11002 err = -ENOENT;
11003 goto reply;
11004 }
11005 string tierpoolstr;
11006 cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
11007 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
11008 if (tierpool_id < 0) {
11009 ss << "unrecognized pool '" << tierpoolstr << "'";
11010 err = -ENOENT;
11011 goto reply;
11012 }
11013 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11014 assert(p);
11015 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11016 assert(tp);
11017
11018 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
11019 goto reply;
11020 }
11021
11022 // make sure new tier is empty
11023 string force_nonempty;
11024 cmd_getval(g_ceph_context, cmdmap, "force_nonempty", force_nonempty);
31f18b77
FG
11025 const pool_stat_t *pstats = mon->pgservice->get_pool_stat(tierpool_id);
11026 if (pstats && pstats->stats.sum.num_objects != 0 &&
7c673cae
FG
11027 force_nonempty != "--force-nonempty") {
11028 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
11029 err = -ENOTEMPTY;
11030 goto reply;
11031 }
11032 if (tp->ec_pool()) {
11033 ss << "tier pool '" << tierpoolstr
11034 << "' is an ec pool, which cannot be a tier";
11035 err = -ENOTSUP;
11036 goto reply;
11037 }
11038 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
11039 ((force_nonempty != "--force-nonempty") ||
11040 (!g_conf->mon_debug_unsafe_allow_tier_with_nonempty_snaps))) {
11041 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
11042 err = -ENOTEMPTY;
11043 goto reply;
11044 }
11045 // go
11046 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
11047 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
11048 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
11049 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11050 return true;
11051 }
11052 np->tiers.insert(tierpool_id);
11053 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
11054 ntp->tier_of = pool_id;
11055 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
11056 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
11057 get_last_committed() + 1));
11058 return true;
11059 } else if (prefix == "osd tier remove" ||
11060 prefix == "osd tier rm") {
11061 string poolstr;
11062 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
11063 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11064 if (pool_id < 0) {
11065 ss << "unrecognized pool '" << poolstr << "'";
11066 err = -ENOENT;
11067 goto reply;
11068 }
11069 string tierpoolstr;
11070 cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
11071 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
11072 if (tierpool_id < 0) {
11073 ss << "unrecognized pool '" << tierpoolstr << "'";
11074 err = -ENOENT;
11075 goto reply;
11076 }
11077 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11078 assert(p);
11079 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11080 assert(tp);
11081
11082 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
11083 goto reply;
11084 }
11085
11086 if (p->tiers.count(tierpool_id) == 0) {
11087 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
11088 err = 0;
11089 goto reply;
11090 }
11091 if (tp->tier_of != pool_id) {
11092 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
11093 << osdmap.get_pool_name(tp->tier_of) << "': "
11094 // be scary about it; this is an inconsistency and bells must go off
11095 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
11096 err = -EINVAL;
11097 goto reply;
11098 }
11099 if (p->read_tier == tierpool_id) {
11100 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
11101 err = -EBUSY;
11102 goto reply;
11103 }
11104 // go
11105 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
11106 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
11107 if (np->tiers.count(tierpool_id) == 0 ||
11108 ntp->tier_of != pool_id ||
11109 np->read_tier == tierpool_id) {
11110 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11111 return true;
11112 }
11113 np->tiers.erase(tierpool_id);
11114 ntp->clear_tier();
11115 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
11116 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
11117 get_last_committed() + 1));
11118 return true;
11119 } else if (prefix == "osd tier set-overlay") {
11120 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
11121 if (err == -EAGAIN)
11122 goto wait;
11123 if (err)
11124 goto reply;
11125 string poolstr;
11126 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
11127 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11128 if (pool_id < 0) {
11129 ss << "unrecognized pool '" << poolstr << "'";
11130 err = -ENOENT;
11131 goto reply;
11132 }
11133 string overlaypoolstr;
11134 cmd_getval(g_ceph_context, cmdmap, "overlaypool", overlaypoolstr);
11135 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
11136 if (overlaypool_id < 0) {
11137 ss << "unrecognized pool '" << overlaypoolstr << "'";
11138 err = -ENOENT;
11139 goto reply;
11140 }
11141 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11142 assert(p);
11143 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
11144 assert(overlay_p);
11145 if (p->tiers.count(overlaypool_id) == 0) {
11146 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
11147 err = -EINVAL;
11148 goto reply;
11149 }
11150 if (p->read_tier == overlaypool_id) {
11151 err = 0;
11152 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
11153 goto reply;
11154 }
11155 if (p->has_read_tier()) {
11156 ss << "pool '" << poolstr << "' has overlay '"
11157 << osdmap.get_pool_name(p->read_tier)
11158 << "'; please remove-overlay first";
11159 err = -EINVAL;
11160 goto reply;
11161 }
11162
11163 // go
11164 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
11165 np->read_tier = overlaypool_id;
11166 np->write_tier = overlaypool_id;
11167 np->set_last_force_op_resend(pending_inc.epoch);
11168 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
11169 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
11170 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
11171 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
11172 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
11173 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
11174 get_last_committed() + 1));
11175 return true;
11176 } else if (prefix == "osd tier remove-overlay" ||
11177 prefix == "osd tier rm-overlay") {
11178 string poolstr;
11179 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
11180 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11181 if (pool_id < 0) {
11182 ss << "unrecognized pool '" << poolstr << "'";
11183 err = -ENOENT;
11184 goto reply;
11185 }
11186 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11187 assert(p);
11188 if (!p->has_read_tier()) {
11189 err = 0;
11190 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
11191 goto reply;
11192 }
11193
11194 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
11195 goto reply;
11196 }
11197
11198 // go
11199 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
11200 if (np->has_read_tier()) {
11201 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
11202 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
11203 nop->set_last_force_op_resend(pending_inc.epoch);
11204 }
11205 if (np->has_write_tier()) {
11206 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
11207 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
11208 nop->set_last_force_op_resend(pending_inc.epoch);
11209 }
11210 np->clear_read_tier();
11211 np->clear_write_tier();
11212 np->set_last_force_op_resend(pending_inc.epoch);
11213 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
11214 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
11215 get_last_committed() + 1));
11216 return true;
11217 } else if (prefix == "osd tier cache-mode") {
11218 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
11219 if (err == -EAGAIN)
11220 goto wait;
11221 if (err)
11222 goto reply;
11223 string poolstr;
11224 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
11225 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11226 if (pool_id < 0) {
11227 ss << "unrecognized pool '" << poolstr << "'";
11228 err = -ENOENT;
11229 goto reply;
11230 }
11231 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11232 assert(p);
11233 if (!p->is_tier()) {
11234 ss << "pool '" << poolstr << "' is not a tier";
11235 err = -EINVAL;
11236 goto reply;
11237 }
11238 string modestr;
11239 cmd_getval(g_ceph_context, cmdmap, "mode", modestr);
11240 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
11241 if (mode < 0) {
11242 ss << "'" << modestr << "' is not a valid cache mode";
11243 err = -EINVAL;
11244 goto reply;
11245 }
11246
11247 string sure;
11248 cmd_getval(g_ceph_context, cmdmap, "sure", sure);
11249 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11250 mode != pg_pool_t::CACHEMODE_NONE &&
11251 mode != pg_pool_t::CACHEMODE_PROXY &&
11252 mode != pg_pool_t::CACHEMODE_READPROXY) &&
11253 sure != "--yes-i-really-mean-it") {
11254 ss << "'" << modestr << "' is not a well-supported cache mode and may "
11255 << "corrupt your data. pass --yes-i-really-mean-it to force.";
11256 err = -EPERM;
11257 goto reply;
11258 }
11259
11260 // pool already has this cache-mode set and there are no pending changes
11261 if (p->cache_mode == mode &&
11262 (pending_inc.new_pools.count(pool_id) == 0 ||
11263 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
11264 ss << "set cache-mode for pool '" << poolstr << "'"
11265 << " to " << pg_pool_t::get_cache_mode_name(mode);
11266 err = 0;
11267 goto reply;
11268 }
11269
11270 /* Mode description:
11271 *
11272 * none: No cache-mode defined
11273 * forward: Forward all reads and writes to base pool
11274 * writeback: Cache writes, promote reads from base pool
11275 * readonly: Forward writes to base pool
11276 * readforward: Writes are in writeback mode, Reads are in forward mode
11277 * proxy: Proxy all reads and writes to base pool
11278 * readproxy: Writes are in writeback mode, Reads are in proxy mode
11279 *
11280 * Hence, these are the allowed transitions:
11281 *
11282 * none -> any
11283 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
11284 * proxy -> forward || readforward || readproxy || writeback || any IF num_objects_dirty == 0
11285 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
11286 * readproxy -> forward || proxy || readforward || writeback || any IF num_objects_dirty == 0
11287 * writeback -> readforward || readproxy || forward || proxy
11288 * readonly -> any
11289 */
11290
11291 // We check if the transition is valid against the current pool mode, as
11292 // it is the only committed state thus far. We will blantly squash
11293 // whatever mode is on the pending state.
11294
11295 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
11296 (mode != pg_pool_t::CACHEMODE_FORWARD &&
11297 mode != pg_pool_t::CACHEMODE_PROXY &&
11298 mode != pg_pool_t::CACHEMODE_READFORWARD &&
11299 mode != pg_pool_t::CACHEMODE_READPROXY)) {
11300 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
11301 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
11302 << "' pool; only '"
11303 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_FORWARD)
11304 << "','"
11305 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY)
11306 << "','"
11307 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READFORWARD)
11308 << "','"
11309 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
11310 << "' allowed.";
11311 err = -EINVAL;
11312 goto reply;
11313 }
11314 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
11315 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11316 mode != pg_pool_t::CACHEMODE_FORWARD &&
11317 mode != pg_pool_t::CACHEMODE_PROXY &&
11318 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
11319
11320 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
11321 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11322 mode != pg_pool_t::CACHEMODE_FORWARD &&
11323 mode != pg_pool_t::CACHEMODE_READFORWARD &&
11324 mode != pg_pool_t::CACHEMODE_PROXY)) ||
11325
11326 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
11327 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11328 mode != pg_pool_t::CACHEMODE_FORWARD &&
11329 mode != pg_pool_t::CACHEMODE_READFORWARD &&
11330 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
11331
11332 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
11333 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11334 mode != pg_pool_t::CACHEMODE_READFORWARD &&
11335 mode != pg_pool_t::CACHEMODE_PROXY &&
11336 mode != pg_pool_t::CACHEMODE_READPROXY))) {
11337
31f18b77
FG
11338 const pool_stat_t* pstats =
11339 mon->pgservice->get_pool_stat(pool_id);
7c673cae 11340
31f18b77 11341 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
7c673cae
FG
11342 ss << "unable to set cache-mode '"
11343 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
11344 << "': dirty objects found";
11345 err = -EBUSY;
11346 goto reply;
11347 }
11348 }
11349 // go
11350 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
11351 np->cache_mode = mode;
11352 // set this both when moving to and from cache_mode NONE. this is to
11353 // capture legacy pools that were set up before this flag existed.
11354 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
11355 ss << "set cache-mode for pool '" << poolstr
11356 << "' to " << pg_pool_t::get_cache_mode_name(mode);
11357 if (mode == pg_pool_t::CACHEMODE_NONE) {
11358 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
11359 assert(base_pool);
11360 if (base_pool->read_tier == pool_id ||
11361 base_pool->write_tier == pool_id)
11362 ss <<" (WARNING: pool is still configured as read or write tier)";
11363 }
11364 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
11365 get_last_committed() + 1));
11366 return true;
11367 } else if (prefix == "osd tier add-cache") {
11368 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
11369 if (err == -EAGAIN)
11370 goto wait;
11371 if (err)
11372 goto reply;
11373 string poolstr;
11374 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
11375 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11376 if (pool_id < 0) {
11377 ss << "unrecognized pool '" << poolstr << "'";
11378 err = -ENOENT;
11379 goto reply;
11380 }
11381 string tierpoolstr;
11382 cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
11383 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
11384 if (tierpool_id < 0) {
11385 ss << "unrecognized pool '" << tierpoolstr << "'";
11386 err = -ENOENT;
11387 goto reply;
11388 }
11389 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11390 assert(p);
11391 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11392 assert(tp);
11393
11394 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
11395 goto reply;
11396 }
11397
11398 int64_t size = 0;
11399 if (!cmd_getval(g_ceph_context, cmdmap, "size", size)) {
11400 ss << "unable to parse 'size' value '"
11401 << cmd_vartype_stringify(cmdmap["size"]) << "'";
11402 err = -EINVAL;
11403 goto reply;
11404 }
11405 // make sure new tier is empty
31f18b77
FG
11406 const pool_stat_t *pstats =
11407 mon->pgservice->get_pool_stat(tierpool_id);
11408 if (pstats && pstats->stats.sum.num_objects != 0) {
7c673cae
FG
11409 ss << "tier pool '" << tierpoolstr << "' is not empty";
11410 err = -ENOTEMPTY;
11411 goto reply;
11412 }
11413 string modestr = g_conf->osd_tier_default_cache_mode;
11414 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
11415 if (mode < 0) {
11416 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
11417 err = -EINVAL;
11418 goto reply;
11419 }
11420 HitSet::Params hsp;
11421 if (g_conf->osd_tier_default_cache_hit_set_type == "bloom") {
11422 BloomHitSet::Params *bsp = new BloomHitSet::Params;
11423 bsp->set_fpp(g_conf->osd_pool_default_hit_set_bloom_fpp);
11424 hsp = HitSet::Params(bsp);
11425 } else if (g_conf->osd_tier_default_cache_hit_set_type == "explicit_hash") {
11426 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
11427 }
11428 else if (g_conf->osd_tier_default_cache_hit_set_type == "explicit_object") {
11429 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
11430 } else {
11431 ss << "osd tier cache default hit set type '" <<
11432 g_conf->osd_tier_default_cache_hit_set_type << "' is not a known type";
11433 err = -EINVAL;
11434 goto reply;
11435 }
11436 // go
11437 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
11438 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
11439 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
11440 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11441 return true;
11442 }
11443 np->tiers.insert(tierpool_id);
11444 np->read_tier = np->write_tier = tierpool_id;
11445 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
11446 np->set_last_force_op_resend(pending_inc.epoch);
11447 ntp->set_last_force_op_resend(pending_inc.epoch);
11448 ntp->tier_of = pool_id;
11449 ntp->cache_mode = mode;
11450 ntp->hit_set_count = g_conf->osd_tier_default_cache_hit_set_count;
11451 ntp->hit_set_period = g_conf->osd_tier_default_cache_hit_set_period;
11452 ntp->min_read_recency_for_promote = g_conf->osd_tier_default_cache_min_read_recency_for_promote;
11453 ntp->min_write_recency_for_promote = g_conf->osd_tier_default_cache_min_write_recency_for_promote;
11454 ntp->hit_set_grade_decay_rate = g_conf->osd_tier_default_cache_hit_set_grade_decay_rate;
11455 ntp->hit_set_search_last_n = g_conf->osd_tier_default_cache_hit_set_search_last_n;
11456 ntp->hit_set_params = hsp;
11457 ntp->target_max_bytes = size;
11458 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
11459 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
11460 get_last_committed() + 1));
11461 return true;
11462 } else if (prefix == "osd pool set-quota") {
11463 string poolstr;
11464 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
11465 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11466 if (pool_id < 0) {
11467 ss << "unrecognized pool '" << poolstr << "'";
11468 err = -ENOENT;
11469 goto reply;
11470 }
11471
11472 string field;
11473 cmd_getval(g_ceph_context, cmdmap, "field", field);
11474 if (field != "max_objects" && field != "max_bytes") {
11475 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
11476 err = -EINVAL;
11477 goto reply;
11478 }
11479
11480 // val could contain unit designations, so we treat as a string
11481 string val;
11482 cmd_getval(g_ceph_context, cmdmap, "val", val);
11483 stringstream tss;
11484 int64_t value = unit_to_bytesize(val, &tss);
11485 if (value < 0) {
11486 ss << "error parsing value '" << value << "': " << tss.str();
11487 err = value;
11488 goto reply;
11489 }
11490
11491 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
11492 if (field == "max_objects") {
11493 pi->quota_max_objects = value;
11494 } else if (field == "max_bytes") {
11495 pi->quota_max_bytes = value;
11496 } else {
11497 assert(0 == "unrecognized option");
11498 }
11499 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
11500 rs = ss.str();
11501 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11502 get_last_committed() + 1));
11503 return true;
c07f9fc5
FG
11504 } else if (prefix == "osd pool application enable" ||
11505 prefix == "osd pool application disable" ||
11506 prefix == "osd pool application set" ||
11507 prefix == "osd pool application rm") {
11508 err = prepare_command_pool_application(prefix, cmdmap, ss);
11509 if (err == -EAGAIN)
11510 goto wait;
11511 if (err < 0)
11512 goto reply;
7c673cae 11513
c07f9fc5
FG
11514 getline(ss, rs);
11515 wait_for_finished_proposal(
11516 op, new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
11517 return true;
7c673cae
FG
11518 } else if (prefix == "osd reweight-by-pg" ||
11519 prefix == "osd reweight-by-utilization" ||
11520 prefix == "osd test-reweight-by-pg" ||
11521 prefix == "osd test-reweight-by-utilization") {
11522 bool by_pg =
11523 prefix == "osd reweight-by-pg" || prefix == "osd test-reweight-by-pg";
11524 bool dry_run =
11525 prefix == "osd test-reweight-by-pg" ||
11526 prefix == "osd test-reweight-by-utilization";
11527 int64_t oload;
11528 cmd_getval(g_ceph_context, cmdmap, "oload", oload, int64_t(120));
11529 set<int64_t> pools;
11530 vector<string> poolnamevec;
11531 cmd_getval(g_ceph_context, cmdmap, "pools", poolnamevec);
11532 for (unsigned j = 0; j < poolnamevec.size(); j++) {
11533 int64_t pool = osdmap.lookup_pg_pool_name(poolnamevec[j]);
11534 if (pool < 0) {
11535 ss << "pool '" << poolnamevec[j] << "' does not exist";
11536 err = -ENOENT;
11537 goto reply;
11538 }
11539 pools.insert(pool);
11540 }
11541 double max_change = g_conf->mon_reweight_max_change;
11542 cmd_getval(g_ceph_context, cmdmap, "max_change", max_change);
11543 if (max_change <= 0.0) {
11544 ss << "max_change " << max_change << " must be positive";
11545 err = -EINVAL;
11546 goto reply;
11547 }
11548 int64_t max_osds = g_conf->mon_reweight_max_osds;
11549 cmd_getval(g_ceph_context, cmdmap, "max_osds", max_osds);
11550 if (max_osds <= 0) {
11551 ss << "max_osds " << max_osds << " must be positive";
11552 err = -EINVAL;
11553 goto reply;
11554 }
11555 string no_increasing;
11556 cmd_getval(g_ceph_context, cmdmap, "no_increasing", no_increasing);
11557 string out_str;
11558 mempool::osdmap::map<int32_t, uint32_t> new_weights;
31f18b77
FG
11559 err = mon->pgservice->reweight_by_utilization(osdmap,
11560 oload,
11561 max_change,
11562 max_osds,
11563 by_pg,
11564 pools.empty() ? NULL : &pools,
11565 no_increasing == "--no-increasing",
11566 &new_weights,
11567 &ss, &out_str, f.get());
7c673cae
FG
11568 if (err >= 0) {
11569 dout(10) << "reweight::by_utilization: finished with " << out_str << dendl;
11570 }
11571 if (f)
11572 f->flush(rdata);
11573 else
11574 rdata.append(out_str);
11575 if (err < 0) {
11576 ss << "FAILED reweight-by-pg";
11577 } else if (err == 0 || dry_run) {
11578 ss << "no change";
11579 } else {
11580 ss << "SUCCESSFUL reweight-by-pg";
11581 pending_inc.new_weight = std::move(new_weights);
11582 wait_for_finished_proposal(
11583 op,
11584 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
11585 return true;
11586 }
c07f9fc5
FG
11587 } else if (prefix == "osd force-create-pg") {
11588 pg_t pgid;
11589 string pgidstr;
11590 cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr);
11591 if (!pgid.parse(pgidstr.c_str())) {
11592 ss << "invalid pgid '" << pgidstr << "'";
11593 err = -EINVAL;
11594 goto reply;
11595 }
94b18763
FG
11596 if (!osdmap.pg_exists(pgid)) {
11597 ss << "pg " << pgid << " should not exist";
11598 err = -ENOENT;
11599 goto reply;
11600 }
c07f9fc5
FG
11601 bool creating_now;
11602 {
11603 std::lock_guard<std::mutex> l(creating_pgs_lock);
11604 auto emplaced = creating_pgs.pgs.emplace(pgid,
11605 make_pair(osdmap.get_epoch(),
11606 ceph_clock_now()));
11607 creating_now = emplaced.second;
11608 }
11609 if (creating_now) {
11610 ss << "pg " << pgidstr << " now creating, ok";
11611 err = 0;
11612 goto update;
11613 } else {
11614 ss << "pg " << pgid << " already creating";
11615 err = 0;
11616 goto reply;
11617 }
7c673cae
FG
11618 } else {
11619 err = -EINVAL;
11620 }
11621
11622 reply:
11623 getline(ss, rs);
11624 if (err < 0 && rs.length() == 0)
11625 rs = cpp_strerror(err);
11626 mon->reply_command(op, err, rs, rdata, get_last_committed());
11627 return ret;
11628
11629 update:
11630 getline(ss, rs);
11631 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11632 get_last_committed() + 1));
11633 return true;
11634
11635 wait:
11636 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11637 return true;
11638}
11639
28e407b8 11640bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
7c673cae
FG
11641{
11642 op->mark_osdmon_event(__func__);
28e407b8 11643
7c673cae 11644 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
28e407b8
AA
11645 MonSession *session = m->get_session();
11646 if (!session) {
11647 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
11648 return true;
11649 }
11650
11651 switch (m->op) {
11652 case POOL_OP_CREATE_UNMANAGED_SNAP:
11653 case POOL_OP_DELETE_UNMANAGED_SNAP:
11654 {
11655 const std::string* pool_name = nullptr;
11656 const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
11657 if (pg_pool != nullptr) {
11658 pool_name = &osdmap.get_pool_name(m->pool);
11659 }
11660
11661 if (!is_unmanaged_snap_op_permitted(cct, mon->key_server,
11662 session->entity_name, session->caps,
11663 pool_name)) {
11664 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
11665 << "privileges. message: " << *m << std::endl
11666 << "caps: " << session->caps << dendl;
11667 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
11668 return true;
11669 }
11670 }
11671 break;
11672 default:
11673 if (!session->is_capable("osd", MON_CAP_W)) {
11674 dout(0) << "got pool op from entity with insufficient privileges. "
11675 << "message: " << *m << std::endl
11676 << "caps: " << session->caps << dendl;
11677 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
11678 return true;
11679 }
11680 break;
11681 }
11682
11683 return false;
11684}
11685
/**
 * Read-side handling for an MPoolOp.
 *
 * Answers requests that can be satisfied (or rejected) without touching
 * the osdmap: cap failures, wrong fsid, and idempotent / invalid
 * snapshot operations.
 *
 * @return true if a reply was sent and the op is finished; false if the
 *         op must continue to prepare_pool_op() for a map update.
 */
bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MPoolOp *m = static_cast<MPoolOp*>(op->get_req());

  // capability check; on failure it has already replied -EPERM
  if (enforce_pool_op_caps(op)) {
    return true;
  }

  // drop ops addressed to a different cluster
  if (m->fsid != mon->monmap->fsid) {
    dout(0) << __func__ << " drop message on fsid " << m->fsid
            << " != " << mon->monmap->fsid << " for " << *m << dendl;
    _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
    return true;
  }

  if (m->op == POOL_OP_CREATE)
    return preprocess_pool_op_create(op);

  // every remaining op targets an existing pool; a vanished pool makes
  // the op a no-op success
  if (!osdmap.get_pg_pool(m->pool)) {
    dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
    _pool_op_reply(op, 0, osdmap.get_epoch());
    return true;
  }

  // check if the snap and snapname exist
  bool snap_exists = false;
  const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
  if (p->snap_exists(m->name.c_str()))
    snap_exists = true;

  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // pool snaps are incompatible with unmanaged-snaps mode and with
    // cache-tier pools
    if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (snap_exists) {
      // idempotent: snap already present
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_SNAP:
    if (p->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (!snap_exists) {
      // idempotent: nothing to delete
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (p->is_removed_snap(m->snapid)) {
      // idempotent: snap already removed
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE:
    // NOTE(review): replies success while a pool with this name still
    // exists in the map; the actual removal is done in
    // prepare_pool_op_delete() when we fall through -- confirm intended
    // early-reply semantics against callers.
    if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_AUID_CHANGE:
    // always needs a map update
    return false;
  default:
    ceph_abort();
    break;
  }

  return false;
}
11769
11770bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
11771{
11772 op->mark_osdmon_event(__func__);
11773 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
7c673cae
FG
11774 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
11775 if (pool >= 0) {
11776 _pool_op_reply(op, 0, osdmap.get_epoch());
11777 return true;
11778 }
11779
11780 return false;
11781}
11782
/**
 * Write-side handling for an MPoolOp: apply snapshot / auid changes to
 * the pending osdmap increment.
 *
 * @return true if a proposal was queued (client reply deferred until it
 *         commits); false if a reply was sent immediately.
 */
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
  dout(10) << "prepare_pool_op " << *m << dendl;
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  // first pass: validate against the *committed* pool state and answer
  // the idempotent / invalid cases immediately.  Note the deliberate
  // case fall-throughs below.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (pool->is_tier()) {
      ret = -EINVAL;
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;
    } // else, fall through
  case POOL_OP_DELETE_SNAP:
    if (!pool->is_unmanaged_snaps_mode()) {
      bool snap_exists = pool->snap_exists(m->name.c_str());
      // create-of-existing and delete-of-missing are no-ops (ret == 0)
      if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
        || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
        ret = 0;
      } else {
        // real work to do; fall out of the switch
        break;
      }
    } else {
      ret = -EINVAL;
    }
    _pool_op_reply(op, ret, osdmap.get_epoch());
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // we won't allow removal of an unmanaged snapshot from a pool
    // not in unmanaged snaps mode.
    if (!pool->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
      return false;
    }
    /* fall-thru */
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // but we will allow creating an unmanaged snapshot on any pool
    // as long as it is not in 'pool' snaps mode.
    if (pool->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return false;
    }
  }

  // projected pool info: start from any update already queued in this
  // proposal interval so we don't clobber it
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // pool snaps vs unmanaged snaps are mutually exclusive; re-check
  // against the *projected* state as well
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // second pass: apply the change to the projected pool
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
	pp.remove_snap(s);
	changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      uint64_t snapid;
      pp.add_unmanaged_snap(snapid);
      // hand the newly allocated snapid back to the client
      ::encode(snapid, reply_data);
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (!pp.is_removed_snap(m->snapid)) {
      // reject snapids beyond the pool's current snap sequence
      if (m->snapid > pp.get_snap_seq()) {
        _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
        return false;
      }
      pp.remove_unmanaged_snap(m->snapid);
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    if (pp.auid != m->auid) {
      pp.auid = m->auid;
      changed = true;
    }
    break;

  default:
    ceph_abort();
    break;
  }

  if (changed) {
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
11930
11931bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
11932{
11933 op->mark_osdmon_event(__func__);
11934 int err = prepare_new_pool(op);
11935 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
11936 return true;
11937}
11938
11939int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
11940 ostream *ss)
11941{
11942 const string& poolstr = osdmap.get_pool_name(pool_id);
11943
11944 // If the Pool is in use by CephFS, refuse to delete it
28e407b8 11945 FSMap const &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
7c673cae
FG
11946 if (pending_fsmap.pool_in_use(pool_id)) {
11947 *ss << "pool '" << poolstr << "' is in use by CephFS";
11948 return -EBUSY;
11949 }
11950
11951 if (pool.tier_of >= 0) {
11952 *ss << "pool '" << poolstr << "' is a tier of '"
11953 << osdmap.get_pool_name(pool.tier_of) << "'";
11954 return -EBUSY;
11955 }
11956 if (!pool.tiers.empty()) {
11957 *ss << "pool '" << poolstr << "' has tiers";
11958 for(auto tier : pool.tiers) {
11959 *ss << " " << osdmap.get_pool_name(tier);
11960 }
11961 return -EBUSY;
11962 }
11963
11964 if (!g_conf->mon_allow_pool_delete) {
11965 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
11966 return -EPERM;
11967 }
11968
11969 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
11970 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
11971 return -EPERM;
11972 }
11973
11974 *ss << "pool '" << poolstr << "' removed";
11975 return 0;
11976}
11977
11978/**
11979 * Check if it is safe to add a tier to a base pool
11980 *
11981 * @return
11982 * True if the operation should proceed, false if we should abort here
11983 * (abort doesn't necessarily mean error, could be idempotency)
11984 */
11985bool OSDMonitor::_check_become_tier(
11986 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
11987 const int64_t base_pool_id, const pg_pool_t *base_pool,
11988 int *err,
11989 ostream *ss) const
11990{
11991 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
11992 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
11993
28e407b8 11994 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
7c673cae
FG
11995 if (pending_fsmap.pool_in_use(tier_pool_id)) {
11996 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
11997 *err = -EBUSY;
11998 return false;
11999 }
12000
12001 if (base_pool->tiers.count(tier_pool_id)) {
12002 assert(tier_pool->tier_of == base_pool_id);
12003 *err = 0;
12004 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
12005 << base_pool_name << "'";
12006 return false;
12007 }
12008
12009 if (base_pool->is_tier()) {
12010 *ss << "pool '" << base_pool_name << "' is already a tier of '"
12011 << osdmap.get_pool_name(base_pool->tier_of) << "', "
12012 << "multiple tiers are not yet supported.";
12013 *err = -EINVAL;
12014 return false;
12015 }
12016
12017 if (tier_pool->has_tiers()) {
12018 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
12019 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
12020 it != tier_pool->tiers.end(); ++it)
12021 *ss << "'" << osdmap.get_pool_name(*it) << "',";
12022 *ss << " multiple tiers are not yet supported.";
12023 *err = -EINVAL;
12024 return false;
12025 }
12026
12027 if (tier_pool->is_tier()) {
12028 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
12029 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
12030 *err = -EINVAL;
12031 return false;
12032 }
12033
12034 *err = 0;
12035 return true;
12036}
12037
12038
12039/**
12040 * Check if it is safe to remove a tier from this base pool
12041 *
12042 * @return
12043 * True if the operation should proceed, false if we should abort here
12044 * (abort doesn't necessarily mean error, could be idempotency)
12045 */
12046bool OSDMonitor::_check_remove_tier(
12047 const int64_t base_pool_id, const pg_pool_t *base_pool,
12048 const pg_pool_t *tier_pool,
12049 int *err, ostream *ss) const
12050{
12051 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
12052
12053 // Apply CephFS-specific checks
28e407b8 12054 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
7c673cae 12055 if (pending_fsmap.pool_in_use(base_pool_id)) {
94b18763
FG
12056 if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
12057 // If the underlying pool is erasure coded and does not allow EC
12058 // overwrites, we can't permit the removal of the replicated tier that
12059 // CephFS relies on to access it
12060 *ss << "pool '" << base_pool_name <<
12061 "' does not allow EC overwrites and is in use by CephFS"
12062 " via its tier";
7c673cae
FG
12063 *err = -EBUSY;
12064 return false;
12065 }
12066
12067 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
12068 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
12069 "tier is still in use as a writeback cache. Change the cache "
12070 "mode and flush the cache before removing it";
12071 *err = -EBUSY;
12072 return false;
12073 }
12074 }
12075
12076 *err = 0;
12077 return true;
12078}
12079
/**
 * Stage removal of a pool in the pending osdmap increment.
 *
 * @param pool     id of the pool to remove
 * @param ss       receives a human-readable success/failure message
 * @param no_fake  bypass the mon_fake_pool_delete rename-instead hack
 * @return 0 on success (including already-pending removal), negative
 *         errno on failure; -EAGAIN if conflicting pending state must
 *         commit first and the op should be retried.
 */
int OSDMonitor::_prepare_remove_pool(
  int64_t pool, ostream *ss, bool no_fake)
{
  dout(10) << __func__ << " " << pool << dendl;
  const pg_pool_t *p = osdmap.get_pg_pool(pool);
  int r = _check_remove_pool(pool, *p, ss);
  if (r < 0)
    return r;

  // re-run the checks against any pending version of this pool
  auto new_pool = pending_inc.new_pools.find(pool);
  if (new_pool != pending_inc.new_pools.end()) {
    // if there is a problem with the pending info, wait and retry
    // this op.
    const auto& p = new_pool->second;
    int r = _check_remove_pool(pool, p, ss);
    if (r < 0)
      return -EAGAIN;
  }

  if (pending_inc.old_pools.count(pool)) {
    // idempotent: removal already queued
    dout(10) << __func__ << " " << pool << " already pending removal"
	     << dendl;
    return 0;
  }

  if (g_conf->mon_fake_pool_delete && !no_fake) {
    // "fake" deletion: rename the pool out of the way instead of
    // actually destroying the data
    string old_name = osdmap.get_pool_name(pool);
    string new_name = old_name + "." + stringify(pool) + ".DELETED";
    dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
	    << old_name << " -> " << new_name << dendl;
    pending_inc.new_pool_names[pool] = new_name;
    return 0;
  }

  // remove
  pending_inc.old_pools.insert(pool);

  // remove any pg_temp mappings for this pool
  for (auto p = osdmap.pg_temp->begin();
       p != osdmap.pg_temp->end();
       ++p) {
    if (p->first.pool() == (uint64_t)pool) {
      dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
	       << p->first << dendl;
      pending_inc.new_pg_temp[p->first].clear();
    }
  }
  // remove any primary_temp mappings for this pool
  for (auto p = osdmap.primary_temp->begin();
       p != osdmap.primary_temp->end();
       ++p) {
    if (p->first.pool() == (uint64_t)pool) {
      dout(10) << __func__ << " " << pool
	       << " removing obsolete primary_temp" << p->first << dendl;
      pending_inc.new_primary_temp[p->first] = -1;
    }
  }
  // remove any pg_upmap mappings for this pool
  for (auto& p : osdmap.pg_upmap) {
    if (p.first.pool() == (uint64_t)pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap "
               << p.first << dendl;
      pending_inc.old_pg_upmap.insert(p.first);
    }
  }
  // remove any pending pg_upmap mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap.begin();
    while (it != pending_inc.new_pg_upmap.end()) {
      if (it->first.pool() == (uint64_t)pool) {
        dout(10) << __func__ << " " << pool
                 << " removing pending pg_upmap "
                 << it->first << dendl;
        it = pending_inc.new_pg_upmap.erase(it);
      } else {
        it++;
      }
    }
  }
  // remove any pg_upmap_items mappings for this pool
  for (auto& p : osdmap.pg_upmap_items) {
    if (p.first.pool() == (uint64_t)pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap_items " << p.first
               << dendl;
      pending_inc.old_pg_upmap_items.insert(p.first);
    }
  }
  // remove any pending pg_upmap_items mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap_items.begin();
    while (it != pending_inc.new_pg_upmap_items.end()) {
      if (it->first.pool() == (uint64_t)pool) {
        dout(10) << __func__ << " " << pool
                 << " removing pending pg_upmap_items "
                 << it->first << dendl;
        it = pending_inc.new_pg_upmap_items.erase(it);
      } else {
        it++;
      }
    }
  }

  // remove any choose_args for this pool
  CrushWrapper newcrush;
  _get_pending_crush(newcrush);
  if (newcrush.have_choose_args(pool)) {
    dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
    newcrush.rm_choose_args(pool);
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
  }
  return 0;
}
12195
12196int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
12197{
12198 dout(10) << "_prepare_rename_pool " << pool << dendl;
12199 if (pending_inc.old_pools.count(pool)) {
12200 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
12201 return -ENOENT;
12202 }
12203 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
12204 p != pending_inc.new_pool_names.end();
12205 ++p) {
12206 if (p->second == newname && p->first != pool) {
12207 return -EEXIST;
12208 }
12209 }
12210
12211 pending_inc.new_pool_names[pool] = newname;
12212 return 0;
12213}
12214
12215bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
12216{
12217 op->mark_osdmon_event(__func__);
12218 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
12219 ostringstream ss;
12220 int ret = _prepare_remove_pool(m->pool, &ss, false);
12221 if (ret == -EAGAIN) {
12222 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12223 return true;
12224 }
12225 if (ret < 0)
12226 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
12227 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
12228 pending_inc.epoch));
12229 return true;
12230}
12231
12232void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
12233 int ret, epoch_t epoch, bufferlist *blp)
12234{
12235 op->mark_osdmon_event(__func__);
12236 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
12237 dout(20) << "_pool_op_reply " << ret << dendl;
12238 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
12239 ret, epoch, get_last_committed(), blp);
12240 mon->send_reply(op, reply);
12241}