]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/OSDMonitor.cc
update sources to v12.2.0
[ceph.git] / ceph / src / mon / OSDMonitor.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19#include <algorithm>
224ce89b
WB
20#include <boost/algorithm/string.hpp>
21#include <locale>
7c673cae
FG
22#include <sstream>
23
31f18b77
FG
24#include "mon/OSDMonitor.h"
25#include "mon/Monitor.h"
26#include "mon/MDSMonitor.h"
27#include "mon/PGMonitor.h"
28#include "mon/MgrStatMonitor.h"
29#include "mon/AuthMonitor.h"
30#include "mon/ConfigKeyService.h"
7c673cae 31
31f18b77
FG
32#include "mon/MonitorDBStore.h"
33#include "mon/Session.h"
7c673cae
FG
34
35#include "crush/CrushWrapper.h"
36#include "crush/CrushTester.h"
37#include "crush/CrushTreeDumper.h"
38
39#include "messages/MOSDBeacon.h"
40#include "messages/MOSDFailure.h"
41#include "messages/MOSDMarkMeDown.h"
42#include "messages/MOSDFull.h"
43#include "messages/MOSDMap.h"
44#include "messages/MMonGetOSDMap.h"
45#include "messages/MOSDBoot.h"
46#include "messages/MOSDAlive.h"
47#include "messages/MPoolOp.h"
48#include "messages/MPoolOpReply.h"
49#include "messages/MOSDPGCreate.h"
50#include "messages/MOSDPGCreated.h"
51#include "messages/MOSDPGTemp.h"
52#include "messages/MMonCommand.h"
53#include "messages/MRemoveSnaps.h"
54#include "messages/MOSDScrub.h"
55#include "messages/MRoute.h"
56
57#include "common/TextTable.h"
58#include "common/Timer.h"
59#include "common/ceph_argparse.h"
60#include "common/perf_counters.h"
61#include "common/strtol.h"
62
63#include "common/config.h"
64#include "common/errno.h"
65
66#include "erasure-code/ErasureCodePlugin.h"
67#include "compressor/Compressor.h"
68#include "common/Checksummer.h"
69
70#include "include/compat.h"
71#include "include/assert.h"
72#include "include/stringify.h"
73#include "include/util.h"
74#include "common/cmdparse.h"
75#include "include/str_list.h"
76#include "include/str_map.h"
224ce89b 77#include "include/scope_guard.h"
7c673cae
FG
78
79#include "json_spirit/json_spirit_reader.h"
80
c07f9fc5
FG
81#include <boost/algorithm/string/predicate.hpp>
82
7c673cae
FG
83#define dout_subsys ceph_subsys_mon
84#define OSD_PG_CREATING_PREFIX "osd_pg_creating"
85
c07f9fc5
FG
86namespace {
87
88const uint32_t MAX_POOL_APPLICATIONS = 4;
89const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
90const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
91
92} // anonymous namespace
93
7c673cae
FG
94void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean)
95{
96 if (epoch_by_pg.size() <= ps) {
97 epoch_by_pg.resize(ps + 1, 0);
98 }
99 const auto old_lec = epoch_by_pg[ps];
100 if (old_lec >= last_epoch_clean) {
101 // stale lec
102 return;
103 }
104 epoch_by_pg[ps] = last_epoch_clean;
105 if (last_epoch_clean < floor) {
106 floor = last_epoch_clean;
107 } else if (last_epoch_clean > floor) {
108 if (old_lec == floor) {
109 // probably should increase floor?
110 auto new_floor = std::min_element(std::begin(epoch_by_pg),
111 std::end(epoch_by_pg));
112 floor = *new_floor;
113 }
114 }
115 if (ps != next_missing) {
116 return;
117 }
118 for (; next_missing < epoch_by_pg.size(); next_missing++) {
119 if (epoch_by_pg[next_missing] == 0) {
120 break;
121 }
122 }
123}
124
125void LastEpochClean::remove_pool(uint64_t pool)
126{
127 report_by_pool.erase(pool);
128}
129
130void LastEpochClean::report(const pg_t& pg, epoch_t last_epoch_clean)
131{
132 auto& lec = report_by_pool[pg.pool()];
133 return lec.report(pg.ps(), last_epoch_clean);
134}
135
136epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
137{
138 auto floor = latest.get_epoch();
139 for (auto& pool : latest.get_pools()) {
140 auto reported = report_by_pool.find(pool.first);
141 if (reported == report_by_pool.end()) {
142 return 0;
143 }
144 if (reported->second.next_missing < pool.second.get_pg_num()) {
145 return 0;
146 }
147 if (reported->second.floor < floor) {
148 floor = reported->second.floor;
149 }
150 }
151 return floor;
152}
153
154
155struct C_UpdateCreatingPGs : public Context {
156 OSDMonitor *osdmon;
157 utime_t start;
158 epoch_t epoch;
159 C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
160 osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
161 void finish(int r) override {
162 if (r >= 0) {
163 utime_t end = ceph_clock_now();
164 dout(10) << "osdmap epoch " << epoch << " mapping took "
165 << (end - start) << " seconds" << dendl;
166 osdmon->update_creating_pgs();
167 osdmon->check_pg_creates_subs();
168 }
169 }
170};
171
172#undef dout_prefix
173#define dout_prefix _prefix(_dout, mon, osdmap)
174static ostream& _prefix(std::ostream *_dout, Monitor *mon, const OSDMap& osdmap) {
175 return *_dout << "mon." << mon->name << "@" << mon->rank
176 << "(" << mon->get_state_name()
177 << ").osd e" << osdmap.get_epoch() << " ";
178}
179
180OSDMonitor::OSDMonitor(
181 CephContext *cct,
182 Monitor *mn,
183 Paxos *p,
184 const string& service_name)
185 : PaxosService(mn, p, service_name),
186 cct(cct),
187 inc_osd_cache(g_conf->mon_osd_cache_size),
188 full_osd_cache(g_conf->mon_osd_cache_size),
189 last_attempted_minwait_time(utime_t()),
190 mapper(mn->cct, &mn->cpu_tp),
191 op_tracker(cct, true, 1)
192{}
193
194bool OSDMonitor::_have_pending_crush()
195{
196 return pending_inc.crush.length() > 0;
197}
198
199CrushWrapper &OSDMonitor::_get_stable_crush()
200{
201 return *osdmap.crush;
202}
203
204void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush)
205{
206 bufferlist bl;
207 if (pending_inc.crush.length())
208 bl = pending_inc.crush;
209 else
210 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
211
212 bufferlist::iterator p = bl.begin();
213 newcrush.decode(p);
214}
215
216void OSDMonitor::create_initial()
217{
218 dout(10) << "create_initial for " << mon->monmap->fsid << dendl;
219
220 OSDMap newmap;
221
222 bufferlist bl;
223 mon->store->get("mkfs", "osdmap", bl);
224
225 if (bl.length()) {
226 newmap.decode(bl);
227 newmap.set_fsid(mon->monmap->fsid);
228 } else {
224ce89b 229 newmap.build_simple(g_ceph_context, 0, mon->monmap->fsid, 0);
7c673cae
FG
230 }
231 newmap.set_epoch(1);
232 newmap.created = newmap.modified = ceph_clock_now();
233
234 // new clusters should sort bitwise by default.
235 newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);
236
237 // new cluster should require latest by default
31f18b77
FG
238 if (g_conf->mon_debug_no_require_luminous) {
239 newmap.require_osd_release = CEPH_RELEASE_KRAKEN;
240 derr << __func__ << " mon_debug_no_require_luminous=true" << dendl;
241 } else {
242 newmap.require_osd_release = CEPH_RELEASE_LUMINOUS;
c07f9fc5 243 newmap.flags |= CEPH_OSDMAP_RECOVERY_DELETES;
7c673cae
FG
244 newmap.full_ratio = g_conf->mon_osd_full_ratio;
245 if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
246 newmap.backfillfull_ratio = g_conf->mon_osd_backfillfull_ratio;
247 if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
248 newmap.nearfull_ratio = g_conf->mon_osd_nearfull_ratio;
249 if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;
31f18b77
FG
250 int r = ceph_release_from_name(
251 g_conf->mon_osd_initial_require_min_compat_client.c_str());
252 if (r <= 0) {
253 assert(0 == "mon_osd_initial_require_min_compat_client is not valid");
254 }
255 newmap.require_min_compat_client = r;
7c673cae
FG
256 }
257
258 // encode into pending incremental
259 newmap.encode(pending_inc.fullmap,
260 mon->get_quorum_con_features() | CEPH_FEATURE_RESERVED);
261 pending_inc.full_crc = newmap.get_crc();
262 dout(20) << " full crc " << pending_inc.full_crc << dendl;
263}
264
265void OSDMonitor::get_store_prefixes(std::set<string>& s)
266{
267 s.insert(service_name);
268 s.insert(OSD_PG_CREATING_PREFIX);
269}
270
271void OSDMonitor::update_from_paxos(bool *need_bootstrap)
272{
273 version_t version = get_last_committed();
274 if (version == osdmap.epoch)
275 return;
276 assert(version > osdmap.epoch);
277
278 dout(15) << "update_from_paxos paxos e " << version
279 << ", my e " << osdmap.epoch << dendl;
280
31f18b77
FG
281 if (mapping_job) {
282 if (!mapping_job->is_done()) {
283 dout(1) << __func__ << " mapping job "
284 << mapping_job.get() << " did not complete, "
285 << mapping_job->shards << " left, canceling" << dendl;
286 mapping_job->abort();
287 }
288 mapping_job.reset();
289 }
7c673cae 290
224ce89b
WB
291 load_health();
292
7c673cae
FG
293 /*
294 * We will possibly have a stashed latest that *we* wrote, and we will
295 * always be sure to have the oldest full map in the first..last range
296 * due to encode_trim_extra(), which includes the oldest full map in the trim
297 * transaction.
298 *
299 * encode_trim_extra() does not however write the full map's
300 * version to 'full_latest'. This is only done when we are building the
301 * full maps from the incremental versions. But don't panic! We make sure
302 * that the following conditions find whichever full map version is newer.
303 */
304 version_t latest_full = get_version_latest_full();
305 if (latest_full == 0 && get_first_committed() > 1)
306 latest_full = get_first_committed();
307
308 if (get_first_committed() > 1 &&
309 latest_full < get_first_committed()) {
310 // the monitor could be just sync'ed with its peer, and the latest_full key
311 // is not encoded in the paxos commits in encode_pending(), so we need to
312 // make sure we get it pointing to a proper version.
313 version_t lc = get_last_committed();
314 version_t fc = get_first_committed();
315
316 dout(10) << __func__ << " looking for valid full map in interval"
317 << " [" << fc << ", " << lc << "]" << dendl;
318
319 latest_full = 0;
320 for (version_t v = lc; v >= fc; v--) {
321 string full_key = "full_" + stringify(v);
322 if (mon->store->exists(get_service_name(), full_key)) {
323 dout(10) << __func__ << " found latest full map v " << v << dendl;
324 latest_full = v;
325 break;
326 }
327 }
328
329 assert(latest_full > 0);
330 auto t(std::make_shared<MonitorDBStore::Transaction>());
331 put_version_latest_full(t, latest_full);
332 mon->store->apply_transaction(t);
333 dout(10) << __func__ << " updated the on-disk full map version to "
334 << latest_full << dendl;
335 }
336
337 if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
338 bufferlist latest_bl;
339 get_version_full(latest_full, latest_bl);
340 assert(latest_bl.length() != 0);
341 dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
342 osdmap.decode(latest_bl);
343 }
344
345 if (mon->monmap->get_required_features().contains_all(
346 ceph::features::mon::FEATURE_LUMINOUS)) {
347 bufferlist bl;
348 if (!mon->store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
349 auto p = bl.begin();
350 std::lock_guard<std::mutex> l(creating_pgs_lock);
351 creating_pgs.decode(p);
352 dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
353 << creating_pgs.last_scan_epoch
354 << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
355 } else {
356 dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
357 << dendl;
358 }
359 }
360
31f18b77
FG
361 // make sure we're using the right pg service.. remove me post-luminous!
362 if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
363 dout(10) << __func__ << " pgservice is mgrstat" << dendl;
364 mon->pgservice = mon->mgrstatmon()->get_pg_stat_service();
365 } else {
366 dout(10) << __func__ << " pgservice is pg" << dendl;
367 mon->pgservice = mon->pgmon()->get_pg_stat_service();
368 }
369
7c673cae
FG
370 // walk through incrementals
371 MonitorDBStore::TransactionRef t;
372 size_t tx_size = 0;
373 while (version > osdmap.epoch) {
374 bufferlist inc_bl;
375 int err = get_version(osdmap.epoch+1, inc_bl);
376 assert(err == 0);
377 assert(inc_bl.length());
378
379 dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
380 << dendl;
381 OSDMap::Incremental inc(inc_bl);
382 err = osdmap.apply_incremental(inc);
383 assert(err == 0);
384
385 if (!t)
386 t.reset(new MonitorDBStore::Transaction);
387
388 // Write out the full map for all past epochs. Encode the full
389 // map with the same features as the incremental. If we don't
390 // know, use the quorum features. If we don't know those either,
391 // encode with all features.
392 uint64_t f = inc.encode_features;
393 if (!f)
394 f = mon->get_quorum_con_features();
395 if (!f)
396 f = -1;
397 bufferlist full_bl;
398 osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
399 tx_size += full_bl.length();
400
401 bufferlist orig_full_bl;
402 get_version_full(osdmap.epoch, orig_full_bl);
403 if (orig_full_bl.length()) {
404 // the primary provided the full map
405 assert(inc.have_crc);
406 if (inc.full_crc != osdmap.crc) {
407 // This will happen if the mons were running mixed versions in
408 // the past or some other circumstance made the full encoded
409 // maps divergent. Reloading here will bring us back into
410 // sync with the primary for this and all future maps. OSDs
411 // will also be brought back into sync when they discover the
412 // crc mismatch and request a full map from a mon.
413 derr << __func__ << " full map CRC mismatch, resetting to canonical"
414 << dendl;
415 osdmap = OSDMap();
416 osdmap.decode(orig_full_bl);
417 }
418 } else {
419 assert(!inc.have_crc);
420 put_version_full(t, osdmap.epoch, full_bl);
421 }
422 put_version_latest_full(t, osdmap.epoch);
423
424 // share
425 dout(1) << osdmap << dendl;
426
427 if (osdmap.epoch == 1) {
428 t->erase("mkfs", "osdmap");
429 }
430
31f18b77
FG
431 // make sure we're using the right pg service.. remove me post-luminous!
432 if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
433 dout(10) << __func__ << " pgservice is mgrstat" << dendl;
434 mon->pgservice = mon->mgrstatmon()->get_pg_stat_service();
435 } else {
436 dout(10) << __func__ << " pgservice is pg" << dendl;
437 mon->pgservice = mon->pgmon()->get_pg_stat_service();
438 }
439
7c673cae
FG
440 if (tx_size > g_conf->mon_sync_max_payload_size*2) {
441 mon->store->apply_transaction(t);
442 t = MonitorDBStore::TransactionRef();
443 tx_size = 0;
444 }
445 if (mon->monmap->get_required_features().contains_all(
446 ceph::features::mon::FEATURE_LUMINOUS)) {
7c673cae
FG
447 for (const auto &osd_state : inc.new_state) {
448 if (osd_state.second & CEPH_OSD_UP) {
449 // could be marked up *or* down, but we're too lazy to check which
450 last_osd_report.erase(osd_state.first);
451 }
452 if (osd_state.second & CEPH_OSD_EXISTS) {
453 // could be created *or* destroyed, but we can safely drop it
454 osd_epochs.erase(osd_state.first);
455 }
456 }
457 }
458 }
459
460 if (t) {
461 mon->store->apply_transaction(t);
462 }
463
464 for (int o = 0; o < osdmap.get_max_osd(); o++) {
465 if (osdmap.is_out(o))
466 continue;
467 auto found = down_pending_out.find(o);
468 if (osdmap.is_down(o)) {
469 // populate down -> out map
470 if (found == down_pending_out.end()) {
471 dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
472 down_pending_out[o] = ceph_clock_now();
473 }
474 } else {
475 if (found != down_pending_out.end()) {
476 dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
477 down_pending_out.erase(found);
478 }
479 }
480 }
481 // XXX: need to trim MonSession connected with a osd whose id > max_osd?
482
483 if (mon->is_leader()) {
484 // kick pgmon, make sure it's seen the latest map
485 mon->pgmon()->check_osd_map(osdmap.epoch);
486 }
487
488 check_osdmap_subs();
489 check_pg_creates_subs();
490
491 share_map_with_random_osd();
492 update_logger();
493
494 process_failures();
495
496 // make sure our feature bits reflect the latest map
497 update_msgr_features();
498
499 if (!mon->is_leader()) {
500 // will be called by on_active() on the leader, avoid doing so twice
501 start_mapping();
502 }
503}
504
505void OSDMonitor::start_mapping()
506{
507 // initiate mapping job
508 if (mapping_job) {
509 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
510 << dendl;
511 mapping_job->abort();
512 }
224ce89b
WB
513 if (!osdmap.get_pools().empty()) {
514 auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
515 mapping_job = mapping.start_update(osdmap, mapper,
516 g_conf->mon_osd_mapping_pgs_per_chunk);
517 dout(10) << __func__ << " started mapping job " << mapping_job.get()
518 << " at " << fin->start << dendl;
519 mapping_job->set_finish_event(fin);
520 } else {
521 dout(10) << __func__ << " no pools, no mapping job" << dendl;
522 mapping_job = nullptr;
523 }
7c673cae
FG
524}
525
526void OSDMonitor::update_msgr_features()
527{
528 set<int> types;
529 types.insert((int)entity_name_t::TYPE_OSD);
530 types.insert((int)entity_name_t::TYPE_CLIENT);
531 types.insert((int)entity_name_t::TYPE_MDS);
532 types.insert((int)entity_name_t::TYPE_MON);
533 for (set<int>::iterator q = types.begin(); q != types.end(); ++q) {
534 uint64_t mask;
535 uint64_t features = osdmap.get_features(*q, &mask);
536 if ((mon->messenger->get_policy(*q).features_required & mask) != features) {
537 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
538 Messenger::Policy p = mon->messenger->get_policy(*q);
539 p.features_required = (p.features_required & ~mask) | features;
540 mon->messenger->set_policy(*q, p);
541 }
542 }
543}
544
545void OSDMonitor::on_active()
546{
547 update_logger();
548
549 if (mon->is_leader()) {
224ce89b 550 mon->clog->debug() << "osdmap " << osdmap;
7c673cae
FG
551 } else {
552 list<MonOpRequestRef> ls;
553 take_all_failures(ls);
554 while (!ls.empty()) {
555 MonOpRequestRef op = ls.front();
556 op->mark_osdmon_event(__func__);
557 dispatch(op);
558 ls.pop_front();
559 }
560 }
561 start_mapping();
562}
563
564void OSDMonitor::on_restart()
565{
566 last_osd_report.clear();
31f18b77
FG
567
568 if (mon->is_leader()) {
569 // fix ruleset != ruleid
570 if (osdmap.crush->has_legacy_rulesets() &&
571 !osdmap.crush->has_multirule_rulesets()) {
572 CrushWrapper newcrush;
573 _get_pending_crush(newcrush);
574 int r = newcrush.renumber_rules_by_ruleset();
575 if (r >= 0) {
576 dout(1) << __func__ << " crush map has ruleset != rule id; fixing" << dendl;
577 pending_inc.crush.clear();
578 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
579 } else {
580 dout(10) << __func__ << " unable to renumber rules by ruleset" << dendl;
581 }
582 }
583 }
7c673cae
FG
584}
585
586void OSDMonitor::on_shutdown()
587{
588 dout(10) << __func__ << dendl;
589 if (mapping_job) {
590 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
591 << dendl;
592 mapping_job->abort();
593 }
594
595 // discard failure info, waiters
596 list<MonOpRequestRef> ls;
597 take_all_failures(ls);
598 ls.clear();
599}
600
601void OSDMonitor::update_logger()
602{
603 dout(10) << "update_logger" << dendl;
604
605 mon->cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
606 mon->cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
607 mon->cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
608 mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
609}
610
7c673cae
FG
611void OSDMonitor::create_pending()
612{
613 pending_inc = OSDMap::Incremental(osdmap.epoch+1);
614 pending_inc.fsid = mon->monmap->fsid;
615
616 dout(10) << "create_pending e " << pending_inc.epoch << dendl;
617
618 // clean up pg_temp, primary_temp
619 OSDMap::clean_temps(g_ceph_context, osdmap, &pending_inc);
620 dout(10) << "create_pending did clean_temps" << dendl;
621
622 // On upgrade OSDMap has new field set by mon_osd_backfillfull_ratio config
623 // instead of osd_backfill_full_ratio config
624 if (osdmap.backfillfull_ratio <= 0) {
625 pending_inc.new_backfillfull_ratio = g_conf->mon_osd_backfillfull_ratio;
626 if (pending_inc.new_backfillfull_ratio > 1.0)
627 pending_inc.new_backfillfull_ratio /= 100;
628 dout(1) << __func__ << " setting backfillfull_ratio = "
629 << pending_inc.new_backfillfull_ratio << dendl;
630 }
31f18b77
FG
631 if (osdmap.get_epoch() > 0 &&
632 osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
7c673cae 633 // transition full ratios from PGMap to OSDMap (on upgrade)
31f18b77
FG
634 float full_ratio = mon->pgservice->get_full_ratio();
635 float nearfull_ratio = mon->pgservice->get_nearfull_ratio();
636 if (osdmap.full_ratio != full_ratio) {
7c673cae 637 dout(10) << __func__ << " full_ratio " << osdmap.full_ratio
31f18b77
FG
638 << " -> " << full_ratio << " (from pgmap)" << dendl;
639 pending_inc.new_full_ratio = full_ratio;
7c673cae 640 }
31f18b77 641 if (osdmap.nearfull_ratio != nearfull_ratio) {
7c673cae 642 dout(10) << __func__ << " nearfull_ratio " << osdmap.nearfull_ratio
31f18b77
FG
643 << " -> " << nearfull_ratio << " (from pgmap)" << dendl;
644 pending_inc.new_nearfull_ratio = nearfull_ratio;
7c673cae
FG
645 }
646 } else {
647 // safety check (this shouldn't really happen)
648 if (osdmap.full_ratio <= 0) {
649 pending_inc.new_full_ratio = g_conf->mon_osd_full_ratio;
650 if (pending_inc.new_full_ratio > 1.0)
651 pending_inc.new_full_ratio /= 100;
652 dout(1) << __func__ << " setting full_ratio = "
653 << pending_inc.new_full_ratio << dendl;
654 }
655 if (osdmap.nearfull_ratio <= 0) {
656 pending_inc.new_nearfull_ratio = g_conf->mon_osd_nearfull_ratio;
657 if (pending_inc.new_nearfull_ratio > 1.0)
658 pending_inc.new_nearfull_ratio /= 100;
659 dout(1) << __func__ << " setting nearfull_ratio = "
660 << pending_inc.new_nearfull_ratio << dendl;
661 }
662 }
663}
664
665creating_pgs_t
666OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc)
667{
31f18b77 668 dout(10) << __func__ << dendl;
7c673cae
FG
669 creating_pgs_t pending_creatings;
670 {
671 std::lock_guard<std::mutex> l(creating_pgs_lock);
672 pending_creatings = creating_pgs;
673 }
31f18b77
FG
674 // check for new or old pools
675 if (pending_creatings.last_scan_epoch < inc.epoch) {
676 if (osdmap.get_epoch() &&
677 osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
678 auto added =
679 mon->pgservice->maybe_add_creating_pgs(creating_pgs.last_scan_epoch,
680 osdmap.get_pools(),
681 &pending_creatings);
682 dout(7) << __func__ << " " << added << " pgs added from pgmap" << dendl;
683 }
684 unsigned queued = 0;
685 queued += scan_for_creating_pgs(osdmap.get_pools(),
686 inc.old_pools,
687 inc.modified,
688 &pending_creatings);
689 queued += scan_for_creating_pgs(inc.new_pools,
690 inc.old_pools,
691 inc.modified,
692 &pending_creatings);
693 dout(10) << __func__ << " " << queued << " pools queued" << dendl;
694 for (auto deleted_pool : inc.old_pools) {
695 auto removed = pending_creatings.remove_pool(deleted_pool);
696 dout(10) << __func__ << " " << removed
697 << " pg removed because containing pool deleted: "
698 << deleted_pool << dendl;
699 last_epoch_clean.remove_pool(deleted_pool);
700 }
701 // pgmon updates its creating_pgs in check_osd_map() which is called by
702 // on_active() and check_osd_map() could be delayed if lease expires, so its
703 // creating_pgs could be stale in comparison with the one of osdmon. let's
704 // trim them here. otherwise, they will be added back after being erased.
705 unsigned removed = 0;
706 for (auto& pg : pending_created_pgs) {
707 dout(20) << __func__ << " noting created pg " << pg << dendl;
708 pending_creatings.created_pools.insert(pg.pool());
709 removed += pending_creatings.pgs.erase(pg);
710 }
711 pending_created_pgs.clear();
712 dout(10) << __func__ << " " << removed
713 << " pgs removed because they're created" << dendl;
714 pending_creatings.last_scan_epoch = osdmap.get_epoch();
715 }
716
717 // process queue
718 unsigned max = MAX(1, g_conf->mon_osd_max_creating_pgs);
719 const auto total = pending_creatings.pgs.size();
720 while (pending_creatings.pgs.size() < max &&
721 !pending_creatings.queue.empty()) {
722 auto p = pending_creatings.queue.begin();
723 int64_t poolid = p->first;
724 dout(10) << __func__ << " pool " << poolid
725 << " created " << p->second.created
726 << " modified " << p->second.modified
727 << " [" << p->second.start << "-" << p->second.end << ")"
728 << dendl;
729 int n = MIN(max - pending_creatings.pgs.size(),
730 p->second.end - p->second.start);
731 ps_t first = p->second.start;
732 ps_t end = first + n;
733 for (ps_t ps = first; ps < end; ++ps) {
734 const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
735 // NOTE: use the *current* epoch as the PG creation epoch so that the
736 // OSD does not have to generate a long set of PastIntervals.
737 pending_creatings.pgs.emplace(pgid, make_pair(inc.epoch,
738 p->second.modified));
739 dout(10) << __func__ << " adding " << pgid << dendl;
740 }
741 p->second.start = end;
742 if (p->second.done()) {
743 dout(10) << __func__ << " done with queue for " << poolid << dendl;
744 pending_creatings.queue.erase(p);
745 } else {
746 dout(10) << __func__ << " pool " << poolid
747 << " now [" << p->second.start << "-" << p->second.end << ")"
748 << dendl;
749 }
750 }
751 dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
752 << " pools" << dendl;
c07f9fc5
FG
753 dout(10) << __func__
754 << " " << (pending_creatings.pgs.size() - total)
755 << "/" << pending_creatings.pgs.size()
31f18b77 756 << " pgs added from queued pools" << dendl;
7c673cae
FG
757 return pending_creatings;
758}
759
760void OSDMonitor::maybe_prime_pg_temp()
761{
762 bool all = false;
763 if (pending_inc.crush.length()) {
764 dout(10) << __func__ << " new crush map, all" << dendl;
765 all = true;
766 }
767
768 if (!pending_inc.new_up_client.empty()) {
769 dout(10) << __func__ << " new up osds, all" << dendl;
770 all = true;
771 }
772
773 // check for interesting OSDs
774 set<int> osds;
31f18b77 775 for (auto p = pending_inc.new_state.begin();
7c673cae
FG
776 !all && p != pending_inc.new_state.end();
777 ++p) {
778 if ((p->second & CEPH_OSD_UP) &&
779 osdmap.is_up(p->first)) {
780 osds.insert(p->first);
781 }
782 }
783 for (map<int32_t,uint32_t>::iterator p = pending_inc.new_weight.begin();
784 !all && p != pending_inc.new_weight.end();
785 ++p) {
786 if (p->second < osdmap.get_weight(p->first)) {
787 // weight reduction
788 osds.insert(p->first);
789 } else {
790 dout(10) << __func__ << " osd." << p->first << " weight increase, all"
791 << dendl;
792 all = true;
793 }
794 }
795
796 if (!all && osds.empty())
797 return;
798
799 if (!all) {
800 unsigned estimate =
801 mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
802 if (estimate > mapping.get_num_pgs() *
803 g_conf->mon_osd_prime_pg_temp_max_estimate) {
804 dout(10) << __func__ << " estimate " << estimate << " pgs on "
805 << osds.size() << " osds >= "
806 << g_conf->mon_osd_prime_pg_temp_max_estimate << " of total "
807 << mapping.get_num_pgs() << " pgs, all"
808 << dendl;
809 all = true;
810 } else {
811 dout(10) << __func__ << " estimate " << estimate << " pgs on "
812 << osds.size() << " osds" << dendl;
813 }
814 }
815
816 OSDMap next;
817 next.deepish_copy_from(osdmap);
818 next.apply_incremental(pending_inc);
819
224ce89b
WB
820 if (next.get_pools().empty()) {
821 dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
822 } else if (all) {
7c673cae
FG
823 PrimeTempJob job(next, this);
824 mapper.queue(&job, g_conf->mon_osd_mapping_pgs_per_chunk);
825 if (job.wait_for(g_conf->mon_osd_prime_pg_temp_max_time)) {
826 dout(10) << __func__ << " done in " << job.get_duration() << dendl;
827 } else {
828 dout(10) << __func__ << " did not finish in "
829 << g_conf->mon_osd_prime_pg_temp_max_time
830 << ", stopping" << dendl;
831 job.abort();
832 }
833 } else {
834 dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
835 utime_t stop = ceph_clock_now();
836 stop += g_conf->mon_osd_prime_pg_temp_max_time;
837 const int chunk = 1000;
838 int n = chunk;
839 std::unordered_set<pg_t> did_pgs;
840 for (auto osd : osds) {
841 auto& pgs = mapping.get_osd_acting_pgs(osd);
842 dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
843 for (auto pgid : pgs) {
844 if (!did_pgs.insert(pgid).second) {
845 continue;
846 }
847 prime_pg_temp(next, pgid);
848 if (--n <= 0) {
849 n = chunk;
850 if (ceph_clock_now() > stop) {
851 dout(10) << __func__ << " consumed more than "
852 << g_conf->mon_osd_prime_pg_temp_max_time
853 << " seconds, stopping"
854 << dendl;
855 return;
856 }
857 }
858 }
859 }
860 }
861}
862
863void OSDMonitor::prime_pg_temp(
864 const OSDMap& next,
865 pg_t pgid)
866{
867 if (mon->monmap->get_required_features().contains_all(
868 ceph::features::mon::FEATURE_LUMINOUS)) {
31f18b77 869 // TODO: remove this creating_pgs direct access?
7c673cae
FG
870 if (creating_pgs.pgs.count(pgid)) {
871 return;
872 }
873 } else {
31f18b77 874 if (mon->pgservice->is_creating_pg(pgid)) {
7c673cae
FG
875 return;
876 }
877 }
878 if (!osdmap.pg_exists(pgid)) {
879 return;
880 }
881
882 vector<int> up, acting;
883 mapping.get(pgid, &up, nullptr, &acting, nullptr);
884
885 vector<int> next_up, next_acting;
886 int next_up_primary, next_acting_primary;
887 next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
888 &next_acting, &next_acting_primary);
c07f9fc5 889 if (acting == next_acting && next_up != next_acting)
7c673cae
FG
890 return; // no change since last epoch
891
892 if (acting.empty())
893 return; // if previously empty now we can be no worse off
894 const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
895 if (pool && acting.size() < pool->min_size)
896 return; // can be no worse off than before
897
c07f9fc5
FG
898 if (next_up == next_acting) {
899 acting.clear();
900 dout(20) << __func__ << "next_up === next_acting now, clear pg_temp"
901 << dendl;
902 }
903
7c673cae
FG
904 dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
905 << " -> " << next_up << "/" << next_acting
906 << ", priming " << acting
907 << dendl;
908 {
909 Mutex::Locker l(prime_pg_temp_lock);
910 // do not touch a mapping if a change is pending
911 pending_inc.new_pg_temp.emplace(
912 pgid,
913 mempool::osdmap::vector<int>(acting.begin(), acting.end()));
914 }
915}
916
/**
 * Finalize pending_inc and write the resulting incremental map — plus a
 * full map, pg-creation state, osd metadata updates, and health checks —
 * into the transaction @p t.
 *
 * @note receiving a transaction in this function gives a fair amount of
 * freedom to the service implementation if it does need it. It shouldn't.
 */
void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
{
  dout(10) << "encode_pending e " << pending_inc.epoch
	   << dendl;

  // finalize up pending_inc
  pending_inc.modified = ceph_clock_now();

  int r = pending_inc.propagate_snaps_to_tiers(g_ceph_context, osdmap);
  assert(r == 0);

  // Prime pg_temp from the background mapping job, but only if that job
  // finished and covers the current epoch; otherwise skip the priming.
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
	      << mapping_job.get() << " did not complete, "
	      << mapping_job->shards << " left" << dendl;
      mapping_job->abort();
    } else if (mapping.get_epoch() < osdmap.get_epoch()) {
      dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
	      << mapping_job.get() << " is prior epoch "
	      << mapping.get_epoch() << dendl;
    } else {
      if (g_conf->mon_osd_prime_pg_temp) {
	maybe_prime_pg_temp();
      }
    }
  } else if (g_conf->mon_osd_prime_pg_temp) {
    dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
	    << dendl;
  }
  mapping_job.reset();

  // ensure we don't have blank new_state updates.  these are interpreted as
  // CEPH_OSD_UP (and almost certainly not what we want!).
  auto p = pending_inc.new_state.begin();
  while (p != pending_inc.new_state.end()) {
    if (p->second == 0) {
      dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
      p = pending_inc.new_state.erase(p);
    } else {
      ++p;
    }
  }

  bufferlist bl;

  {
    // Apply the pending increment to a scratch copy so we can inspect what
    // the *next* epoch will look like before committing anything.
    OSDMap tmp;
    tmp.deepish_copy_from(osdmap);
    tmp.apply_incremental(pending_inc);

    if (tmp.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
      // set or clear full/nearfull?
      int full, backfill, nearfull;
      tmp.count_full_nearfull_osds(&full, &backfill, &nearfull);
      if (full > 0) {
	if (!tmp.test_flag(CEPH_OSDMAP_FULL)) {
	  dout(10) << __func__ << " setting full flag" << dendl;
	  add_flag(CEPH_OSDMAP_FULL);
	  remove_flag(CEPH_OSDMAP_NEARFULL);
	}
      } else {
	if (tmp.test_flag(CEPH_OSDMAP_FULL)) {
	  dout(10) << __func__ << " clearing full flag" << dendl;
	  remove_flag(CEPH_OSDMAP_FULL);
	}
	if (nearfull > 0) {
	  if (!tmp.test_flag(CEPH_OSDMAP_NEARFULL)) {
	    dout(10) << __func__ << " setting nearfull flag" << dendl;
	    add_flag(CEPH_OSDMAP_NEARFULL);
	  }
	} else {
	  if (tmp.test_flag(CEPH_OSDMAP_NEARFULL)) {
	    dout(10) << __func__ << " clearing nearfull flag" << dendl;
	    remove_flag(CEPH_OSDMAP_NEARFULL);
	  }
	}
      }

      // min_compat_client?  if unset, pin it to what the map currently needs.
      if (tmp.require_min_compat_client == 0) {
	auto mv = tmp.get_min_compat_client();
	dout(1) << __func__ << " setting require_min_compat_client to currently "
		<< "required " << ceph_release_name(mv) << dendl;
	mon->clog->info() << "setting require_min_compat_client to currently "
			  << "required " << ceph_release_name(mv);
	pending_inc.new_require_min_compat_client = mv;
      }

      // One-time conversions performed while the cluster upgrades to luminous.
      if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
	// convert ec profile ruleset-* -> crush-*
	for (auto& p : tmp.erasure_code_profiles) {
	  bool changed = false;
	  map<string,string> newprofile;
	  for (auto& q : p.second) {
	    if (q.first.find("ruleset-") == 0) {
	      string key = "crush-";
	      key += q.first.substr(8);
	      newprofile[key] = q.second;
	      changed = true;
	      dout(20) << " updating ec profile " << p.first
		       << " key " << q.first << " -> " << key << dendl;
	    } else {
	      newprofile[q.first] = q.second;
	    }
	  }
	  if (changed) {
	    dout(10) << " updated ec profile " << p.first << ": "
		     << newprofile << dendl;
	    pending_inc.new_erasure_code_profiles[p.first] = newprofile;
	  }
	}

	// auto-enable pool applications upon upgrade
	// NOTE: this can be removed post-Luminous assuming upgrades need to
	// proceed through Luminous
	for (auto &pool_pair : tmp.pools) {
	  int64_t pool_id = pool_pair.first;
	  pg_pool_t pg_pool = pool_pair.second;
	  if (pg_pool.is_tier()) {
	    continue;
	  }

	  std::string pool_name = tmp.get_pool_name(pool_id);
	  uint32_t match_count = 0;

	  // CephFS
	  FSMap const &pending_fsmap = mon->mdsmon()->get_pending();
	  if (pending_fsmap.pool_in_use(pool_id)) {
	    dout(10) << __func__ << " auto-enabling CephFS on pool '"
		     << pool_name << "'" << dendl;
	    pg_pool.application_metadata.insert(
	      {pg_pool_t::APPLICATION_NAME_CEPHFS, {}});
	    ++match_count;
	  }

	  // RBD heuristics (default OpenStack pool names from docs and
	  // ceph-ansible)
	  if (boost::algorithm::contains(pool_name, "rbd") ||
	      pool_name == "images" || pool_name == "volumes" ||
	      pool_name == "backups" || pool_name == "vms") {
	    dout(10) << __func__ << " auto-enabling RBD on pool '"
		     << pool_name << "'" << dendl;
	    pg_pool.application_metadata.insert(
	      {pg_pool_t::APPLICATION_NAME_RBD, {}});
	    ++match_count;
	  }

	  // RGW heuristics
	  if (boost::algorithm::contains(pool_name, ".rgw") ||
	      boost::algorithm::contains(pool_name, ".log") ||
	      boost::algorithm::contains(pool_name, ".intent-log") ||
	      boost::algorithm::contains(pool_name, ".usage") ||
	      boost::algorithm::contains(pool_name, ".users")) {
	    dout(10) << __func__ << " auto-enabling RGW on pool '"
		     << pool_name << "'" << dendl;
	    pg_pool.application_metadata.insert(
	      {pg_pool_t::APPLICATION_NAME_RGW, {}});
	    ++match_count;
	  }

	  // OpenStack gnocchi (from ceph-ansible)
	  if (pool_name == "metrics" && match_count == 0) {
	    dout(10) << __func__ << " auto-enabling OpenStack Gnocchi on pool '"
		     << pool_name << "'" << dendl;
	    pg_pool.application_metadata.insert({"openstack_gnocchi", {}});
	    ++match_count;
	  }

	  // Only apply the guess when it is unambiguous; with multiple
	  // matches on a non-empty pool we just log and leave it alone.
	  if (match_count == 1) {
	    pg_pool.last_change = pending_inc.epoch;
	    pending_inc.new_pools[pool_id] = pg_pool;
	  } else if (match_count > 1) {
	    auto pstat = mon->pgservice->get_pool_stat(pool_id);
	    if (pstat != nullptr && pstat->stats.sum.num_objects > 0) {
	      mon->clog->info() << "unable to auto-enable application for pool "
				<< "'" << pool_name << "'";
	    }
	  }
	}
      }
    }
  }

  // tell me about it
  for (auto i = pending_inc.new_state.begin();
       i != pending_inc.new_state.end();
       ++i) {
    // a CEPH_OSD_UP bit in new_state corresponds to a DOWN transition here
    int s = i->second ? i->second : CEPH_OSD_UP;
    if (s & CEPH_OSD_UP)
      dout(2) << " osd." << i->first << " DOWN" << dendl;
    if (s & CEPH_OSD_EXISTS)
      dout(2) << " osd." << i->first << " DNE" << dendl;
  }
  for (map<int32_t,entity_addr_t>::iterator i = pending_inc.new_up_client.begin();
       i != pending_inc.new_up_client.end();
       ++i) {
    //FIXME: insert cluster addresses too
    dout(2) << " osd." << i->first << " UP " << i->second << dendl;
  }
  for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
       i != pending_inc.new_weight.end();
       ++i) {
    if (i->second == CEPH_OSD_OUT) {
      dout(2) << " osd." << i->first << " OUT" << dendl;
    } else if (i->second == CEPH_OSD_IN) {
      dout(2) << " osd." << i->first << " IN" << dendl;
    } else {
      dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
    }
  }

  // features for osdmap and its incremental
  uint64_t features = mon->get_quorum_con_features();

  // encode full map and determine its crc
  OSDMap tmp;
  {
    tmp.deepish_copy_from(osdmap);
    tmp.apply_incremental(pending_inc);

    // determine appropriate features: drop encoding features that the
    // map's required osd release does not yet permit
    if (tmp.require_osd_release < CEPH_RELEASE_LUMINOUS) {
      dout(10) << __func__ << " encoding without feature SERVER_LUMINOUS"
	       << dendl;
      features &= ~CEPH_FEATURE_SERVER_LUMINOUS;
    }
    if (tmp.require_osd_release < CEPH_RELEASE_KRAKEN) {
      dout(10) << __func__ << " encoding without feature SERVER_KRAKEN | "
	       << "MSG_ADDR2" << dendl;
      features &= ~(CEPH_FEATURE_SERVER_KRAKEN |
		    CEPH_FEATURE_MSG_ADDR2);
    }
    if (tmp.require_osd_release < CEPH_RELEASE_JEWEL) {
      dout(10) << __func__ << " encoding without feature SERVER_JEWEL" << dendl;
      features &= ~CEPH_FEATURE_SERVER_JEWEL;
    }
    dout(10) << __func__ << " encoding full map with " << features << dendl;

    bufferlist fullbl;
    ::encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
    pending_inc.full_crc = tmp.get_crc();

    // include full map in the txn.  note that old monitors will
    // overwrite this.  new ones will now skip the local full map
    // encode and reload from this.
    put_version_full(t, pending_inc.epoch, fullbl);
  }

  // encode
  assert(get_last_committed() + 1 == pending_inc.epoch);
  ::encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);

  dout(20) << " full_crc " << tmp.get_crc()
	   << " inc_crc " << pending_inc.inc_crc << dendl;

  /* put everything in the transaction */
  put_version(t, pending_inc.epoch, bl);
  put_last_committed(t, pending_inc.epoch);

  // metadata, too!
  for (map<int,bufferlist>::iterator p = pending_metadata.begin();
       p != pending_metadata.end();
       ++p)
    t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
  for (set<int>::iterator p = pending_metadata_rm.begin();
       p != pending_metadata_rm.end();
       ++p)
    t->erase(OSD_METADATA_PREFIX, stringify(*p));
  pending_metadata.clear();
  pending_metadata_rm.clear();

  // and pg creating, also!
  if (mon->monmap->get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    auto pending_creatings = update_pending_pgs(pending_inc);
    if (osdmap.get_epoch() &&
	osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
      dout(7) << __func__ << " in the middle of upgrading, "
	      << " trimming pending creating_pgs using pgmap" << dendl;
      mon->pgservice->maybe_trim_creating_pgs(&pending_creatings);
    }
    bufferlist creatings_bl;
    ::encode(pending_creatings, creatings_bl);
    t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
  }

  // health
  health_check_map_t next;
  tmp.check_health(&next);
  encode_health(next, t);
}
1213
1214void OSDMonitor::trim_creating_pgs(creating_pgs_t* creating_pgs,
31f18b77 1215 const ceph::unordered_map<pg_t,pg_stat_t>& pg_stat)
7c673cae
FG
1216{
1217 auto p = creating_pgs->pgs.begin();
1218 while (p != creating_pgs->pgs.end()) {
31f18b77
FG
1219 auto q = pg_stat.find(p->first);
1220 if (q != pg_stat.end() &&
7c673cae
FG
1221 !(q->second.state & PG_STATE_CREATING)) {
1222 dout(20) << __func__ << " pgmap shows " << p->first << " is created"
1223 << dendl;
1224 p = creating_pgs->pgs.erase(p);
7c673cae
FG
1225 } else {
1226 ++p;
1227 }
1228 }
1229}
1230
1231int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
1232{
1233 bufferlist bl;
1234 int r = mon->store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
1235 if (r < 0)
1236 return r;
1237 try {
1238 bufferlist::iterator p = bl.begin();
1239 ::decode(m, p);
1240 }
1241 catch (buffer::error& e) {
1242 if (err)
1243 *err << "osd." << osd << " metadata is corrupt";
1244 return -EIO;
1245 }
1246 return 0;
1247}
1248
c07f9fc5 1249void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
31f18b77 1250{
31f18b77
FG
1251 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
1252 if (osdmap.is_up(osd)) {
1253 map<string,string> meta;
1254 load_metadata(osd, meta, nullptr);
1255 auto p = meta.find(field);
1256 if (p == meta.end()) {
c07f9fc5 1257 (*out)["unknown"]++;
31f18b77 1258 } else {
c07f9fc5 1259 (*out)[p->second]++;
31f18b77
FG
1260 }
1261 }
1262 }
c07f9fc5
FG
1263}
1264
1265void OSDMonitor::count_metadata(const string& field, Formatter *f)
1266{
1267 map<string,int> by_val;
1268 count_metadata(field, &by_val);
31f18b77
FG
1269 f->open_object_section(field.c_str());
1270 for (auto& p : by_val) {
1271 f->dump_int(p.first.c_str(), p.second);
1272 }
1273 f->close_section();
1274}
1275
7c673cae
FG
1276int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
1277{
1278 map<string, string> metadata;
1279 int r = load_metadata(osd, metadata, nullptr);
1280 if (r < 0)
1281 return r;
1282
1283 auto it = metadata.find("osd_objectstore");
1284 if (it == metadata.end())
1285 return -ENOENT;
1286 *type = it->second;
1287 return 0;
1288}
1289
1290bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
1291 const pg_pool_t &pool,
1292 ostream *err)
1293{
1294 // just check a few pgs for efficiency - this can't give a guarantee anyway,
1295 // since filestore osds could always join the pool later
1296 set<int> checked_osds;
1297 for (unsigned ps = 0; ps < MIN(8, pool.get_pg_num()); ++ps) {
1298 vector<int> up, acting;
1299 pg_t pgid(ps, pool_id, -1);
1300 osdmap.pg_to_up_acting_osds(pgid, up, acting);
1301 for (int osd : up) {
1302 if (checked_osds.find(osd) != checked_osds.end())
1303 continue;
1304 string objectstore_type;
1305 int r = get_osd_objectstore_type(osd, &objectstore_type);
1306 // allow with missing metadata, e.g. due to an osd never booting yet
1307 if (r < 0 || objectstore_type == "bluestore") {
1308 checked_osds.insert(osd);
1309 continue;
1310 }
1311 *err << "osd." << osd << " uses " << objectstore_type;
1312 return false;
1313 }
1314 }
1315 return true;
1316}
1317
1318int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
1319{
1320 map<string,string> m;
1321 if (int r = load_metadata(osd, m, err))
1322 return r;
1323 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
1324 f->dump_string(p->first.c_str(), p->second);
1325 return 0;
1326}
1327
1328void OSDMonitor::print_nodes(Formatter *f)
1329{
1330 // group OSDs by their hosts
1331 map<string, list<int> > osds; // hostname => osd
1332 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
1333 map<string, string> m;
1334 if (load_metadata(osd, m, NULL)) {
1335 continue;
1336 }
1337 map<string, string>::iterator hostname = m.find("hostname");
1338 if (hostname == m.end()) {
1339 // not likely though
1340 continue;
1341 }
1342 osds[hostname->second].push_back(osd);
1343 }
1344
1345 dump_services(f, osds, "osd");
1346}
1347
1348void OSDMonitor::share_map_with_random_osd()
1349{
1350 if (osdmap.get_num_up_osds() == 0) {
1351 dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
1352 return;
1353 }
1354
1355 MonSession *s = mon->session_map.get_random_osd_session(&osdmap);
1356 if (!s) {
1357 dout(10) << __func__ << " no up osd on our session map" << dendl;
1358 return;
1359 }
1360
1361 dout(10) << "committed, telling random " << s->inst << " all about it" << dendl;
1362 // whatev, they'll request more if they need it
1363 MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch());
1364 s->con->send_message(m);
1365 // NOTE: do *not* record osd has up to this epoch (as we do
1366 // elsewhere) as they may still need to request older values.
1367}
1368
/**
 * Compute the osdmap epoch that old map versions may safely be trimmed up
 * to.  Returns 0 when trimming is not currently safe: no quorum, pgs
 * still being created, or nothing beyond the retention window to trim.
 */
version_t OSDMonitor::get_trim_to()
{
  if (mon->get_quorum().empty()) {
    dout(10) << __func__ << ": quorum not formed" << dendl;
    return 0;
  }

  epoch_t floor;
  if (mon->monmap->get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    {
      // TODO: Get this hidden in PGStatService
      std::lock_guard<std::mutex> l(creating_pgs_lock);
      // pending pg creations still reference older maps; don't trim yet
      if (!creating_pgs.pgs.empty()) {
	return 0;
      }
    }
    floor = get_min_last_epoch_clean();
  } else {
    // pre-luminous: pg state comes from the pgmon-backed service instead
    if (!mon->pgservice->is_readable())
      return 0;
    if (mon->pgservice->have_creating_pgs()) {
      return 0;
    }
    floor = mon->pgservice->get_min_last_epoch_clean();
  }
  {
    dout(10) << " min_last_epoch_clean " << floor << dendl;
    // an admin may force a trim point, bounded by what we have committed
    if (g_conf->mon_osd_force_trim_to > 0 &&
	g_conf->mon_osd_force_trim_to < (int)get_last_committed()) {
      floor = g_conf->mon_osd_force_trim_to;
      dout(10) << " explicit mon_osd_force_trim_to = " << floor << dendl;
    }
    // always retain at least mon_min_osdmap_epochs of history
    unsigned min = g_conf->mon_min_osdmap_epochs;
    if (floor + min > get_last_committed()) {
      if (min < get_last_committed())
	floor = get_last_committed() - min;
      else
	floor = 0;
    }
    // only report a trim point if it actually advances past what is kept
    if (floor > get_first_committed())
      return floor;
  }
  return 0;
}
1414
1415epoch_t OSDMonitor::get_min_last_epoch_clean() const
1416{
1417 auto floor = last_epoch_clean.get_lower_bound(osdmap);
1418 // also scan osd epochs
1419 // don't trim past the oldest reported osd epoch
1420 for (auto& osd_epoch : osd_epochs) {
1421 if (osd_epoch.second < floor) {
1422 floor = osd_epoch.second;
1423 }
1424 }
1425 return floor;
1426}
1427
1428void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
1429 version_t first)
1430{
1431 dout(10) << __func__ << " including full map for e " << first << dendl;
1432 bufferlist bl;
1433 get_version_full(first, bl);
1434 put_version_full(tx, first, bl);
1435}
1436
1437// -------------
1438
/**
 * Dispatch an incoming request to its read-only preprocess handler.
 * Returns true when the message was fully handled here; false means it
 * must go on to prepare_update() to mutate the pending map.
 */
bool OSDMonitor::preprocess_query(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // READs
  case MSG_MON_COMMAND:
    return preprocess_command(op);
  case CEPH_MSG_MON_GET_OSDMAP:
    return preprocess_get_osdmap(op);

    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return preprocess_mark_me_down(op);
  case MSG_OSD_FULL:
    return preprocess_full(op);
  case MSG_OSD_FAILURE:
    return preprocess_failure(op);
  case MSG_OSD_BOOT:
    return preprocess_boot(op);
  case MSG_OSD_ALIVE:
    return preprocess_alive(op);
  case MSG_OSD_PG_CREATED:
    return preprocess_pg_created(op);
  case MSG_OSD_PGTEMP:
    return preprocess_pgtemp(op);
  case MSG_OSD_BEACON:
    return preprocess_beacon(op);

  case CEPH_MSG_POOLOP:
    return preprocess_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return preprocess_remove_snaps(op);

  default:
    // an unexpected message type reaching this service is a bug
    ceph_abort();
    return true;
  }
}
1481
/**
 * Dispatch a message that needs to mutate the pending osdmap; the
 * preprocess phase has already filtered out anything answerable
 * read-only.  Returns true when a proposal should be triggered.
 */
bool OSDMonitor::prepare_update(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return prepare_mark_me_down(op);
  case MSG_OSD_FULL:
    return prepare_full(op);
  case MSG_OSD_FAILURE:
    return prepare_failure(op);
  case MSG_OSD_BOOT:
    return prepare_boot(op);
  case MSG_OSD_ALIVE:
    return prepare_alive(op);
  case MSG_OSD_PG_CREATED:
    return prepare_pg_created(op);
  case MSG_OSD_PGTEMP:
    return prepare_pgtemp(op);
  case MSG_OSD_BEACON:
    return prepare_beacon(op);

  case MSG_MON_COMMAND:
    return prepare_command(op);

  case CEPH_MSG_POOLOP:
    return prepare_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return prepare_remove_snaps(op);


  default:
    // an unexpected message type reaching this service is a bug
    ceph_abort();
  }

  return false;
}
1523
/**
 * Decide whether (and with what delay) the pending map should be
 * proposed.  May also fold accumulated osd weight adjustments into
 * pending_inc as a side effect.
 */
bool OSDMonitor::should_propose(double& delay)
{
  dout(10) << "should_propose" << dendl;

  // if full map, propose immediately!  any subsequent changes will be clobbered.
  if (pending_inc.fullmap.length())
    return true;

  // adjust osd weights?  only once we have a weight for every osd.
  if (!osd_weight.empty() &&
      osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
    dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
    osdmap.adjust_osd_weights(osd_weight, pending_inc);
    delay = 0.0;
    osd_weight.clear();
    return true;
  }

  // propose as fast as possible if updating up_thru or pg_temp
  // want to merge OSDMap changes as much as possible
  if ((pending_inc.new_primary_temp.size() == 1
       || pending_inc.new_up_thru.size() == 1)
      && pending_inc.new_state.size() < 2) {
    dout(15) << " propose as fast as possible for up_thru/pg_temp" << dendl;

    utime_t now = ceph_clock_now();
    // rate limit: at most one fast-path proposal per paxos_propose_interval,
    // and never sooner than paxos_min_wait after the last commit
    if (now - last_attempted_minwait_time > g_conf->paxos_propose_interval
	&& now - paxos->get_last_commit_time() > g_conf->paxos_min_wait) {
      delay = g_conf->paxos_min_wait;
      last_attempted_minwait_time = now;
      return true;
    }
  }

  // otherwise fall back to the generic service policy
  return PaxosService::should_propose(delay);
}
1560
1561
1562
1563// ---------------------------
1564// READs
1565
/**
 * Answer a MMonGetOSDMap straight from the store: fill the reply with up
 * to osd_map_message_max full and incremental maps covering the requested
 * (and still-committed) epoch ranges, then report the committed bounds.
 */
bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MMonGetOSDMap *m = static_cast<MMonGetOSDMap*>(op->get_req());
  dout(10) << __func__ << " " << *m << dendl;
  MOSDMap *reply = new MOSDMap(mon->monmap->fsid);
  epoch_t first = get_first_committed();
  epoch_t last = osdmap.get_epoch();
  int max = g_conf->osd_map_message_max;
  // full maps first; note 'max' is a shared budget across both loops
  for (epoch_t e = MAX(first, m->get_full_first());
       e <= MIN(last, m->get_full_last()) && max > 0;
       ++e, --max) {
    int r = get_version_full(e, reply->maps[e]);
    assert(r >= 0);
  }
  for (epoch_t e = MAX(first, m->get_inc_first());
       e <= MIN(last, m->get_inc_last()) && max > 0;
       ++e, --max) {
    int r = get_version(e, reply->incremental_maps[e]);
    assert(r >= 0);
  }
  reply->oldest_map = first;
  reply->newest_map = last;
  mon->send_reply(op, reply);
  return true;
}
1592
1593
1594// ---------------------------
1595// UPDATEs
1596
1597// failure --
1598
1599bool OSDMonitor::check_source(PaxosServiceMessage *m, uuid_d fsid) {
1600 // check permissions
1601 MonSession *session = m->get_session();
1602 if (!session)
1603 return true;
1604 if (!session->is_capable("osd", MON_CAP_X)) {
1605 dout(0) << "got MOSDFailure from entity with insufficient caps "
1606 << session->caps << dendl;
1607 return true;
1608 }
1609 if (fsid != mon->monmap->fsid) {
1610 dout(0) << "check_source: on fsid " << fsid
1611 << " != " << mon->monmap->fsid << dendl;
1612 return true;
1613 }
1614 return false;
1615}
1616
1617
/**
 * Pre-screen an osd failure report.  Returns true when the message is
 * fully handled here (dropped, possibly after nudging the reporter with
 * newer maps); false hands it to prepare_failure() to record the report.
 */
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
  // who is target_osd
  int badboy = m->get_target().name.num();

  // check permissions
  if (check_source(m, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
	osdmap.get_addr(from) != m->get_orig_source_inst().addr ||
	(osdmap.is_down(from) && m->if_osd_failed())) {
      // reporter is unknown or stale; send it newer maps and drop the report
      dout(5) << "preprocess_failure from dead osd." << from << ", ignoring" << dendl;
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?
  if (osdmap.is_down(badboy)) {
    dout(5) << "preprocess_failure dne(/dup?): " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  if (osdmap.get_inst(badboy) != m->get_target()) {
    dout(5) << "preprocess_failure wrong osd: report " << m->get_target() << " != map's " << osdmap.get_inst(badboy)
	    << ", from " << m->get_orig_source_inst() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?
  // NOTE(review): the is_down() half of this test is already covered by the
  // "weird?" branch above, so get_up_from() is the active condition here.
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    dout(5) << "preprocess_failure dup/old: " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // nodown flags or up-ratio limits may veto marking this osd down
  if (!can_mark_down(badboy)) {
    dout(5) << "preprocess_failure ignoring report of " << m->get_target() << " from " << m->get_orig_source_inst() << dendl;
    goto didit;
  }

  dout(10) << "preprocess_failure new: " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
  return false;

 didit:
  return true;
}
1677
/**
 * Completion that acknowledges a MOSDMarkMeDown request by echoing a
 * MOSDMarkMeDown back to the requester, with request_ack cleared so the
 * ack itself does not demand another ack.
 */
class C_AckMarkedDown : public C_MonOp {
  OSDMonitor *osdmon;
public:
  C_AckMarkedDown(
    OSDMonitor *osdmon,
    MonOpRequestRef op)
    : C_MonOp(op), osdmon(osdmon) {}

  void _finish(int) override {
    MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
    osdmon->mon->send_reply(
      op,
      new MOSDMarkMeDown(
	m->fsid,
	m->get_target(),
	m->get_epoch(),
	false)); // ACK itself does not request an ack
  }
  ~C_AckMarkedDown() override {
  }
};
1699
/**
 * Validate a MOSDMarkMeDown request.  Returns true when the request is
 * handled here (rejected or ignored, acked if one was requested); false
 * defers to prepare_mark_me_down() to record the down state.
 */
bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
  int requesting_down = m->get_target().name.num();
  int from = m->get_orig_source().num();

  // check permissions
  if (check_source(m, m->fsid))
    goto reply;

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd())
    goto reply;

  // the requester must exist, still be up, and match the target address;
  // otherwise nudge it with newer maps and drop the request
  if (!osdmap.exists(from) ||
      osdmap.is_down(from) ||
      osdmap.get_addr(from) != m->get_target().addr) {
    dout(5) << "preprocess_mark_me_down from dead osd."
	    << from << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    goto reply;
  }

  // no down might be set
  if (!can_mark_down(requesting_down))
    goto reply;

  dout(10) << "MOSDMarkMeDown for: " << m->get_target() << dendl;
  return false;

 reply:
  // even rejected requests get an ack when one was requested, so the osd
  // does not wait forever
  if (m->request_ack) {
    Context *c(new C_AckMarkedDown(this, op));
    c->complete(0);
  }
  return true;
}
1738
1739bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
1740{
1741 op->mark_osdmon_event(__func__);
1742 MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
1743 int target_osd = m->get_target().name.num();
1744
1745 assert(osdmap.is_up(target_osd));
1746 assert(osdmap.get_addr(target_osd) == m->get_target().addr);
1747
1748 mon->clog->info() << "osd." << target_osd << " marked itself down";
1749 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
1750 if (m->request_ack)
1751 wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
1752 return true;
1753}
1754
1755bool OSDMonitor::can_mark_down(int i)
1756{
1757 if (osdmap.test_flag(CEPH_OSDMAP_NODOWN)) {
31f18b77
FG
1758 dout(5) << __func__ << " NODOWN flag set, will not mark osd." << i
1759 << " down" << dendl;
1760 return false;
1761 }
1762
1763 if (osdmap.is_nodown(i)) {
1764 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
1765 << "will not mark it down" << dendl;
7c673cae
FG
1766 return false;
1767 }
31f18b77 1768
7c673cae
FG
1769 int num_osds = osdmap.get_num_osds();
1770 if (num_osds == 0) {
31f18b77 1771 dout(5) << __func__ << " no osds" << dendl;
7c673cae
FG
1772 return false;
1773 }
1774 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
1775 float up_ratio = (float)up / (float)num_osds;
1776 if (up_ratio < g_conf->mon_osd_min_up_ratio) {
31f18b77 1777 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
7c673cae
FG
1778 << g_conf->mon_osd_min_up_ratio
1779 << ", will not mark osd." << i << " down" << dendl;
1780 return false;
1781 }
1782 return true;
1783}
1784
1785bool OSDMonitor::can_mark_up(int i)
1786{
1787 if (osdmap.test_flag(CEPH_OSDMAP_NOUP)) {
31f18b77
FG
1788 dout(5) << __func__ << " NOUP flag set, will not mark osd." << i
1789 << " up" << dendl;
1790 return false;
1791 }
1792
1793 if (osdmap.is_noup(i)) {
1794 dout(5) << __func__ << " osd." << i << " is marked as noup, "
1795 << "will not mark it up" << dendl;
7c673cae
FG
1796 return false;
1797 }
31f18b77 1798
7c673cae
FG
1799 return true;
1800}
1801
1802/**
1803 * @note the parameter @p i apparently only exists here so we can output the
1804 * osd's id on messages.
1805 */
1806bool OSDMonitor::can_mark_out(int i)
1807{
1808 if (osdmap.test_flag(CEPH_OSDMAP_NOOUT)) {
1809 dout(5) << __func__ << " NOOUT flag set, will not mark osds out" << dendl;
1810 return false;
1811 }
31f18b77
FG
1812
1813 if (osdmap.is_noout(i)) {
1814 dout(5) << __func__ << " osd." << i << " is marked as noout, "
1815 << "will not mark it out" << dendl;
1816 return false;
1817 }
1818
7c673cae
FG
1819 int num_osds = osdmap.get_num_osds();
1820 if (num_osds == 0) {
1821 dout(5) << __func__ << " no osds" << dendl;
1822 return false;
1823 }
1824 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
1825 float in_ratio = (float)in / (float)num_osds;
1826 if (in_ratio < g_conf->mon_osd_min_in_ratio) {
1827 if (i >= 0)
1828 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
1829 << g_conf->mon_osd_min_in_ratio
1830 << ", will not mark osd." << i << " out" << dendl;
1831 else
1832 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
1833 << g_conf->mon_osd_min_in_ratio
1834 << ", will not mark osds out" << dendl;
1835 return false;
1836 }
1837
1838 return true;
1839}
1840
1841bool OSDMonitor::can_mark_in(int i)
1842{
1843 if (osdmap.test_flag(CEPH_OSDMAP_NOIN)) {
31f18b77
FG
1844 dout(5) << __func__ << " NOIN flag set, will not mark osd." << i
1845 << " in" << dendl;
1846 return false;
1847 }
1848
1849 if (osdmap.is_noin(i)) {
1850 dout(5) << __func__ << " osd." << i << " is marked as noin, "
1851 << "will not mark it in" << dendl;
7c673cae
FG
1852 return false;
1853 }
31f18b77 1854
7c673cae
FG
1855 return true;
1856}
1857
1858bool OSDMonitor::check_failures(utime_t now)
1859{
1860 bool found_failure = false;
1861 for (map<int,failure_info_t>::iterator p = failure_info.begin();
1862 p != failure_info.end();
1863 ++p) {
1864 if (can_mark_down(p->first)) {
1865 found_failure |= check_failure(now, p->first, p->second);
1866 }
1867 }
1868 return found_failure;
1869}
1870
/**
 * Evaluate the accumulated failure reports for @p target_osd, and queue
 * it to be marked down once the (possibly laggy-adjusted) grace period
 * has elapsed and enough distinct failure domains have reported it.
 *
 * @return true if the osd was (or already is) pending to be marked down.
 */
bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
{
  // already pending failure?
  if (pending_inc.new_state.count(target_osd) &&
      pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
    dout(10) << " already pending failure" << dendl;
    return true;
  }

  set<string> reporters_by_subtree;
  string reporter_subtree_level = g_conf->mon_osd_reporter_subtree_level;
  utime_t orig_grace(g_conf->osd_heartbeat_grace, 0);
  utime_t max_failed_since = fi.get_failed_since();
  utime_t failed_for = now - max_failed_since;

  utime_t grace = orig_grace;
  double my_grace = 0, peer_grace = 0;
  double decay_k = 0;
  if (g_conf->mon_osd_adjust_heartbeat_grace) {
    // exponential decay constant derived from the configured halflife
    double halflife = (double)g_conf->mon_osd_laggy_halflife;
    decay_k = ::log(.5) / halflife;

    // scale grace period based on historical probability of 'lagginess'
    // (false positive failures due to slowness).
    const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
    double decay = exp((double)failed_for * decay_k);
    dout(20) << " halflife " << halflife << " decay_k " << decay_k
	     << " failed_for " << failed_for << " decay " << decay << dendl;
    my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
    grace += my_grace;
  }

  // consider the peers reporting a failure a proxy for a potential
  // 'subcluster' over the overall cluster that is similarly
  // laggy.  this is clearly not true in all cases, but will sometimes
  // help us localize the grace correction to a subset of the system
  // (say, a rack with a bad switch) that is unhappy.
  assert(fi.reporters.size());
  for (map<int,failure_reporter_t>::iterator p = fi.reporters.begin();
       p != fi.reporters.end();
       ++p) {
    // get the parent bucket whose type matches with "reporter_subtree_level".
    // fall back to OSD if the level doesn't exist.
    map<string, string> reporter_loc = osdmap.crush->get_full_location(p->first);
    map<string, string>::iterator iter = reporter_loc.find(reporter_subtree_level);
    if (iter == reporter_loc.end()) {
      reporters_by_subtree.insert("osd." + to_string(p->first));
    } else {
      reporters_by_subtree.insert(iter->second);
    }
    if (g_conf->mon_osd_adjust_heartbeat_grace) {
      // accumulate each reporter's own laggy history into the peer grace
      const osd_xinfo_t& xi = osdmap.get_xinfo(p->first);
      utime_t elapsed = now - xi.down_stamp;
      double decay = exp((double)elapsed * decay_k);
      peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
    }
  }

  if (g_conf->mon_osd_adjust_heartbeat_grace) {
    // average over all reporters before extending the grace
    peer_grace /= (double)fi.reporters.size();
    grace += peer_grace;
  }

  dout(10) << " osd." << target_osd << " has "
	   << fi.reporters.size() << " reporters, "
	   << grace << " grace (" << orig_grace << " + " << my_grace
	   << " + " << peer_grace << "), max_failed_since " << max_failed_since
	   << dendl;

  // require both: the grace window has elapsed, and reports came from
  // enough distinct failure-domain subtrees
  if (failed_for >= grace &&
      (int)reporters_by_subtree.size() >= g_conf->mon_osd_min_down_reporters) {
    dout(1) << " we have enough reporters to mark osd." << target_osd
	    << " down" << dendl;
    // a CEPH_OSD_UP entry in new_state records the down transition
    pending_inc.new_state[target_osd] = CEPH_OSD_UP;

    mon->clog->info() << "osd." << target_osd << " failed ("
		      << osdmap.crush->get_full_location_ordered_string(
			target_osd)
		      << ") ("
		      << (int)reporters_by_subtree.size()
		      << " reporters from different "
		      << reporter_subtree_level << " after "
		      << failed_for << " >= grace " << grace << ")";
    return true;
  }
  return false;
}
1958
224ce89b 1959void OSDMonitor::force_failure(int target_osd, int by)
7c673cae
FG
1960{
1961 // already pending failure?
1962 if (pending_inc.new_state.count(target_osd) &&
1963 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
1964 dout(10) << " already pending failure" << dendl;
1965 return;
1966 }
1967
1968 dout(1) << " we're forcing failure of osd." << target_osd << dendl;
1969 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
1970
31f18b77
FG
1971 mon->clog->info() << "osd." << target_osd << " failed ("
1972 << osdmap.crush->get_full_location_ordered_string(target_osd)
1973 << ") (connection refused reported by osd." << by << ")";
7c673cae
FG
1974 return;
1975}
1976
// Handle an MOSDFailure report (or cancellation) in the prepare phase.
//
// A "failed" report is either immediate (connection refused -> force the
// failure right away) or accumulated into failure_info[target_osd] until
// check_failure() decides enough distinct reporters/grace time have
// elapsed.  A non-failed message retracts this reporter's earlier report.
//
// Returns true iff the pending map was changed and needs a proposal.
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
  dout(1) << "prepare_failure " << m->get_target()
	  << " from " << m->get_orig_source_inst()
	  << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target().name.num();
  int reporter = m->get_orig_source().num();
  // preprocess_failure() is expected to have filtered out reports about
  // osds that are already down or whose address doesn't match.
  assert(osdmap.is_up(target_osd));
  assert(osdmap.get_addr(target_osd) == m->get_target().addr);

  if (m->if_osd_failed()) {
    // calculate failure time: the osd had already been unresponsive for
    // failed_for seconds when the report was received.
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      // e.g. connection refused -- skip the grace machinery entirely.
      mon->clog->debug() << m->get_target() << " reported immediately failed by "
            << m->get_orig_source_inst();
      force_failure(target_osd, reporter);
      return true;
    }
    mon->clog->debug() << m->get_target() << " reported failed by "
		      << m->get_orig_source_inst();

    failure_info_t& fi = failure_info[target_osd];
    // If this reporter had an older outstanding report, release its op so
    // the client isn't left waiting for a reply that will never come.
    MonOpRequestRef old_op = fi.add_report(reporter, failed_since, op);
    if (old_op) {
      mon->no_reply(old_op);
    }

    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon->clog->debug() << m->get_target() << " failure report canceled by "
		       << m->get_orig_source_inst();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      MonOpRequestRef report_op = fi.cancel_report(reporter);
      if (report_op) {
        mon->no_reply(report_op);
      }
      if (fi.reporters.empty()) {
	dout(10) << " removing last failure_info for osd." << target_osd
		 << dendl;
	failure_info.erase(target_osd);
      } else {
	dout(10) << " failure_info for osd." << target_osd << " now "
		 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
    // cancellation never triggers a map change; drop the op.
    mon->no_reply(op);
  }

  return false;
}
2039
2040void OSDMonitor::process_failures()
2041{
2042 map<int,failure_info_t>::iterator p = failure_info.begin();
2043 while (p != failure_info.end()) {
2044 if (osdmap.is_up(p->first)) {
2045 ++p;
2046 } else {
2047 dout(10) << "process_failures osd." << p->first << dendl;
2048 list<MonOpRequestRef> ls;
2049 p->second.take_report_messages(ls);
2050 failure_info.erase(p++);
2051
2052 while (!ls.empty()) {
2053 MonOpRequestRef o = ls.front();
2054 if (o) {
2055 o->mark_event(__func__);
2056 MOSDFailure *m = o->get_req<MOSDFailure>();
2057 send_latest(o, m->get_epoch());
2058 }
2059 ls.pop_front();
2060 }
2061 }
2062 }
2063}
2064
2065void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
2066{
2067 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
2068
2069 for (map<int,failure_info_t>::iterator p = failure_info.begin();
2070 p != failure_info.end();
2071 ++p) {
2072 p->second.take_report_messages(ls);
2073 }
2074 failure_info.clear();
2075}
2076
2077
2078// boot --
2079
// Read-only validation of an OSD boot request (MOSDBoot).
//
// Runs a long gate chain: caps, fsid, address sanity, feature-bit
// compatibility between the booting osd and the current osdmap, upgrade
// ordering constraints, and duplicate/stale-boot detection.  Returns true
// when the message is fully handled here (rejected silently, answered as a
// duplicate, or deferred); returns false to let prepare_boot() mutate the
// pending map.
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
	    << session->caps << dendl;
    goto ignore;
  }

  // the booting osd must belong to this cluster
  if (m->sb.cluster_fsid != mon->monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
	    << " != " << mon->monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  assert(m->get_orig_source_inst().name.is_osd());

  // check if osd has required features to boot
  if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
       CEPH_FEATURE_OSD_ERASURE_CODES) &&
      !(m->get_connection()->get_features() & CEPH_FEATURE_OSD_ERASURE_CODES)) {
    dout(0) << __func__ << " osdmap requires erasure code but osd at "
	    << m->get_orig_source_inst()
	    << " doesn't announce support -- ignore" << dendl;
    goto ignore;
  }

  if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
       CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2) &&
      !(m->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2)) {
    dout(0) << __func__ << " osdmap requires erasure code plugins v2 but osd at "
	    << m->get_orig_source_inst()
	    << " doesn't announce support -- ignore" << dendl;
    goto ignore;
  }

  if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
       CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3) &&
      !(m->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3)) {
    dout(0) << __func__ << " osdmap requires erasure code plugins v3 but osd at "
	    << m->get_orig_source_inst()
	    << " doesn't announce support -- ignore" << dendl;
    goto ignore;
  }

  // require_osd_release gates: an osd may not join if it lacks the feature
  // bit corresponding to the map's minimum required release.
  if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
      !HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because the osdmap requires"
		      << " CEPH_FEATURE_SERVER_LUMINOUS"
		      << " but the osd lacks CEPH_FEATURE_SERVER_LUMINOUS";
    goto ignore;
  }

  if (osdmap.require_osd_release >= CEPH_RELEASE_JEWEL &&
      !(m->osd_features & CEPH_FEATURE_SERVER_JEWEL)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because the osdmap requires"
		      << " CEPH_FEATURE_SERVER_JEWEL"
		      << " but the osd lacks CEPH_FEATURE_SERVER_JEWEL";
    goto ignore;
  }

  if (osdmap.require_osd_release >= CEPH_RELEASE_KRAKEN &&
      !HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because the osdmap requires"
		      << " CEPH_FEATURE_SERVER_KRAKEN"
		      << " but the osd lacks CEPH_FEATURE_SERVER_KRAKEN";
    goto ignore;
  }

  // flags set on the map imply feature requirements too.
  if (osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE) &&
      !(m->osd_features & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because 'sortbitwise' osdmap flag is set and OSD lacks the OSD_BITWISE_HOBJ_SORT feature";
    goto ignore;
  }

  if (osdmap.test_flag(CEPH_OSDMAP_RECOVERY_DELETES) &&
      !(m->osd_features & CEPH_FEATURE_OSD_RECOVERY_DELETES)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because 'recovery_deletes' osdmap flag is set and OSD lacks the OSD_RECOVERY_DELETES feature";
    goto ignore;
  }

  // any pool using GMT hitsets requires every osd to understand them.
  if (any_of(osdmap.get_pools().begin(),
	     osdmap.get_pools().end(),
	     [](const std::pair<int64_t,pg_pool_t>& pool)
	     { return pool.second.use_gmt_hitset; })) {
    assert(osdmap.get_num_up_osds() == 0 ||
	   osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT);
    if (!(m->osd_features & CEPH_FEATURE_OSD_HITSET_GMT)) {
      dout(0) << __func__ << " one or more pools uses GMT hitsets but osd at "
	      << m->get_orig_source_inst()
	      << " doesn't announce support -- ignore" << dendl;
      goto ignore;
    }
  }

  // make sure upgrades stop at luminous: a post-luminous osd may not join
  // until the map's required release has been bumped to luminous.
  if (HAVE_FEATURE(m->osd_features, SERVER_M) &&
      osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
    mon->clog->info() << "disallowing boot of post-luminous OSD "
		      << m->get_orig_source_inst()
		      << " because require_osd_release < luminous";
    goto ignore;
  }

  // make sure upgrades stop at jewel
  if (HAVE_FEATURE(m->osd_features, SERVER_KRAKEN) &&
      osdmap.require_osd_release < CEPH_RELEASE_JEWEL) {
    mon->clog->info() << "disallowing boot of post-jewel OSD "
		      << m->get_orig_source_inst()
		      << " because require_osd_release < jewel";
    goto ignore;
  }

  // make sure upgrades stop at hammer
  //  * HAMMER_0_94_4 is the required hammer feature
  //  * MON_METADATA is the first post-hammer feature
  if (osdmap.get_num_up_osds() > 0) {
    if ((m->osd_features & CEPH_FEATURE_MON_METADATA) &&
	!(osdmap.get_up_osd_features() & CEPH_FEATURE_HAMMER_0_94_4)) {
      mon->clog->info() << "disallowing boot of post-hammer OSD "
			<< m->get_orig_source_inst()
			<< " because one or more up OSDs is pre-hammer v0.94.4";
      goto ignore;
    }
    if (!(m->osd_features & CEPH_FEATURE_HAMMER_0_94_4) &&
	(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_METADATA)) {
      mon->clog->info() << "disallowing boot of pre-hammer v0.94.4 OSD "
			<< m->get_orig_source_inst()
			<< " because all up OSDs are post-hammer";
      goto ignore;
    }
  }

  // already booted?  (same public and cluster address as recorded)
  if (osdmap.is_up(from) &&
      osdmap.get_inst(from) == m->get_orig_source_inst() &&
      osdmap.get_cluster_addr(from) == m->cluster_addr) {
    // yup.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source_inst()
	    << " == " << osdmap.get_inst(from) << dendl;
    _booted(op, false);
    return true;
  }

  // an osd id can only be reused by a daemon with the same fsid.
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
	    << " clashes with existing osd: different fsid"
	    << " (ours: " << osdmap.get_uuid(from)
	    << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // a boot message older than our record of this osd's last up_from is
  // stale; just send the osd a fresh map instead of processing it.
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_inst(from) == m->get_orig_source_inst()) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
2277
// Apply an OSD boot to the pending incremental.
//
// Three cases: the osd is still marked up from a previous incarnation
// (queue a down-mark first and retry after the proposal commits), a boot
// is already pending (just wait), or this is a fresh boot (record the
// osd's addresses, metadata, clean interval, laggy statistics and possible
// auto-mark-in, then reply via C_Booted once committed).
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
  dout(7) << __func__ << " from " << m->get_orig_source_inst() << " sb " << m->sb
	  << " cluster_addr " << m->cluster_addr
	  << " hb_back_addr " << m->hb_back_addr
	  << " hb_front_addr " << m->hb_front_addr
	  << dendl;

  assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
	    << osdmap.get_max_osd() << dendl;
    return false;
  }

  // effective pre-boot state: committed state XORed with any pending flip.
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up? mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down "
	    << osdmap.get_inst(from) << dendl;
    // preprocess should have caught these; if not, assert.
    assert(osdmap.get_inst(from) != m->get_orig_source_inst() ||
	   osdmap.get_cluster_addr(from) != m->cluster_addr);
    assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
	(pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down (new_state is an XOR mask)
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    // re-handle this boot once the down-mark has committed.
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
	    << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addr();
    if (!m->cluster_addr.is_blank_ip())
      pending_inc.new_up_cluster[from] = m->cluster_addr;
    pending_inc.new_hb_back_up[from] = m->hb_back_addr;
    if (!m->hb_front_addr.is_blank_ip())
      pending_inc.new_hb_front_up[from] = m->hb_front_addr;

    down_pending_out.erase(from);  // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
	     << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this; if not, assert.
      assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd?  (has never seen a map; mark data from any earlier
    // incarnation of this id as lost)
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
	dout(10) << " fresh osd; marking lost_at too" << dendl;
	pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata
    bufferlist osd_metadata;
    ::encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
	(m->sb.mounted == info.last_clean_begin &&
	 m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
	       << "[" << info.last_clean_begin << "," << info.last_clean_end
	       << ") -> [" << begin << "-" << end << ")"
	       << dendl;
      pending_inc.new_last_clean_interval[from] =
	pair<epoch_t,epoch_t>(begin, end);
    }

    // update the exponentially-weighted laggy statistics used by
    // check_failure() to scale the heartbeat grace.
    osd_xinfo_t xi = osdmap.get_xinfo(from);
    if (m->boot_epoch == 0) {
      // first boot of this daemon: decay, the osd wasn't laggy.
      xi.laggy_probability *= (1.0 - g_conf->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      // rebooting after being marked down: fold the observed down
      // interval into the laggy estimates.
      if (xi.down_stamp.sec()) {
	int interval = ceph_clock_now().sec() -
	  xi.down_stamp.sec();
	if (g_conf->mon_osd_laggy_max_interval &&
	    (interval > g_conf->mon_osd_laggy_max_interval)) {
	  interval =  g_conf->mon_osd_laggy_max_interval;
	}
	xi.laggy_interval =
	  interval * g_conf->mon_osd_laggy_weight +
	  xi.laggy_interval * (1.0 - g_conf->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
	g_conf->mon_osd_laggy_weight +
	xi.laggy_probability * (1.0 - g_conf->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?  (auto-mark-in policy is controlled by configuration)
    if ((g_conf->mon_osd_auto_mark_auto_out_in &&
	 (oldstate & CEPH_OSD_AUTOOUT)) ||
	(g_conf->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
	(g_conf->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
	if (osdmap.osd_xinfo[from].old_weight > 0) {
	  // restore the weight the osd had before it was auto-marked out.
	  pending_inc.new_weight[from] = osdmap.osd_xinfo[from].old_weight;
	  xi.old_weight = 0;
	} else {
	  pending_inc.new_weight[from] = CEPH_OSD_IN;
	}
      } else {
	dout(7) << __func__ << " NOIN set, will not mark in "
		<< m->get_orig_source_addr() << dendl;
      }
    }

    pending_inc.new_xinfo[from] = xi;

    // wait
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
2431
2432void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
2433{
2434 op->mark_osdmon_event(__func__);
2435 MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
2436 dout(7) << "_booted " << m->get_orig_source_inst()
2437 << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
2438
2439 if (logit) {
2440 mon->clog->info() << m->get_orig_source_inst() << " boot";
2441 }
2442
2443 send_latest(op, m->sb.current_epoch+1);
2444}
2445
2446
2447// -------------
2448// full
2449
// Read-only validation of an MOSDFull message (osd reporting its own
// nearfull/backfillfull/full state).  Returns true when handled here
// (bad caps, stale sender, or state already matches); false to let
// prepare_full() record the change.
bool OSDMonitor::preprocess_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
  int from = m->get_orig_source().num();
  set<string> state;
  // only these three bits may be toggled by this message.
  unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;

  // check permissions, ignore if failed
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "MOSDFull from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  // ignore a full message from the osd instance that already went down
  if (!osdmap.exists(from)) {
    dout(7) << __func__ << " ignoring full message from nonexistent "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }
  if ((!osdmap.is_up(from) &&
       osdmap.get_most_recent_inst(from) == m->get_orig_source_inst()) ||
      (osdmap.is_up(from) &&
       osdmap.get_inst(from) != m->get_orig_source_inst())) {
    dout(7) << __func__ << " ignoring full message from down "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  OSDMap::calc_state_set(osdmap.get_state(from), state);

  // nothing to change: reply with the requested map version and drop.
  if ((osdmap.get_state(from) & mask) == m->state) {
    dout(7) << __func__ << " state already " << state << " for osd." << from
	    << " " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    goto ignore;
  }

  dout(10) << __func__ << " want state " << state << " for osd." << from
	   << " " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
2499
// Record an osd's requested fullness state change in the pending
// incremental.  pending_inc.new_state holds XOR masks, so the update is
// computed as (committed state ^ wanted state) restricted to the fullness
// bits, after first clearing any previously-pending fullness flips.
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
  const int from = m->get_orig_source().num();

  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask;  // safety first

  // effective current fullness state = committed state XOR pending flips.
  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    if (p != pending_inc.new_state.end()) {
      // drop any pending fullness flips before installing the new ones.
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    // XOR of committed and wanted bits yields exactly the flips needed.
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " -> " << want_state_set << dendl;
  } else {
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
2537
2538// -------------
2539// alive
2540
// Read-only validation of an MOSDAlive (up_thru advance request).
// Returns true when handled here (bad caps, stale sender, or up_thru is
// already at least the wanted value); false to let prepare_alive() queue
// the update.
bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
  int from = m->get_orig_source().num();

  // check permissions, ignore if failed
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  if (!osdmap.is_up(from) ||
      osdmap.get_inst(from) != m->get_orig_source_inst()) {
    dout(7) << "preprocess_alive ignoring alive message from down " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  if (osdmap.get_up_thru(from) >= m->want) {
    // yup. already satisfied; just reply with the map.
    dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    return true;
  }

  dout(10) << "preprocess_alive want up_thru " << m->want
	   << " from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
2577
2578bool OSDMonitor::prepare_alive(MonOpRequestRef op)
2579{
2580 op->mark_osdmon_event(__func__);
2581 MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
2582 int from = m->get_orig_source().num();
2583
2584 if (0) { // we probably don't care much about these
2585 mon->clog->debug() << m->get_orig_source_inst() << " alive";
2586 }
2587
2588 dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
2589 << " from " << m->get_orig_source_inst() << dendl;
2590
2591 update_up_thru(from, m->version); // set to the latest map the OSD has
2592 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
2593 return true;
2594}
2595
2596void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
2597{
2598 op->mark_osdmon_event(__func__);
2599 dout(7) << "_reply_map " << e
2600 << " from " << op->get_req()->get_orig_source_inst()
2601 << dendl;
2602 send_latest(op, e);
2603}
2604
2605// pg_created
2606bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
2607{
2608 op->mark_osdmon_event(__func__);
2609 auto m = static_cast<MOSDPGCreated*>(op->get_req());
2610 dout(10) << __func__ << " " << *m << dendl;
2611 auto session = m->get_session();
2612 if (!session) {
2613 dout(10) << __func__ << ": no monitor session!" << dendl;
2614 return true;
2615 }
2616 if (!session->is_capable("osd", MON_CAP_X)) {
2617 derr << __func__ << " received from entity "
2618 << "with insufficient privileges " << session->caps << dendl;
2619 return true;
2620 }
2621 // always forward the "created!" to the leader
2622 return false;
2623}
2624
2625bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
2626{
2627 op->mark_osdmon_event(__func__);
2628 auto m = static_cast<MOSDPGCreated*>(op->get_req());
2629 dout(10) << __func__ << " " << *m << dendl;
2630 auto src = m->get_orig_source();
2631 auto from = src.num();
2632 if (!src.is_osd() ||
2633 !mon->osdmon()->osdmap.is_up(from) ||
2634 m->get_orig_source_inst() != mon->osdmon()->osdmap.get_inst(from)) {
2635 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
2636 return false;
2637 }
2638 pending_created_pgs.push_back(m->pgid);
2639 return true;
2640}
2641
2642// -------------
2643// pg_temp changes
2644
// Read-only validation of an MOSDPGTemp message.  Filters out entries for
// removed pools or pgs whose acting primary has changed, and detects
// whether any entry would actually change pg_temp/primary_temp state.
// Returns false (proceed to prepare_pgtemp) as soon as one real change is
// found; returns true when everything is a no-op or ignorable.
bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
{
  MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
  dout(10) << "preprocess_pgtemp " << *m << dendl;
  mempool::osdmap::vector<int> empty;
  int from = m->get_orig_source().num();
  size_t ignore_cnt = 0;

  // check caps
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  if (!osdmap.is_up(from) ||
      osdmap.get_inst(from) != m->get_orig_source_inst()) {
    dout(7) << "ignoring pgtemp message from down " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    dout(20) << " " << p->first
	     << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
	     << " -> " << p->second << dendl;

    // does the pool exist?
    if (!osdmap.have_pg_pool(p->first.pool())) {
      /*
       * 1. If the osdmap does not have the pool, it means the pool has been
       *    removed in-between the osd sending this message and us handling it.
       * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
       *    not exist in the pending either, as the osds would not send a
       *    message about a pool they know nothing about (yet).
       * 3. However, if the pool does exist in the pending, then it must be a
       *    new pool, and not relevant to this message (see 1).
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool has been removed" << dendl;
      ignore_cnt++;
      continue;
    }

    int acting_primary = -1;
    osdmap.pg_to_up_acting_osds(
      p->first, nullptr, nullptr, nullptr, &acting_primary);
    if (acting_primary != from) {
      /* If the source isn't the primary based on the current osdmap, we know
       * that the interval changed and that we can discard this message.
       * Indeed, we must do so to avoid 16127 since we can't otherwise determine
       * which of two pg temp mappings on the same pg is more recent.
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": primary has changed" << dendl;
      ignore_cnt++;
      continue;
    }

    // removal?  (empty mapping clears an existing pg_temp/primary_temp)
    if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
			      osdmap.primary_temp->count(p->first)))
      return false;
    // change?
    // NOTE: we assume that this will clear pg_primary, so consider
    //       an existing pg_primary field to imply a change
    if (p->second.size() &&
	(osdmap.pg_temp->count(p->first) == 0 ||
	 !vectors_equal(osdmap.pg_temp->get(p->first), p->second) ||
	 osdmap.primary_temp->count(p->first)))
      return false;
  }

  // should we ignore all the pgs?
  if (ignore_cnt == m->pg_temp.size())
    goto ignore;

  dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
  _reply_map(op, m->map_epoch);
  return true;

 ignore:
  return true;
}
2731
2732void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
2733{
2734 epoch_t old_up_thru = osdmap.get_up_thru(from);
2735 auto ut = pending_inc.new_up_thru.find(from);
2736 if (ut != pending_inc.new_up_thru.end()) {
2737 old_up_thru = ut->second;
2738 }
2739 if (up_thru > old_up_thru) {
2740 // set up_thru too, so the osd doesn't have to ask again
2741 pending_inc.new_up_thru[from] = up_thru;
2742 }
2743}
2744
2745bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
2746{
2747 op->mark_osdmon_event(__func__);
2748 MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
2749 int from = m->get_orig_source().num();
2750 dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
2751 for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
2752 uint64_t pool = p->first.pool();
2753 if (pending_inc.old_pools.count(pool)) {
2754 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
2755 << ": pool pending removal" << dendl;
2756 continue;
2757 }
2758 if (!osdmap.have_pg_pool(pool)) {
2759 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
2760 << ": pool has been removed" << dendl;
2761 continue;
2762 }
2763 pending_inc.new_pg_temp[p->first] =
2764 mempool::osdmap::vector<int>(p->second.begin(), p->second.end());
2765
2766 // unconditionally clear pg_primary (until this message can encode
2767 // a change for that, too.. at which point we need to also fix
2768 // preprocess_pg_temp)
2769 if (osdmap.primary_temp->count(p->first) ||
2770 pending_inc.new_primary_temp.count(p->first))
2771 pending_inc.new_primary_temp[p->first] = -1;
2772 }
2773
2774 // set up_thru too, so the osd doesn't have to ask again
2775 update_up_thru(from, m->map_epoch);
2776
2777 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
2778 return true;
2779}
2780
2781
2782// ---
2783
// Read-only validation of an MRemoveSnaps message.  Returns false (go to
// prepare_remove_snaps) as soon as any listed snap is not yet recorded as
// removed in the committed map; returns true when everything is already
// accounted for (or the sender lacks the rmsnap capability).
bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
  dout(7) << "preprocess_remove_snaps " << *m << dendl;

  // check privilege, ignore if failed
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->caps.is_capable(
	g_ceph_context,
	CEPH_ENTITY_TYPE_MON,
	session->entity_name,
	"osd", "osd pool rmsnap", {}, true, true, false)) {
    dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
       q != m->snaps.end();
       ++q) {
    if (!osdmap.have_pg_pool(q->first)) {
      dout(10) << " ignoring removed_snaps " << q->second << " on non-existent pool " << q->first << dendl;
      continue;
    }
    const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
    for (vector<snapid_t>::iterator p = q->second.begin();
	 p != q->second.end();
	 ++p) {
      // a snap beyond the pool's snap_seq or not yet in removed_snaps
      // still needs to be recorded -> proceed to prepare.
      if (*p > pi->get_snap_seq() ||
	  !pi->removed_snaps.contains(*p))
	return false;
    }
  }

 ignore:
  return true;
}
2824
// Apply a batch of snapshot removals (MRemoveSnaps) to the pending
// osdmap increment.  preprocess_remove_snaps() has already filtered out
// requests that require no update.  Always returns true so the pending
// increment gets proposed.
bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
  dout(7) << "prepare_remove_snaps " << *m << dendl;

  // m->snaps maps pool id -> snapids to remove
  for (map<int, vector<snapid_t> >::iterator p = m->snaps.begin();
       p != m->snaps.end();
       ++p) {

    if (!osdmap.have_pg_pool(p->first)) {
      // pool has since been deleted; nothing to record
      dout(10) << " ignoring removed_snaps " << p->second << " on non-existent pool " << p->first << dendl;
      continue;
    }

    pg_pool_t& pi = osdmap.pools[p->first];
    for (vector<snapid_t>::iterator q = p->second.begin();
	 q != p->second.end();
	 ++q) {
      // only touch the pending pool if this snap is not already marked
      // removed in either the committed map or the pending increment
      if (!pi.removed_snaps.contains(*q) &&
	  (!pending_inc.new_pools.count(p->first) ||
	   !pending_inc.new_pools[p->first].removed_snaps.contains(*q))) {
	pg_pool_t *newpi = pending_inc.get_new_pool(p->first, &pi);
	newpi->removed_snaps.insert(*q);
	dout(10) << " pool " << p->first << " removed_snaps added " << *q
		 << " (now " << newpi->removed_snaps << ")" << dendl;
	if (*q > newpi->get_snap_seq()) {
	  // keep snap_seq >= the highest removed snapid
	  dout(10) << " pool " << p->first << " snap_seq " << newpi->get_snap_seq() << " -> " << *q << dendl;
	  newpi->set_snap_seq(*q);
	}
	newpi->set_snap_epoch(pending_inc.epoch);
      }
    }
  }
  return true;
}
2861
2862// osd beacon
2863bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
2864{
2865 op->mark_osdmon_event(__func__);
2866 auto beacon = static_cast<MOSDBeacon*>(op->get_req());
2867 // check caps
2868 auto session = beacon->get_session();
2869 if (!session) {
2870 dout(10) << __func__ << " no monitor session!" << dendl;
2871 return true;
2872 }
2873 if (!session->is_capable("osd", MON_CAP_X)) {
2874 derr << __func__ << " received from entity "
2875 << "with insufficient privileges " << session->caps << dendl;
2876 return true;
2877 }
2878 // Always forward the beacon to the leader, even if they are the same as
2879 // the old one. The leader will mark as down osds that haven't sent
2880 // beacon for a few minutes.
2881 return false;
2882}
2883
// Record an osd beacon on the leader: refresh the osd's last-report
// timestamp, note the osdmap epoch the osd reports, and feed its per-pg
// last-epoch-clean data into last_epoch_clean.  Returns false (no map
// change is proposed directly from here).
bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const auto beacon = static_cast<MOSDBeacon*>(op->get_req());
  const auto src = beacon->get_orig_source();
  dout(10) << __func__ << " " << *beacon
	   << " from " << src << dendl;
  int from = src.num();

  // accept only beacons from osds that are up and whose address matches
  // what the current osdmap records for them
  if (!src.is_osd() ||
      !osdmap.is_up(from) ||
      beacon->get_orig_source_inst() != osdmap.get_inst(from)) {
    dout(1) << " ignoring beacon from non-active osd." << dendl;
    return false;
  }

  last_osd_report[from] = ceph_clock_now();
  osd_epochs[from] = beacon->version;

  for (const auto& pg : beacon->pgs) {
    last_epoch_clean.report(pg, beacon->min_last_epoch_clean);
  }
  return false;
}
2908
2909// ---------------
2910// map helpers
2911
2912void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
2913{
2914 op->mark_osdmon_event(__func__);
2915 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
2916 << " start " << start << dendl;
2917 if (start == 0)
2918 send_full(op);
2919 else
2920 send_incremental(op, start);
2921}
2922
2923
2924MOSDMap *OSDMonitor::build_latest_full()
2925{
2926 MOSDMap *r = new MOSDMap(mon->monmap->fsid);
2927 get_version_full(osdmap.get_epoch(), r->maps[osdmap.get_epoch()]);
2928 r->oldest_map = get_first_committed();
2929 r->newest_map = osdmap.get_epoch();
2930 return r;
2931}
2932
2933MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to)
2934{
2935 dout(10) << "build_incremental [" << from << ".." << to << "]" << dendl;
2936 MOSDMap *m = new MOSDMap(mon->monmap->fsid);
2937 m->oldest_map = get_first_committed();
2938 m->newest_map = osdmap.get_epoch();
2939
2940 for (epoch_t e = to; e >= from && e > 0; e--) {
2941 bufferlist bl;
2942 int err = get_version(e, bl);
2943 if (err == 0) {
2944 assert(bl.length());
2945 // if (get_version(e, bl) > 0) {
2946 dout(20) << "build_incremental inc " << e << " "
2947 << bl.length() << " bytes" << dendl;
2948 m->incremental_maps[e] = bl;
2949 } else {
2950 assert(err == -ENOENT);
2951 assert(!bl.length());
2952 get_version_full(e, bl);
2953 if (bl.length() > 0) {
2954 //else if (get_version("full", e, bl) > 0) {
2955 dout(20) << "build_incremental full " << e << " "
2956 << bl.length() << " bytes" << dendl;
2957 m->maps[e] = bl;
2958 } else {
2959 ceph_abort(); // we should have all maps.
2960 }
2961 }
2962 }
2963 return m;
2964}
2965
2966void OSDMonitor::send_full(MonOpRequestRef op)
2967{
2968 op->mark_osdmon_event(__func__);
2969 dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
2970 mon->send_reply(op, build_latest_full());
2971}
2972
// Send incremental osdmaps starting at 'first' as a reply to 'op'.
// When the request was proxied through another monitor that supports
// MON_ROUTE_OSDMAP, delegate the map delivery back to that monitor via
// an MRoute; otherwise send the maps ourselves.
void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
{
  op->mark_osdmon_event(__func__);

  MonSession *s = op->get_session();
  assert(s);

  if (s->proxy_con &&
      s->proxy_con->has_feature(CEPH_FEATURE_MON_ROUTE_OSDMAP)) {
    // oh, we can tell the other mon to do it
    dout(10) << __func__ << " asking proxying mon to send_incremental from "
	     << first << dendl;
    MRoute *r = new MRoute(s->proxy_tid, NULL);
    r->send_osdmap_first = first;
    s->proxy_con->send_message(r);
    op->mark_event("reply: send routed send_osdmap_first reply");
  } else {
    // do it ourselves
    send_incremental(first, s, false, op);
  }
}
2994
// Send osdmaps in the range (first .. current] to 'session'.
// If 'req' is set this is a single reply to a specific request and at
// most one message is sent; otherwise maps are streamed in batches of up
// to osd_map_message_max epochs until the session is caught up (or just
// one batch when 'onetime' is set).  session->osd_epoch tracks the
// newest epoch the peer is known to have.
void OSDMonitor::send_incremental(epoch_t first,
				  MonSession *session,
				  bool onetime,
				  MonOpRequestRef req)
{
  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
	  << " to " << session->inst << dendl;

  if (first <= session->osd_epoch) {
    // peer already has everything up to osd_epoch; skip ahead
    dout(10) << __func__ << " " << session->inst << " should already have epoch "
	     << session->osd_epoch << dendl;
    first = session->osd_epoch + 1;
  }

  if (first < get_first_committed()) {
    // the requested epochs have been trimmed; restart the peer from a
    // full map at the oldest epoch we still have
    first = get_first_committed();
    bufferlist bl;
    int err = get_version_full(first, bl);
    assert(err == 0);
    assert(bl.length());

    dout(20) << "send_incremental starting with base full "
	     << first << " " << bl.length() << " bytes" << dendl;

    MOSDMap *m = new MOSDMap(osdmap.get_fsid());
    m->oldest_map = get_first_committed();
    m->newest_map = osdmap.get_epoch();
    m->maps[first] = bl;

    if (req) {
      // the base full map is the single reply; the peer will request
      // more maps once it has consumed this one
      mon->send_reply(req, m);
      session->osd_epoch = first;
      return;
    } else {
      session->con->send_message(m);
      session->osd_epoch = first;
    }
    first++;
  }

  while (first <= osdmap.get_epoch()) {
    // cap each message at osd_map_message_max epochs
    epoch_t last = MIN(first + g_conf->osd_map_message_max - 1,
		       osdmap.get_epoch());
    MOSDMap *m = build_incremental(first, last);

    if (req) {
      // send some maps. it may not be all of them, but it will get them
      // started.
      mon->send_reply(req, m);
    } else {
      session->con->send_message(m);
      first = last + 1;
    }
    session->osd_epoch = last;
    // a reply path or a onetime subscription sends exactly one batch
    if (onetime || req)
      break;
  }
}
3053
3054int OSDMonitor::get_version(version_t ver, bufferlist& bl)
3055{
3056 if (inc_osd_cache.lookup(ver, &bl)) {
3057 return 0;
3058 }
3059 int ret = PaxosService::get_version(ver, bl);
3060 if (!ret) {
3061 inc_osd_cache.add(ver, bl);
3062 }
3063 return ret;
3064}
3065
3066int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
3067{
3068 if (full_osd_cache.lookup(ver, &bl)) {
3069 return 0;
3070 }
3071 int ret = PaxosService::get_version_full(ver, bl);
3072 if (!ret) {
3073 full_osd_cache.add(ver, bl);
3074 }
3075 return ret;
3076}
3077
// Queue a blacklist entry for address 'a' lasting until 'until' in the
// pending increment and return the epoch in which it will take effect.
// The caller is responsible for getting the increment proposed/committed.
epoch_t OSDMonitor::blacklist(const entity_addr_t& a, utime_t until)
{
  dout(10) << "blacklist " << a << " until " << until << dendl;
  pending_inc.new_blacklist[a] = until;
  return pending_inc.epoch;
}
3084
3085
3086void OSDMonitor::check_osdmap_subs()
3087{
3088 dout(10) << __func__ << dendl;
3089 if (!osdmap.get_epoch()) {
3090 return;
3091 }
3092 auto osdmap_subs = mon->session_map.subs.find("osdmap");
3093 if (osdmap_subs == mon->session_map.subs.end()) {
3094 return;
3095 }
3096 auto p = osdmap_subs->second->begin();
3097 while (!p.end()) {
3098 auto sub = *p;
3099 ++p;
3100 check_osdmap_sub(sub);
3101 }
3102}
3103
3104void OSDMonitor::check_osdmap_sub(Subscription *sub)
3105{
3106 dout(10) << __func__ << " " << sub << " next " << sub->next
3107 << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
3108 if (sub->next <= osdmap.get_epoch()) {
3109 if (sub->next >= 1)
3110 send_incremental(sub->next, sub->session, sub->incremental_onetime);
3111 else
3112 sub->session->con->send_message(build_latest_full());
3113 if (sub->onetime)
3114 mon->session_map.remove_sub(sub);
3115 else
3116 sub->next = osdmap.get_epoch() + 1;
3117 }
3118}
3119
// Notify all "osd_pg_creates" subscribers of pending pg creations.
// Only applies once the monitor cluster requires luminous (pre-luminous,
// PGMonitor owns this) and at least one osd is up.
void OSDMonitor::check_pg_creates_subs()
{
  if (!mon->monmap->get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    // PGMonitor takes care of this in pre-luminous era.
    return;
  }
  if (!osdmap.get_num_up_osds()) {
    return;
  }
  // all up osds are expected to support stateful subscriptions here
  assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
  mon->with_session_map([this](const MonSessionMap& session_map) {
      auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
      if (pg_creates_subs == session_map.subs.end()) {
	return;
      }
      for (auto sub : *pg_creates_subs->second) {
	check_pg_creates_sub(sub);
      }
    });
}
3141
// Push pending pg-create messages to a single "osd_pg_creates"
// subscriber, but only while that osd is marked up; osds that are down
// get caught up via check_subs() when they boot.  Advances sub->next
// past whatever was sent.
void OSDMonitor::check_pg_creates_sub(Subscription *sub)
{
  dout(20) << __func__ << " .. " << sub->session->inst << dendl;
  assert(sub->type == "osd_pg_creates");
  // only send these if the OSD is up. we will check_subs() when they do
  // come up so they will get the creates then.
  if (sub->session->inst.name.is_osd() &&
      mon->osdmon()->osdmap.is_up(sub->session->inst.name.num())) {
    sub->next = send_pg_creates(sub->session->inst.name.num(),
				sub->session->con.get(),
				sub->next);
  }
}
3155
c07f9fc5
FG
// Tag pool 'pool_id' with application 'app_name' (empty metadata map)
// in the pending increment.  Caller must have paxos plugged and the
// service writeable; application metadata requires a luminous-or-later
// cluster, asserted below.
void OSDMonitor::do_application_enable(int64_t pool_id,
                                       const std::string &app_name)
{
  assert(paxos->is_plugged() && is_writeable());

  dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
	   << dendl;

  assert(osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS ||
	 pending_inc.new_require_osd_release >= CEPH_RELEASE_LUMINOUS);

  auto pp = osdmap.get_pg_pool(pool_id);
  assert(pp != nullptr);

  // start from the pending version of the pool if one exists so other
  // queued changes are not clobbered
  pg_pool_t p = *pp;
  if (pending_inc.new_pools.count(pool_id)) {
    p = pending_inc.new_pools[pool_id];
  }

  p.application_metadata.insert({app_name, {}});
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool_id] = p;
}
3179
// Scan 'pools' for pools whose pgs still need creating and queue them on
// 'creating_pgs'.  Skips pools whose crush rule cannot be resolved,
// pools unchanged since the last scan epoch, and pools queued for
// removal.  Returns the number of pools newly queued.
unsigned OSDMonitor::scan_for_creating_pgs(
  const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
  const mempool::osdmap::set<int64_t>& removed_pools,
  utime_t modified,
  creating_pgs_t* creating_pgs) const
{
  unsigned queued = 0;
  for (auto& p : pools) {
    int64_t poolid = p.first;
    const pg_pool_t& pool = p.second;
    int ruleno = osdmap.crush->find_rule(pool.get_crush_rule(),
					 pool.get_type(), pool.get_size());
    if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
      continue;  // cannot map pgs without a valid rule

    const auto last_scan_epoch = creating_pgs->last_scan_epoch;
    const auto created = pool.get_last_change();
    if (last_scan_epoch && created <= last_scan_epoch) {
      // pool unchanged since the previous scan; already handled
      dout(10) << __func__ << " no change in pool " << poolid
	       << " " << pool << dendl;
      continue;
    }
    if (removed_pools.count(poolid)) {
      dout(10) << __func__ << " pool is being removed: " << poolid
	       << " " << pool << dendl;
      continue;
    }
    dout(10) << __func__ << " queueing pool create for " << poolid
	     << " " << pool << dendl;
    // create_pool() returns whether the pool was actually enqueued
    if (creating_pgs->create_pool(poolid, pool.get_pg_num(),
				  created, modified)) {
      queued++;
    }
  }
  return queued;
}
3216
// Rebuild creating_pgs_by_osd_epoch: for each pg still being created,
// determine which osd (acting primary) should receive its create message
// and at which epoch.  If the target osd changed since the previous
// pass, the epoch is bumped to the current mapping epoch so the new
// primary gets a (re)send.
void OSDMonitor::update_creating_pgs()
{
  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
	   << creating_pgs.queue.size() << " pools in queue" << dendl;
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (const auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    // epoch recorded for this pg in creating_pgs; may be replaced below
    auto mapped = pg.second.first;
    dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
    mapping.get(pgid, nullptr, nullptr, nullptr, &acting_primary);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
	if (pgs.second.count(pgid)) {
	  if (last_acting_primary == acting_primary) {
	    // same target as before: keep the previously noted epoch
	    mapped = pgs.first;
	  } else {
	    dout(20) << __func__ << " " << pgid << " "
		     << " acting_primary:" << last_acting_primary
		     << " -> " << acting_primary << dendl;
	    // note epoch if the target of the create message changed.
	    mapped = mapping.get_epoch();
	  }
	  break;
	} else {
	  // newly creating
	  mapped = mapping.get_epoch();
	}
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
	     << " to create " << pgid << "@" << mapped << dendl;
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(pgid);
  }
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
3258
// Send pending pg-create messages for osd.'osd' covering epochs >= 'next'
// over connection 'con'.  Returns the epoch the subscription is now
// current through (last sent + 1), or 'next' unchanged when there is
// nothing to send or the cached creating_pgs mapping is stale.
epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
{
  dout(30) << __func__ << " osd." << osd << " next=" << next
	   << " " << creating_pgs_by_osd_epoch << dendl;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
    dout(20) << __func__
	     << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
    // the subscribers will be updated when the mapping is completed anyway
    return next;
  }
  auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
  if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
    return next;  // no creates queued for this osd
  assert(!creating_pgs_by_epoch->second.empty());

  MOSDPGCreate *m = nullptr;  // allocated lazily on the first pg to send
  epoch_t last = 0;
  for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
       epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
    auto epoch = epoch_pgs->first;
    auto& pgs = epoch_pgs->second;
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
    last = epoch;
    for (auto& pg : pgs) {
      if (!m)
	m = new MOSDPGCreate(creating_pgs_epoch);
      // Need the create time from the monitor using its clock to set
      // last_scrub_stamp upon pg creation.
      auto create = creating_pgs.pgs.find(pg);
      assert(create != creating_pgs.pgs.end());
      m->mkpg.emplace(pg, pg_create_t{create->second.first, pg, 0});
      m->ctimes.emplace(pg, create->second.second);
      dout(20) << __func__ << " will create " << pg
	       << " at " << create->second.first << dendl;
    }
  }
  if (!m) {
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " has nothing to send" << dendl;
    return next;
  }
  con->send_message(m);
  // sub is current through last + 1
  return last + 1;
}
3306
3307// TICK
3308
3309
// Periodic leader-side maintenance: time out silent osds (luminous+),
// mark failed osds down, auto-out osds that have been down beyond their
// grace period, expire blacklist entries, and (pre-luminous only)
// maintain the cluster-wide FULL/NEARFULL flags.  Proposes a new map
// epoch when anything changed.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // only the leader makes these decisions
  if (!mon->is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  // luminous+: osds report via beacons; mark down the ones that stopped
  if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
      mon->monmap->get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    if (handle_osd_timeouts(now, last_osd_report)) {
      do_propose = true;
    }
  }

  // mark osds down?
  if (check_failures(now))
    do_propose = true;

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      utime_t down = now;
      down -= i->second;  // how long this osd has been down
      ++i;

      if (osdmap.is_down(o) &&
	  osdmap.is_in(o) &&
	  can_mark_out(o)) {
	utime_t orig_grace(g_conf->mon_osd_down_out_interval, 0);
	utime_t grace = orig_grace;
	double my_grace = 0.0;

	if (g_conf->mon_osd_adjust_down_out_interval) {
	  // scale grace period the same way we do the heartbeat grace.
	  const osd_xinfo_t& xi = osdmap.get_xinfo(o);
	  double halflife = (double)g_conf->mon_osd_laggy_halflife;
	  double decay_k = ::log(.5) / halflife;
	  double decay = exp((double)down * decay_k);
	  dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
		   << " down for " << down << " decay " << decay << dendl;
	  my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
	  grace += my_grace;
	}

	// is this an entire large subtree down?
	if (g_conf->mon_osd_down_out_subtree_limit.length()) {
	  int type = osdmap.crush->get_type_id(g_conf->mon_osd_down_out_subtree_limit);
	  if (type > 0) {
	    if (osdmap.containing_subtree_is_down(g_ceph_context, o, type, &down_cache)) {
	      dout(10) << "tick entire containing " << g_conf->mon_osd_down_out_subtree_limit
		       << " subtree for osd." << o << " is down; resetting timer" << dendl;
	      // reset timer, too.
	      down_pending_out[o] = now;
	      continue;
	    }
	  }
	}

	// destroyed osds use a separate (mon_osd_destroyed_out_interval)
	// timeout from live-but-down ones
	bool down_out = !osdmap.is_destroyed(o) &&
	  g_conf->mon_osd_down_out_interval > 0 && down.sec() >= grace;
	bool destroyed_out = osdmap.is_destroyed(o) &&
	  g_conf->mon_osd_destroyed_out_interval > 0 &&
	  // this is not precise enough as we did not make a note when this osd
	  // was marked as destroyed, but let's not bother with that
	  // complexity for now.
	  down.sec() >= g_conf->mon_osd_destroyed_out_interval;
	if (down_out || destroyed_out) {
	  dout(10) << "tick marking osd." << o << " OUT after " << down
		   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
	  pending_inc.new_weight[o] = CEPH_OSD_OUT;

	  // set the AUTOOUT bit.
	  if (pending_inc.new_state.count(o) == 0)
	    pending_inc.new_state[o] = 0;
	  pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

	  // remember previous weight
	  if (pending_inc.new_xinfo.count(o) == 0)
	    pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

	  do_propose = true;

	  mon->clog->info() << "Marking osd." << o << " out (has been down for "
			    << int(down.sec()) << " seconds)";
	} else
	  continue;
      }

      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blacklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
       p != osdmap.blacklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blacklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blacklist.push_back(p->first);
      do_propose = true;
    }
  }

  // if map full setting has changed, get that info out there!
  if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS &&
      mon->pgservice->is_readable()) {
    // for pre-luminous compat only!
    if (mon->pgservice->have_full_osds()) {
      dout(5) << "There are full osds, setting full flag" << dendl;
      add_flag(CEPH_OSDMAP_FULL);
    } else if (osdmap.test_flag(CEPH_OSDMAP_FULL)){
      dout(10) << "No full osds, removing full flag" << dendl;
      remove_flag(CEPH_OSDMAP_FULL);
    }

    if (mon->pgservice->have_nearfull_osds()) {
      dout(5) << "There are near full osds, setting nearfull flag" << dendl;
      add_flag(CEPH_OSDMAP_NEARFULL);
    } else if (osdmap.test_flag(CEPH_OSDMAP_NEARFULL)){
      dout(10) << "No near full osds, removing nearfull flag" << dendl;
      remove_flag(CEPH_OSDMAP_NEARFULL);
    }
    if (pending_inc.new_flags != -1 &&
	(pending_inc.new_flags ^ osdmap.flags) & (CEPH_OSDMAP_FULL | CEPH_OSDMAP_NEARFULL)) {
      dout(1) << "New setting for" <<
	(pending_inc.new_flags & CEPH_OSDMAP_FULL ? " CEPH_OSDMAP_FULL" : "") <<
	(pending_inc.new_flags & CEPH_OSDMAP_NEARFULL ? " CEPH_OSDMAP_NEARFULL" : "")
	      << " -- doing propose" << dendl;
      do_propose = true;
    }
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty())  // also propose if we adjusted pg_temp
    propose_pending();
}
3467
// Mark down any up osd that has not reported (beacon) within
// mon_osd_report_timeout seconds.  'last_osd_report' records the last
// time each osd reported; entries are started lazily and pruned for
// osds that no longer exist.  Returns true if any osd was newly marked
// down (caller proposes the pending increment).
bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
				     std::map<int,utime_t> &last_osd_report)
{
  utime_t timeo(g_conf->mon_osd_report_timeout, 0);
  if (now - mon->get_leader_since() < timeo) {
    // We haven't been the leader for long enough to consider OSD timeouts
    return false;
  }

  int max_osd = osdmap.get_max_osd();
  bool new_down = false;

  for (int i=0; i < max_osd; ++i) {
    dout(30) << __func__ << ": checking up on osd " << i << dendl;
    if (!osdmap.exists(i)) {
      last_osd_report.erase(i); // if any
      continue;
    }
    if (!osdmap.is_up(i))
      continue;  // already down; nothing to time out
    const std::map<int,utime_t>::const_iterator t = last_osd_report.find(i);
    if (t == last_osd_report.end()) {
      // it wasn't in the map; start the timer.
      last_osd_report[i] = now;
    } else if (can_mark_down(i)) {
      utime_t diff = now - t->second;
      if (diff > timeo) {
	mon->clog->info() << "osd." << i << " marked down after no beacon for "
			  << diff << " seconds";
	derr << "no beacon from osd." << i << " since " << t->second
	     << ", " << diff << " seconds ago. marking down" << dendl;
	// new_state is an xor mask: setting CEPH_OSD_UP here flips the
	// up bit, i.e. marks the osd down in the next map
	pending_inc.new_state[i] = CEPH_OSD_UP;
	new_down = true;
      }
    }
  }
  return new_down;
}
3506
3507void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
3508 list<pair<health_status_t,string> > *detail,
3509 CephContext *cct) const
3510{
3511 int num_osds = osdmap.get_num_osds();
3512
3513 if (num_osds == 0) {
3514 summary.push_back(make_pair(HEALTH_ERR, "no osds"));
3515 } else {
3516 int num_in_osds = 0;
3517 int num_down_in_osds = 0;
3518 set<int> osds;
31f18b77
FG
3519 set<int> down_in_osds;
3520 set<int> up_in_osds;
3521 set<int> subtree_up;
3522 unordered_map<int, set<int> > subtree_type_down;
3523 unordered_map<int, int> num_osds_subtree;
3524 int max_type = osdmap.crush->get_max_type_id();
3525
7c673cae
FG
3526 for (int i = 0; i < osdmap.get_max_osd(); i++) {
3527 if (!osdmap.exists(i)) {
3528 if (osdmap.crush->item_exists(i)) {
3529 osds.insert(i);
3530 }
31f18b77 3531 continue;
224ce89b 3532 }
7c673cae
FG
3533 if (osdmap.is_out(i))
3534 continue;
3535 ++num_in_osds;
31f18b77
FG
3536 if (down_in_osds.count(i) || up_in_osds.count(i))
3537 continue;
7c673cae 3538 if (!osdmap.is_up(i)) {
31f18b77
FG
3539 down_in_osds.insert(i);
3540 int parent_id = 0;
3541 int current = i;
3542 for (int type = 0; type <= max_type; type++) {
3543 if (!osdmap.crush->get_type_name(type))
3544 continue;
3545 int r = osdmap.crush->get_immediate_parent_id(current, &parent_id);
3546 if (r == -ENOENT)
3547 break;
3548 // break early if this parent is already marked as up
3549 if (subtree_up.count(parent_id))
3550 break;
3551 type = osdmap.crush->get_bucket_type(parent_id);
3552 if (!osdmap.subtree_type_is_down(
3553 g_ceph_context, parent_id, type,
3554 &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
3555 break;
3556 current = parent_id;
3557 }
3558 }
3559 }
3560
3561 // calculate the number of down osds in each down subtree and
3562 // store it in num_osds_subtree
3563 for (int type = 1; type <= max_type; type++) {
3564 if (!osdmap.crush->get_type_name(type))
3565 continue;
3566 for (auto j = subtree_type_down[type].begin();
3567 j != subtree_type_down[type].end();
3568 ++j) {
3569 if (type == 1) {
3570 list<int> children;
3571 int num = osdmap.crush->get_children(*j, &children);
3572 num_osds_subtree[*j] = num;
3573 } else {
3574 list<int> children;
3575 int num = 0;
3576 int num_children = osdmap.crush->get_children(*j, &children);
3577 if (num_children == 0)
3578 continue;
3579 for (auto l = children.begin(); l != children.end(); ++l) {
3580 if (num_osds_subtree[*l] > 0) {
3581 num = num + num_osds_subtree[*l];
3582 }
3583 }
3584 num_osds_subtree[*j] = num;
7c673cae
FG
3585 }
3586 }
3587 }
31f18b77 3588 num_down_in_osds = down_in_osds.size();
7c673cae
FG
3589 assert(num_down_in_osds <= num_in_osds);
3590 if (num_down_in_osds > 0) {
31f18b77
FG
3591 // summary of down subtree types and osds
3592 for (int type = max_type; type > 0; type--) {
3593 if (!osdmap.crush->get_type_name(type))
3594 continue;
3595 if (subtree_type_down[type].size() > 0) {
3596 ostringstream ss;
3597 ss << subtree_type_down[type].size() << " "
3598 << osdmap.crush->get_type_name(type);
3599 if (subtree_type_down[type].size() > 1) {
3600 ss << "s";
3601 }
3602 int sum_down_osds = 0;
3603 for (auto j = subtree_type_down[type].begin();
3604 j != subtree_type_down[type].end();
3605 ++j) {
3606 sum_down_osds = sum_down_osds + num_osds_subtree[*j];
3607 }
3608 ss << " (" << sum_down_osds << " osds) down";
3609 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3610 }
3611 }
7c673cae 3612 ostringstream ss;
31f18b77 3613 ss << down_in_osds.size() << " osds down";
7c673cae 3614 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
31f18b77
FG
3615
3616 if (detail) {
3617 // details of down subtree types
3618 for (int type = max_type; type > 0; type--) {
3619 if (!osdmap.crush->get_type_name(type))
3620 continue;
3621 for (auto j = subtree_type_down[type].rbegin();
3622 j != subtree_type_down[type].rend();
3623 ++j) {
3624 ostringstream ss;
3625 ss << osdmap.crush->get_type_name(type);
3626 ss << " ";
3627 ss << osdmap.crush->get_item_name(*j);
3628 // at the top level, do not print location
3629 if (type != max_type) {
3630 ss << " (";
3631 ss << osdmap.crush->get_full_location_ordered_string(*j);
3632 ss << ")";
3633 }
3634 int num = num_osds_subtree[*j];
3635 ss << " (" << num << " osds)";
3636 ss << " is down";
3637 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3638 }
3639 }
3640 // details of down osds
3641 for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
3642 ostringstream ss;
3643 ss << "osd." << *it << " (";
3644 ss << osdmap.crush->get_full_location_ordered_string(*it);
3645 ss << ") is down";
3646 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3647 }
3648 }
7c673cae
FG
3649 }
3650
3651 if (!osds.empty()) {
3652 ostringstream ss;
31f18b77 3653 ss << osds.size() << " osds exist in the crush map but not in the osdmap";
7c673cae
FG
3654 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3655 if (detail) {
31f18b77 3656 ss << " (osds: " << osds << ")";
7c673cae
FG
3657 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3658 }
3659 }
3660
7c673cae
FG
3661 // note: we leave it to ceph-mgr to generate details health warnings
3662 // with actual osd utilizations
3663
3664 // warn about flags
3665 uint64_t warn_flags =
3666 CEPH_OSDMAP_FULL |
3667 CEPH_OSDMAP_PAUSERD |
3668 CEPH_OSDMAP_PAUSEWR |
3669 CEPH_OSDMAP_PAUSEREC |
3670 CEPH_OSDMAP_NOUP |
3671 CEPH_OSDMAP_NODOWN |
3672 CEPH_OSDMAP_NOIN |
3673 CEPH_OSDMAP_NOOUT |
3674 CEPH_OSDMAP_NOBACKFILL |
3675 CEPH_OSDMAP_NORECOVER |
3676 CEPH_OSDMAP_NOSCRUB |
3677 CEPH_OSDMAP_NODEEP_SCRUB |
3678 CEPH_OSDMAP_NOTIERAGENT |
3679 CEPH_OSDMAP_NOREBALANCE;
3680 if (osdmap.test_flag(warn_flags)) {
3681 ostringstream ss;
3682 ss << osdmap.get_flag_string(osdmap.get_flags() & warn_flags)
3683 << " flag(s) set";
3684 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3685 if (detail)
3686 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3687 }
3688
3689 // old crush tunables?
3690 if (g_conf->mon_warn_on_legacy_crush_tunables) {
3691 string min = osdmap.crush->get_min_required_version();
3692 if (min < g_conf->mon_crush_min_required_version) {
3693 ostringstream ss;
3694 ss << "crush map has legacy tunables (require " << min
3695 << ", min is " << g_conf->mon_crush_min_required_version << ")";
3696 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3697 if (detail) {
3698 ss << "; see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables";
3699 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3700 }
3701 }
3702 }
3703 if (g_conf->mon_warn_on_crush_straw_calc_version_zero) {
3704 if (osdmap.crush->get_straw_calc_version() == 0) {
3705 ostringstream ss;
3706 ss << "crush map has straw_calc_version=0";
3707 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3708 if (detail) {
3709 ss << "; see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables";
3710 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3711 }
3712 }
3713 }
3714
3715 // hit_set-less cache_mode?
3716 if (g_conf->mon_warn_on_cache_pools_without_hit_sets) {
3717 int problem_cache_pools = 0;
3718 for (map<int64_t, pg_pool_t>::const_iterator p = osdmap.pools.begin();
3719 p != osdmap.pools.end();
3720 ++p) {
3721 const pg_pool_t& info = p->second;
3722 if (info.cache_mode_requires_hit_set() &&
3723 info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
3724 ++problem_cache_pools;
3725 if (detail) {
3726 ostringstream ss;
3727 ss << "pool '" << osdmap.get_pool_name(p->first)
3728 << "' with cache_mode " << info.get_cache_mode_name()
3729 << " needs hit_set_type to be set but it is not";
3730 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3731 }
3732 }
3733 }
3734 if (problem_cache_pools) {
3735 ostringstream ss;
3736 ss << problem_cache_pools << " cache pools are missing hit_sets";
3737 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3738 }
3739 }
3740
31f18b77
FG
3741 if (osdmap.crush->has_multirule_rulesets()) {
3742 ostringstream ss;
3743 ss << "CRUSH map contains multirule rulesets";
3744 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3745 if (detail) {
3746 ss << "; please manually fix the map";
3747 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3748 }
3749 }
3750
7c673cae
FG
3751 // Not using 'sortbitwise' and should be?
3752 if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE) &&
3753 (osdmap.get_up_osd_features() &
3754 CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)) {
3755 ostringstream ss;
3756 ss << "no legacy OSD present but 'sortbitwise' flag is not set";
3757 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3758 }
3759
3760 // Warn if 'mon_osd_down_out_interval' is set to zero.
3761 // Having this option set to zero on the leader acts much like the
3762 // 'noout' flag. It's hard to figure out what's going wrong with clusters
3763 // without the 'noout' flag set but acting like that just the same, so
3764 // we report a HEALTH_WARN in case this option is set to zero.
3765 // This is an ugly hack to get the warning out, but until we find a way
3766 // to spread global options throughout the mon cluster and have all mons
3767 // using a base set of the same options, we need to work around this sort
3768 // of things.
3769 // There's also the obvious drawback that if this is set on a single
3770 // monitor on a 3-monitor cluster, this warning will only be shown every
3771 // third monitor connection.
3772 if (g_conf->mon_warn_on_osd_down_out_interval_zero &&
3773 g_conf->mon_osd_down_out_interval == 0) {
3774 ostringstream ss;
3775 ss << "mon." << mon->name << " has mon_osd_down_out_interval set to 0";
3776 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3777 if (detail) {
3778 ss << "; this has the same effect as the 'noout' flag";
3779 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3780 }
3781 }
3782
3783 // warn about upgrade flags that can be set but are not.
3784 if (g_conf->mon_debug_no_require_luminous) {
3785 // ignore these checks
3786 } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_LUMINOUS) &&
31f18b77
FG
3787 osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
3788 string msg = "all OSDs are running luminous or later but"
3789 " require_osd_release < luminous";
7c673cae
FG
3790 summary.push_back(make_pair(HEALTH_WARN, msg));
3791 if (detail) {
3792 detail->push_back(make_pair(HEALTH_WARN, msg));
3793 }
3794 } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_KRAKEN) &&
31f18b77
FG
3795 osdmap.require_osd_release < CEPH_RELEASE_KRAKEN) {
3796 string msg = "all OSDs are running kraken or later but"
3797 " require_osd_release < kraken";
7c673cae
FG
3798 summary.push_back(make_pair(HEALTH_WARN, msg));
3799 if (detail) {
3800 detail->push_back(make_pair(HEALTH_WARN, msg));
3801 }
3802 } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_JEWEL) &&
31f18b77
FG
3803 osdmap.require_osd_release < CEPH_RELEASE_JEWEL) {
3804 string msg = "all OSDs are running jewel or later but"
3805 " require_osd_release < jewel";
7c673cae
FG
3806 summary.push_back(make_pair(HEALTH_WARN, msg));
3807 if (detail) {
3808 detail->push_back(make_pair(HEALTH_WARN, msg));
3809 }
3810 }
3811
224ce89b
WB
3812 for (auto it : osdmap.get_pools()) {
3813 const pg_pool_t &pool = it.second;
3814 if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
3815 const string& pool_name = osdmap.get_pool_name(it.first);
3816 stringstream ss;
3817 ss << "pool '" << pool_name << "' is full";
3818 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
3819 if (detail)
3820 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
3821 }
3822 }
7c673cae
FG
3823 }
3824}
3825
3826void OSDMonitor::dump_info(Formatter *f)
3827{
3828 f->open_object_section("osdmap");
3829 osdmap.dump(f);
3830 f->close_section();
3831
3832 f->open_array_section("osd_metadata");
3833 for (int i=0; i<osdmap.get_max_osd(); ++i) {
3834 if (osdmap.exists(i)) {
3835 f->open_object_section("osd");
3836 f->dump_unsigned("id", i);
3837 dump_osd_metadata(i, f, NULL);
3838 f->close_section();
3839 }
3840 }
3841 f->close_section();
3842
3843 f->dump_unsigned("osdmap_first_committed", get_first_committed());
3844 f->dump_unsigned("osdmap_last_committed", get_last_committed());
3845
3846 f->open_object_section("crushmap");
3847 osdmap.crush->dump(f);
3848 f->close_section();
3849}
3850
namespace {
  // Keys understood by "ceph osd pool get <pool> <var>".  The enumerator
  // order must stay stable: the command handler maps option-name strings
  // to these values and iterates over them when "all" is requested.
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE, CRASH_REPLAY_INTERVAL,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, AUID, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK };

  // Return a copy of 'first' with every element of 'second' removed,
  // i.e. the set difference first \ second.  Neither input is modified.
  std::set<osd_pool_get_choices>
  subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
                             const std::set<osd_pool_get_choices>& second)
  {
    std::set<osd_pool_get_choices> remaining(first);
    for (const auto& choice : second) {
      remaining.erase(choice);
    }
    return remaining;
  }
}
3882
3883
3884bool OSDMonitor::preprocess_command(MonOpRequestRef op)
3885{
3886 op->mark_osdmon_event(__func__);
3887 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
3888 int r = 0;
3889 bufferlist rdata;
3890 stringstream ss, ds;
3891
3892 map<string, cmd_vartype> cmdmap;
3893 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
3894 string rs = ss.str();
3895 mon->reply_command(op, -EINVAL, rs, get_last_committed());
3896 return true;
3897 }
3898
3899 MonSession *session = m->get_session();
3900 if (!session) {
3901 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
3902 return true;
3903 }
3904
3905 string prefix;
3906 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
3907
3908 string format;
3909 cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
3910 boost::scoped_ptr<Formatter> f(Formatter::create(format));
3911
3912 if (prefix == "osd stat") {
224ce89b 3913 osdmap.print_summary(f.get(), ds, "");
7c673cae
FG
3914 if (f)
3915 f->flush(rdata);
3916 else
3917 rdata.append(ds);
3918 }
3919 else if (prefix == "osd perf" ||
3920 prefix == "osd blocked-by") {
31f18b77
FG
3921 r = mon->pgservice->process_pg_command(prefix, cmdmap,
3922 osdmap, f.get(), &ss, &rdata);
7c673cae
FG
3923 }
3924 else if (prefix == "osd dump" ||
3925 prefix == "osd tree" ||
3926 prefix == "osd ls" ||
3927 prefix == "osd getmap" ||
31f18b77
FG
3928 prefix == "osd getcrushmap" ||
3929 prefix == "osd ls-tree") {
7c673cae
FG
3930 string val;
3931
3932 epoch_t epoch = 0;
3933 int64_t epochnum;
3934 cmd_getval(g_ceph_context, cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
3935 epoch = epochnum;
3936
3937 bufferlist osdmap_bl;
3938 int err = get_version_full(epoch, osdmap_bl);
3939 if (err == -ENOENT) {
3940 r = -ENOENT;
3941 ss << "there is no map for epoch " << epoch;
3942 goto reply;
3943 }
3944 assert(err == 0);
3945 assert(osdmap_bl.length());
3946
3947 OSDMap *p;
3948 if (epoch == osdmap.get_epoch()) {
3949 p = &osdmap;
3950 } else {
3951 p = new OSDMap;
3952 p->decode(osdmap_bl);
3953 }
3954
224ce89b
WB
3955 auto sg = make_scope_guard([&] {
3956 if (p != &osdmap) {
3957 delete p;
3958 }
3959 });
3960
7c673cae
FG
3961 if (prefix == "osd dump") {
3962 stringstream ds;
3963 if (f) {
3964 f->open_object_section("osdmap");
3965 p->dump(f.get());
3966 f->close_section();
3967 f->flush(ds);
3968 } else {
3969 p->print(ds);
3970 }
3971 rdata.append(ds);
3972 if (!f)
3973 ds << " ";
3974 } else if (prefix == "osd ls") {
3975 if (f) {
3976 f->open_array_section("osds");
3977 for (int i = 0; i < osdmap.get_max_osd(); i++) {
3978 if (osdmap.exists(i)) {
3979 f->dump_int("osd", i);
3980 }
3981 }
3982 f->close_section();
3983 f->flush(ds);
3984 } else {
3985 bool first = true;
3986 for (int i = 0; i < osdmap.get_max_osd(); i++) {
3987 if (osdmap.exists(i)) {
3988 if (!first)
3989 ds << "\n";
3990 first = false;
3991 ds << i;
3992 }
3993 }
3994 }
3995 rdata.append(ds);
3996 } else if (prefix == "osd tree") {
31f18b77
FG
3997 vector<string> states;
3998 cmd_getval(g_ceph_context, cmdmap, "states", states);
3999 unsigned filter = 0;
4000 for (auto& s : states) {
4001 if (s == "up") {
4002 filter |= OSDMap::DUMP_UP;
4003 } else if (s == "down") {
4004 filter |= OSDMap::DUMP_DOWN;
4005 } else if (s == "in") {
4006 filter |= OSDMap::DUMP_IN;
4007 } else if (s == "out") {
4008 filter |= OSDMap::DUMP_OUT;
c07f9fc5
FG
4009 } else if (s == "destroyed") {
4010 filter |= OSDMap::DUMP_DESTROYED;
31f18b77
FG
4011 } else {
4012 ss << "unrecognized state '" << s << "'";
4013 r = -EINVAL;
4014 goto reply;
4015 }
4016 }
4017 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
c07f9fc5
FG
4018 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
4019 ss << "cannot specify both 'in' and 'out'";
4020 r = -EINVAL;
4021 goto reply;
4022 }
4023 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
4024 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
4025 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
4026 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
4027 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
4028 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
4029 ss << "can specify only one of 'up', 'down' and 'destroyed'";
31f18b77
FG
4030 r = -EINVAL;
4031 goto reply;
4032 }
7c673cae
FG
4033 if (f) {
4034 f->open_object_section("tree");
31f18b77 4035 p->print_tree(f.get(), NULL, filter);
7c673cae
FG
4036 f->close_section();
4037 f->flush(ds);
4038 } else {
31f18b77 4039 p->print_tree(NULL, &ds, filter);
7c673cae
FG
4040 }
4041 rdata.append(ds);
4042 } else if (prefix == "osd getmap") {
4043 rdata.append(osdmap_bl);
4044 ss << "got osdmap epoch " << p->get_epoch();
4045 } else if (prefix == "osd getcrushmap") {
4046 p->crush->encode(rdata, mon->get_quorum_con_features());
31f18b77
FG
4047 ss << p->get_crush_version();
4048 } else if (prefix == "osd ls-tree") {
4049 string bucket_name;
4050 cmd_getval(g_ceph_context, cmdmap, "name", bucket_name);
4051 set<int> osds;
4052 r = p->get_osds_by_bucket_name(bucket_name, &osds);
4053 if (r == -ENOENT) {
4054 ss << "\"" << bucket_name << "\" does not exist";
4055 goto reply;
4056 } else if (r < 0) {
4057 ss << "can not parse bucket name:\"" << bucket_name << "\"";
4058 goto reply;
4059 }
4060
4061 if (f) {
4062 f->open_array_section("osds");
4063 for (auto &i : osds) {
4064 if (osdmap.exists(i)) {
4065 f->dump_int("osd", i);
4066 }
4067 }
4068 f->close_section();
4069 f->flush(ds);
4070 } else {
4071 bool first = true;
4072 for (auto &i : osds) {
4073 if (osdmap.exists(i)) {
4074 if (!first)
4075 ds << "\n";
4076 first = false;
4077 ds << i;
4078 }
4079 }
4080 }
4081
4082 rdata.append(ds);
7c673cae 4083 }
7c673cae
FG
4084 } else if (prefix == "osd df") {
4085 string method;
4086 cmd_getval(g_ceph_context, cmdmap, "output_method", method);
31f18b77
FG
4087 print_osd_utilization(osdmap, mon->pgservice, ds,
4088 f.get(), method == "tree");
7c673cae
FG
4089 rdata.append(ds);
4090 } else if (prefix == "osd getmaxosd") {
4091 if (f) {
4092 f->open_object_section("getmaxosd");
4093 f->dump_unsigned("epoch", osdmap.get_epoch());
4094 f->dump_int("max_osd", osdmap.get_max_osd());
4095 f->close_section();
4096 f->flush(rdata);
4097 } else {
4098 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
4099 rdata.append(ds);
4100 }
4101 } else if (prefix == "osd utilization") {
4102 string out;
4103 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
4104 if (f)
4105 f->flush(rdata);
4106 else
4107 rdata.append(out);
4108 r = 0;
4109 goto reply;
4110 } else if (prefix == "osd find") {
4111 int64_t osd;
4112 if (!cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
4113 ss << "unable to parse osd id value '"
4114 << cmd_vartype_stringify(cmdmap["id"]) << "'";
4115 r = -EINVAL;
4116 goto reply;
4117 }
4118 if (!osdmap.exists(osd)) {
4119 ss << "osd." << osd << " does not exist";
4120 r = -ENOENT;
4121 goto reply;
4122 }
4123 string format;
4124 cmd_getval(g_ceph_context, cmdmap, "format", format);
4125 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4126 f->open_object_section("osd_location");
4127 f->dump_int("osd", osd);
4128 f->dump_stream("ip") << osdmap.get_addr(osd);
4129 f->open_object_section("crush_location");
4130 map<string,string> loc = osdmap.crush->get_full_location(osd);
4131 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
4132 f->dump_string(p->first.c_str(), p->second);
4133 f->close_section();
4134 f->close_section();
4135 f->flush(rdata);
4136 } else if (prefix == "osd metadata") {
4137 int64_t osd = -1;
4138 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
4139 !cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
4140 ss << "unable to parse osd id value '"
4141 << cmd_vartype_stringify(cmdmap["id"]) << "'";
4142 r = -EINVAL;
4143 goto reply;
4144 }
4145 if (osd >= 0 && !osdmap.exists(osd)) {
4146 ss << "osd." << osd << " does not exist";
4147 r = -ENOENT;
4148 goto reply;
4149 }
4150 string format;
4151 cmd_getval(g_ceph_context, cmdmap, "format", format);
4152 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4153 if (osd >= 0) {
4154 f->open_object_section("osd_metadata");
4155 f->dump_unsigned("id", osd);
4156 r = dump_osd_metadata(osd, f.get(), &ss);
4157 if (r < 0)
4158 goto reply;
4159 f->close_section();
4160 } else {
4161 r = 0;
4162 f->open_array_section("osd_metadata");
4163 for (int i=0; i<osdmap.get_max_osd(); ++i) {
4164 if (osdmap.exists(i)) {
4165 f->open_object_section("osd");
4166 f->dump_unsigned("id", i);
4167 r = dump_osd_metadata(i, f.get(), NULL);
4168 if (r == -EINVAL || r == -ENOENT) {
4169 // Drop error, continue to get other daemons' metadata
4170 dout(4) << "No metadata for osd." << i << dendl;
4171 r = 0;
4172 } else if (r < 0) {
4173 // Unexpected error
4174 goto reply;
4175 }
4176 f->close_section();
4177 }
4178 }
4179 f->close_section();
4180 }
4181 f->flush(rdata);
31f18b77
FG
4182 } else if (prefix == "osd versions") {
4183 if (!f)
4184 f.reset(Formatter::create("json-pretty"));
4185 count_metadata("ceph_version", f.get());
4186 f->flush(rdata);
4187 r = 0;
4188 } else if (prefix == "osd count-metadata") {
4189 if (!f)
4190 f.reset(Formatter::create("json-pretty"));
4191 string field;
4192 cmd_getval(g_ceph_context, cmdmap, "property", field);
4193 count_metadata(field, f.get());
4194 f->flush(rdata);
4195 r = 0;
7c673cae
FG
4196 } else if (prefix == "osd map") {
4197 string poolstr, objstr, namespacestr;
4198 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
4199 cmd_getval(g_ceph_context, cmdmap, "object", objstr);
4200 cmd_getval(g_ceph_context, cmdmap, "nspace", namespacestr);
4201
4202 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
4203 if (pool < 0) {
4204 ss << "pool " << poolstr << " does not exist";
4205 r = -ENOENT;
4206 goto reply;
4207 }
4208 object_locator_t oloc(pool, namespacestr);
4209 object_t oid(objstr);
4210 pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
4211 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
4212 vector<int> up, acting;
4213 int up_p, acting_p;
4214 osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
4215
4216 string fullobjname;
4217 if (!namespacestr.empty())
4218 fullobjname = namespacestr + string("/") + oid.name;
4219 else
4220 fullobjname = oid.name;
4221 if (f) {
4222 f->open_object_section("osd_map");
4223 f->dump_unsigned("epoch", osdmap.get_epoch());
4224 f->dump_string("pool", poolstr);
4225 f->dump_int("pool_id", pool);
4226 f->dump_stream("objname") << fullobjname;
4227 f->dump_stream("raw_pgid") << pgid;
4228 f->dump_stream("pgid") << mpgid;
4229 f->open_array_section("up");
4230 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
4231 f->dump_int("osd", *p);
4232 f->close_section();
4233 f->dump_int("up_primary", up_p);
4234 f->open_array_section("acting");
4235 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
4236 f->dump_int("osd", *p);
4237 f->close_section();
4238 f->dump_int("acting_primary", acting_p);
4239 f->close_section(); // osd_map
4240 f->flush(rdata);
4241 } else {
4242 ds << "osdmap e" << osdmap.get_epoch()
4243 << " pool '" << poolstr << "' (" << pool << ")"
4244 << " object '" << fullobjname << "' ->"
4245 << " pg " << pgid << " (" << mpgid << ")"
4246 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
4247 << pg_vector_string(acting) << ", p" << acting_p << ")";
4248 rdata.append(ds);
4249 }
4250
4251 } else if (prefix == "pg map") {
4252 pg_t pgid;
4253 string pgidstr;
4254 cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr);
4255 if (!pgid.parse(pgidstr.c_str())) {
4256 ss << "invalid pgid '" << pgidstr << "'";
4257 r = -EINVAL;
4258 goto reply;
4259 }
4260 vector<int> up, acting;
4261 if (!osdmap.have_pg_pool(pgid.pool())) {
4262 ss << "pg '" << pgidstr << "' does not exist";
4263 r = -ENOENT;
4264 goto reply;
4265 }
4266 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
4267 osdmap.pg_to_up_acting_osds(pgid, up, acting);
4268 if (f) {
4269 f->open_object_section("pg_map");
4270 f->dump_unsigned("epoch", osdmap.get_epoch());
4271 f->dump_stream("raw_pgid") << pgid;
4272 f->dump_stream("pgid") << mpgid;
4273 f->open_array_section("up");
4274 for (auto osd : up) {
4275 f->dump_int("up_osd", osd);
4276 }
4277 f->close_section();
4278 f->open_array_section("acting");
4279 for (auto osd : acting) {
4280 f->dump_int("acting_osd", osd);
4281 }
4282 f->close_section();
4283 f->close_section();
4284 f->flush(rdata);
4285 } else {
4286 ds << "osdmap e" << osdmap.get_epoch()
4287 << " pg " << pgid << " (" << mpgid << ")"
4288 << " -> up " << up << " acting " << acting;
4289 rdata.append(ds);
4290 }
4291 goto reply;
4292
224ce89b
WB
4293 } else if (prefix == "osd scrub" ||
4294 prefix == "osd deep-scrub" ||
4295 prefix == "osd repair") {
7c673cae
FG
4296 string whostr;
4297 cmd_getval(g_ceph_context, cmdmap, "who", whostr);
4298 vector<string> pvec;
4299 get_str_vec(prefix, pvec);
4300
224ce89b 4301 if (whostr == "*" || whostr == "all" || whostr == "any") {
7c673cae
FG
4302 ss << "osds ";
4303 int c = 0;
4304 for (int i = 0; i < osdmap.get_max_osd(); i++)
4305 if (osdmap.is_up(i)) {
4306 ss << (c++ ? "," : "") << i;
4307 mon->try_send_message(new MOSDScrub(osdmap.get_fsid(),
4308 pvec.back() == "repair",
4309 pvec.back() == "deep-scrub"),
4310 osdmap.get_inst(i));
4311 }
4312 r = 0;
4313 ss << " instructed to " << pvec.back();
4314 } else {
4315 long osd = parse_osd_id(whostr.c_str(), &ss);
4316 if (osd < 0) {
4317 r = -EINVAL;
4318 } else if (osdmap.is_up(osd)) {
4319 mon->try_send_message(new MOSDScrub(osdmap.get_fsid(),
4320 pvec.back() == "repair",
4321 pvec.back() == "deep-scrub"),
4322 osdmap.get_inst(osd));
4323 ss << "osd." << osd << " instructed to " << pvec.back();
4324 } else {
4325 ss << "osd." << osd << " is not up";
4326 r = -EAGAIN;
4327 }
4328 }
4329 } else if (prefix == "osd lspools") {
4330 int64_t auid;
4331 cmd_getval(g_ceph_context, cmdmap, "auid", auid, int64_t(0));
4332 if (f)
4333 f->open_array_section("pools");
4334 for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
4335 p != osdmap.pools.end();
4336 ++p) {
4337 if (!auid || p->second.auid == (uint64_t)auid) {
4338 if (f) {
4339 f->open_object_section("pool");
4340 f->dump_int("poolnum", p->first);
4341 f->dump_string("poolname", osdmap.pool_name[p->first]);
4342 f->close_section();
4343 } else {
4344 ds << p->first << ' ' << osdmap.pool_name[p->first] << ',';
4345 }
4346 }
4347 }
4348 if (f) {
4349 f->close_section();
4350 f->flush(ds);
4351 }
4352 rdata.append(ds);
4353 } else if (prefix == "osd blacklist ls") {
4354 if (f)
4355 f->open_array_section("blacklist");
4356
4357 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
4358 p != osdmap.blacklist.end();
4359 ++p) {
4360 if (f) {
4361 f->open_object_section("entry");
4362 f->dump_stream("addr") << p->first;
4363 f->dump_stream("until") << p->second;
4364 f->close_section();
4365 } else {
4366 stringstream ss;
4367 string s;
4368 ss << p->first << " " << p->second;
4369 getline(ss, s);
4370 s += "\n";
4371 rdata.append(s);
4372 }
4373 }
4374 if (f) {
4375 f->close_section();
4376 f->flush(rdata);
4377 }
4378 ss << "listed " << osdmap.blacklist.size() << " entries";
4379
4380 } else if (prefix == "osd pool ls") {
4381 string detail;
4382 cmd_getval(g_ceph_context, cmdmap, "detail", detail);
4383 if (!f && detail == "detail") {
4384 ostringstream ss;
4385 osdmap.print_pools(ss);
4386 rdata.append(ss.str());
4387 } else {
4388 if (f)
4389 f->open_array_section("pools");
4390 for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
4391 it != osdmap.get_pools().end();
4392 ++it) {
4393 if (f) {
4394 if (detail == "detail") {
4395 f->open_object_section("pool");
4396 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
4397 it->second.dump(f.get());
4398 f->close_section();
4399 } else {
4400 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
4401 }
4402 } else {
4403 rdata.append(osdmap.get_pool_name(it->first) + "\n");
4404 }
4405 }
4406 if (f) {
4407 f->close_section();
4408 f->flush(rdata);
4409 }
4410 }
4411
4412 } else if (prefix == "osd crush get-tunable") {
4413 string tunable;
4414 cmd_getval(g_ceph_context, cmdmap, "tunable", tunable);
4415 ostringstream rss;
4416 if (f)
4417 f->open_object_section("tunable");
4418 if (tunable == "straw_calc_version") {
4419 if (f)
4420 f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
4421 else
4422 rss << osdmap.crush->get_straw_calc_version() << "\n";
4423 } else {
4424 r = -EINVAL;
4425 goto reply;
4426 }
4427 if (f) {
4428 f->close_section();
4429 f->flush(rdata);
4430 } else {
4431 rdata.append(rss.str());
4432 }
4433 r = 0;
4434
4435 } else if (prefix == "osd pool get") {
4436 string poolstr;
4437 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
4438 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
4439 if (pool < 0) {
4440 ss << "unrecognized pool '" << poolstr << "'";
4441 r = -ENOENT;
4442 goto reply;
4443 }
4444
4445 const pg_pool_t *p = osdmap.get_pg_pool(pool);
4446 string var;
4447 cmd_getval(g_ceph_context, cmdmap, "var", var);
4448
4449 typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
4450 const choices_map_t ALL_CHOICES = {
4451 {"size", SIZE},
4452 {"min_size", MIN_SIZE},
4453 {"crash_replay_interval", CRASH_REPLAY_INTERVAL},
4454 {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
4455 {"crush_rule", CRUSH_RULE},
7c673cae
FG
4456 {"hashpspool", HASHPSPOOL}, {"nodelete", NODELETE},
4457 {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
4458 {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
4459 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
4460 {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
4461 {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
4462 {"use_gmt_hitset", USE_GMT_HITSET},
4463 {"auid", AUID}, {"target_max_objects", TARGET_MAX_OBJECTS},
4464 {"target_max_bytes", TARGET_MAX_BYTES},
4465 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
4466 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
4467 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
4468 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
4469 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
4470 {"erasure_code_profile", ERASURE_CODE_PROFILE},
4471 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
4472 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
4473 {"fast_read", FAST_READ},
4474 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
4475 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
4476 {"scrub_min_interval", SCRUB_MIN_INTERVAL},
4477 {"scrub_max_interval", SCRUB_MAX_INTERVAL},
4478 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
4479 {"recovery_priority", RECOVERY_PRIORITY},
4480 {"recovery_op_priority", RECOVERY_OP_PRIORITY},
4481 {"scrub_priority", SCRUB_PRIORITY},
4482 {"compression_mode", COMPRESSION_MODE},
4483 {"compression_algorithm", COMPRESSION_ALGORITHM},
4484 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
4485 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
4486 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
4487 {"csum_type", CSUM_TYPE},
4488 {"csum_max_block", CSUM_MAX_BLOCK},
4489 {"csum_min_block", CSUM_MIN_BLOCK},
4490 };
4491
4492 typedef std::set<osd_pool_get_choices> choices_set_t;
4493
4494 const choices_set_t ONLY_TIER_CHOICES = {
4495 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
4496 TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
4497 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
4498 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
4499 MIN_READ_RECENCY_FOR_PROMOTE,
c07f9fc5 4500 MIN_WRITE_RECENCY_FOR_PROMOTE,
7c673cae
FG
4501 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
4502 };
4503 const choices_set_t ONLY_ERASURE_CHOICES = {
4504 ERASURE_CODE_PROFILE
4505 };
4506
4507 choices_set_t selected_choices;
4508 if (var == "all") {
4509 for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
4510 it != ALL_CHOICES.end(); ++it) {
4511 selected_choices.insert(it->second);
4512 }
4513
4514 if(!p->is_tier()) {
4515 selected_choices = subtract_second_from_first(selected_choices,
4516 ONLY_TIER_CHOICES);
4517 }
4518
4519 if(!p->is_erasure()) {
4520 selected_choices = subtract_second_from_first(selected_choices,
4521 ONLY_ERASURE_CHOICES);
4522 }
4523 } else /* var != "all" */ {
4524 choices_map_t::const_iterator found = ALL_CHOICES.find(var);
4525 osd_pool_get_choices selected = found->second;
4526
4527 if (!p->is_tier() &&
4528 ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
4529 ss << "pool '" << poolstr
4530 << "' is not a tier pool: variable not applicable";
4531 r = -EACCES;
4532 goto reply;
4533 }
4534
4535 if (!p->is_erasure() &&
4536 ONLY_ERASURE_CHOICES.find(selected)
4537 != ONLY_ERASURE_CHOICES.end()) {
4538 ss << "pool '" << poolstr
4539 << "' is not a erasure pool: variable not applicable";
4540 r = -EACCES;
4541 goto reply;
4542 }
4543
4544 selected_choices.insert(selected);
4545 }
4546
4547 if (f) {
4548 for(choices_set_t::const_iterator it = selected_choices.begin();
4549 it != selected_choices.end(); ++it) {
4550 choices_map_t::const_iterator i;
c07f9fc5
FG
4551 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
4552 if (i->second == *it) {
4553 break;
4554 }
4555 }
4556 assert(i != ALL_CHOICES.end());
4557 bool pool_opt = pool_opts_t::is_opt_name(i->first);
4558 if (!pool_opt) {
4559 f->open_object_section("pool");
4560 f->dump_string("pool", poolstr);
4561 f->dump_int("pool_id", pool);
4562 }
7c673cae
FG
4563 switch(*it) {
4564 case PG_NUM:
4565 f->dump_int("pg_num", p->get_pg_num());
4566 break;
4567 case PGP_NUM:
4568 f->dump_int("pgp_num", p->get_pgp_num());
4569 break;
4570 case AUID:
4571 f->dump_int("auid", p->get_auid());
4572 break;
4573 case SIZE:
4574 f->dump_int("size", p->get_size());
4575 break;
4576 case MIN_SIZE:
4577 f->dump_int("min_size", p->get_min_size());
4578 break;
4579 case CRASH_REPLAY_INTERVAL:
4580 f->dump_int("crash_replay_interval",
4581 p->get_crash_replay_interval());
4582 break;
4583 case CRUSH_RULE:
31f18b77 4584 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
7c673cae 4585 f->dump_string("crush_rule", osdmap.crush->get_rule_name(
31f18b77 4586 p->get_crush_rule()));
7c673cae 4587 } else {
31f18b77 4588 f->dump_string("crush_rule", stringify(p->get_crush_rule()));
7c673cae
FG
4589 }
4590 break;
7c673cae
FG
4591 case HASHPSPOOL:
4592 case NODELETE:
4593 case NOPGCHANGE:
4594 case NOSIZECHANGE:
4595 case WRITE_FADVISE_DONTNEED:
4596 case NOSCRUB:
4597 case NODEEP_SCRUB:
7c673cae
FG
4598 f->dump_string(i->first.c_str(),
4599 p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
4600 "true" : "false");
4601 break;
4602 case HIT_SET_PERIOD:
4603 f->dump_int("hit_set_period", p->hit_set_period);
4604 break;
4605 case HIT_SET_COUNT:
4606 f->dump_int("hit_set_count", p->hit_set_count);
4607 break;
4608 case HIT_SET_TYPE:
4609 f->dump_string("hit_set_type",
4610 HitSet::get_type_name(p->hit_set_params.get_type()));
4611 break;
4612 case HIT_SET_FPP:
4613 {
4614 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
4615 BloomHitSet::Params *bloomp =
4616 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
4617 f->dump_float("hit_set_fpp", bloomp->get_fpp());
4618 } else if(var != "all") {
4619 f->close_section();
4620 ss << "hit set is not of type Bloom; " <<
4621 "invalid to get a false positive rate!";
4622 r = -EINVAL;
4623 goto reply;
4624 }
4625 }
4626 break;
4627 case USE_GMT_HITSET:
4628 f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
4629 break;
4630 case TARGET_MAX_OBJECTS:
4631 f->dump_unsigned("target_max_objects", p->target_max_objects);
4632 break;
4633 case TARGET_MAX_BYTES:
4634 f->dump_unsigned("target_max_bytes", p->target_max_bytes);
4635 break;
4636 case CACHE_TARGET_DIRTY_RATIO:
4637 f->dump_unsigned("cache_target_dirty_ratio_micro",
4638 p->cache_target_dirty_ratio_micro);
4639 f->dump_float("cache_target_dirty_ratio",
4640 ((float)p->cache_target_dirty_ratio_micro/1000000));
4641 break;
4642 case CACHE_TARGET_DIRTY_HIGH_RATIO:
4643 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
4644 p->cache_target_dirty_high_ratio_micro);
4645 f->dump_float("cache_target_dirty_high_ratio",
4646 ((float)p->cache_target_dirty_high_ratio_micro/1000000));
4647 break;
4648 case CACHE_TARGET_FULL_RATIO:
4649 f->dump_unsigned("cache_target_full_ratio_micro",
4650 p->cache_target_full_ratio_micro);
4651 f->dump_float("cache_target_full_ratio",
4652 ((float)p->cache_target_full_ratio_micro/1000000));
4653 break;
4654 case CACHE_MIN_FLUSH_AGE:
4655 f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
4656 break;
4657 case CACHE_MIN_EVICT_AGE:
4658 f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
4659 break;
4660 case ERASURE_CODE_PROFILE:
4661 f->dump_string("erasure_code_profile", p->erasure_code_profile);
4662 break;
4663 case MIN_READ_RECENCY_FOR_PROMOTE:
4664 f->dump_int("min_read_recency_for_promote",
4665 p->min_read_recency_for_promote);
4666 break;
4667 case MIN_WRITE_RECENCY_FOR_PROMOTE:
4668 f->dump_int("min_write_recency_for_promote",
4669 p->min_write_recency_for_promote);
4670 break;
4671 case FAST_READ:
4672 f->dump_int("fast_read", p->fast_read);
4673 break;
4674 case HIT_SET_GRADE_DECAY_RATE:
4675 f->dump_int("hit_set_grade_decay_rate",
4676 p->hit_set_grade_decay_rate);
4677 break;
4678 case HIT_SET_SEARCH_LAST_N:
4679 f->dump_int("hit_set_search_last_n",
4680 p->hit_set_search_last_n);
4681 break;
4682 case SCRUB_MIN_INTERVAL:
4683 case SCRUB_MAX_INTERVAL:
4684 case DEEP_SCRUB_INTERVAL:
4685 case RECOVERY_PRIORITY:
4686 case RECOVERY_OP_PRIORITY:
4687 case SCRUB_PRIORITY:
4688 case COMPRESSION_MODE:
4689 case COMPRESSION_ALGORITHM:
4690 case COMPRESSION_REQUIRED_RATIO:
4691 case COMPRESSION_MAX_BLOB_SIZE:
4692 case COMPRESSION_MIN_BLOB_SIZE:
4693 case CSUM_TYPE:
4694 case CSUM_MAX_BLOCK:
4695 case CSUM_MIN_BLOCK:
c07f9fc5
FG
4696 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
4697 if (p->opts.is_set(key)) {
4698 f->open_object_section("pool");
4699 f->dump_string("pool", poolstr);
4700 f->dump_int("pool_id", pool);
4701 if(*it == CSUM_TYPE) {
4702 int val;
4703 p->opts.get(pool_opts_t::CSUM_TYPE, &val);
4704 f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
4705 } else {
4706 p->opts.dump(i->first, f.get());
4707 }
4708 f->close_section();
4709 f->flush(rdata);
7c673cae
FG
4710 }
4711 break;
4712 }
c07f9fc5
FG
4713 if (!pool_opt) {
4714 f->close_section();
4715 f->flush(rdata);
4716 }
7c673cae
FG
4717 }
4718
4719 } else /* !f */ {
4720 for(choices_set_t::const_iterator it = selected_choices.begin();
4721 it != selected_choices.end(); ++it) {
4722 choices_map_t::const_iterator i;
4723 switch(*it) {
4724 case PG_NUM:
4725 ss << "pg_num: " << p->get_pg_num() << "\n";
4726 break;
4727 case PGP_NUM:
4728 ss << "pgp_num: " << p->get_pgp_num() << "\n";
4729 break;
4730 case AUID:
4731 ss << "auid: " << p->get_auid() << "\n";
4732 break;
4733 case SIZE:
4734 ss << "size: " << p->get_size() << "\n";
4735 break;
4736 case MIN_SIZE:
4737 ss << "min_size: " << p->get_min_size() << "\n";
4738 break;
4739 case CRASH_REPLAY_INTERVAL:
4740 ss << "crash_replay_interval: " <<
4741 p->get_crash_replay_interval() << "\n";
4742 break;
4743 case CRUSH_RULE:
31f18b77 4744 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
7c673cae 4745 ss << "crush_rule: " << osdmap.crush->get_rule_name(
31f18b77 4746 p->get_crush_rule()) << "\n";
7c673cae 4747 } else {
31f18b77 4748 ss << "crush_rule: " << p->get_crush_rule() << "\n";
7c673cae
FG
4749 }
4750 break;
7c673cae
FG
4751 case HIT_SET_PERIOD:
4752 ss << "hit_set_period: " << p->hit_set_period << "\n";
4753 break;
4754 case HIT_SET_COUNT:
4755 ss << "hit_set_count: " << p->hit_set_count << "\n";
4756 break;
4757 case HIT_SET_TYPE:
4758 ss << "hit_set_type: " <<
4759 HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
4760 break;
4761 case HIT_SET_FPP:
4762 {
4763 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
4764 BloomHitSet::Params *bloomp =
4765 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
4766 ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
4767 } else if(var != "all") {
4768 ss << "hit set is not of type Bloom; " <<
4769 "invalid to get a false positive rate!";
4770 r = -EINVAL;
4771 goto reply;
4772 }
4773 }
4774 break;
4775 case USE_GMT_HITSET:
4776 ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
4777 break;
4778 case TARGET_MAX_OBJECTS:
4779 ss << "target_max_objects: " << p->target_max_objects << "\n";
4780 break;
4781 case TARGET_MAX_BYTES:
4782 ss << "target_max_bytes: " << p->target_max_bytes << "\n";
4783 break;
4784 case CACHE_TARGET_DIRTY_RATIO:
4785 ss << "cache_target_dirty_ratio: "
4786 << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
4787 break;
4788 case CACHE_TARGET_DIRTY_HIGH_RATIO:
4789 ss << "cache_target_dirty_high_ratio: "
4790 << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
4791 break;
4792 case CACHE_TARGET_FULL_RATIO:
4793 ss << "cache_target_full_ratio: "
4794 << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
4795 break;
4796 case CACHE_MIN_FLUSH_AGE:
4797 ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
4798 break;
4799 case CACHE_MIN_EVICT_AGE:
4800 ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
4801 break;
4802 case ERASURE_CODE_PROFILE:
4803 ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
4804 break;
4805 case MIN_READ_RECENCY_FOR_PROMOTE:
4806 ss << "min_read_recency_for_promote: " <<
4807 p->min_read_recency_for_promote << "\n";
4808 break;
4809 case HIT_SET_GRADE_DECAY_RATE:
4810 ss << "hit_set_grade_decay_rate: " <<
4811 p->hit_set_grade_decay_rate << "\n";
4812 break;
4813 case HIT_SET_SEARCH_LAST_N:
4814 ss << "hit_set_search_last_n: " <<
4815 p->hit_set_search_last_n << "\n";
4816 break;
4817 case HASHPSPOOL:
4818 case NODELETE:
4819 case NOPGCHANGE:
4820 case NOSIZECHANGE:
4821 case WRITE_FADVISE_DONTNEED:
4822 case NOSCRUB:
4823 case NODEEP_SCRUB:
4824 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
4825 if (i->second == *it)
4826 break;
4827 }
4828 assert(i != ALL_CHOICES.end());
4829 ss << i->first << ": " <<
4830 (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
4831 "true" : "false") << "\n";
4832 break;
4833 case MIN_WRITE_RECENCY_FOR_PROMOTE:
4834 ss << "min_write_recency_for_promote: " <<
4835 p->min_write_recency_for_promote << "\n";
4836 break;
4837 case FAST_READ:
4838 ss << "fast_read: " << p->fast_read << "\n";
4839 break;
4840 case SCRUB_MIN_INTERVAL:
4841 case SCRUB_MAX_INTERVAL:
4842 case DEEP_SCRUB_INTERVAL:
4843 case RECOVERY_PRIORITY:
4844 case RECOVERY_OP_PRIORITY:
4845 case SCRUB_PRIORITY:
4846 case COMPRESSION_MODE:
4847 case COMPRESSION_ALGORITHM:
4848 case COMPRESSION_REQUIRED_RATIO:
4849 case COMPRESSION_MAX_BLOB_SIZE:
4850 case COMPRESSION_MIN_BLOB_SIZE:
4851 case CSUM_TYPE:
4852 case CSUM_MAX_BLOCK:
4853 case CSUM_MIN_BLOCK:
4854 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
4855 if (i->second == *it)
4856 break;
4857 }
4858 assert(i != ALL_CHOICES.end());
4859 {
4860 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
4861 if (p->opts.is_set(key)) {
4862 if(key == pool_opts_t::CSUM_TYPE) {
4863 int val;
4864 p->opts.get(key, &val);
4865 ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
4866 } else {
4867 ss << i->first << ": " << p->opts.get(key) << "\n";
4868 }
4869 }
4870 }
4871 break;
4872 }
4873 rdata.append(ss.str());
4874 ss.str("");
4875 }
4876 }
4877 r = 0;
4878 } else if (prefix == "osd pool stats") {
31f18b77
FG
4879 r = mon->pgservice->process_pg_command(prefix, cmdmap,
4880 osdmap, f.get(), &ss, &rdata);
7c673cae
FG
4881 } else if (prefix == "osd pool get-quota") {
4882 string pool_name;
4883 cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
4884
4885 int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
4886 if (poolid < 0) {
4887 assert(poolid == -ENOENT);
4888 ss << "unrecognized pool '" << pool_name << "'";
4889 r = -ENOENT;
4890 goto reply;
4891 }
4892 const pg_pool_t *p = osdmap.get_pg_pool(poolid);
4893
4894 if (f) {
4895 f->open_object_section("pool_quotas");
4896 f->dump_string("pool_name", pool_name);
4897 f->dump_unsigned("pool_id", poolid);
4898 f->dump_unsigned("quota_max_objects", p->quota_max_objects);
4899 f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
4900 f->close_section();
4901 f->flush(rdata);
4902 } else {
4903 stringstream rs;
4904 rs << "quotas for pool '" << pool_name << "':\n"
4905 << " max objects: ";
4906 if (p->quota_max_objects == 0)
4907 rs << "N/A";
4908 else
4909 rs << si_t(p->quota_max_objects) << " objects";
4910 rs << "\n"
4911 << " max bytes : ";
4912 if (p->quota_max_bytes == 0)
4913 rs << "N/A";
4914 else
4915 rs << si_t(p->quota_max_bytes) << "B";
4916 rdata.append(rs.str());
4917 }
4918 rdata.append("\n");
4919 r = 0;
4920 } else if (prefix == "osd crush rule list" ||
4921 prefix == "osd crush rule ls") {
c07f9fc5
FG
4922 if (f) {
4923 f->open_array_section("rules");
4924 osdmap.crush->list_rules(f.get());
4925 f->close_section();
4926 f->flush(rdata);
4927 } else {
4928 ostringstream ss;
4929 osdmap.crush->list_rules(&ss);
4930 rdata.append(ss.str());
4931 }
b5b8bbf5
FG
4932 } else if (prefix == "osd crush rule ls-by-class") {
4933 string class_name;
4934 cmd_getval(g_ceph_context, cmdmap, "class", class_name);
4935 if (class_name.empty()) {
4936 ss << "no class specified";
4937 r = -EINVAL;
4938 goto reply;
4939 }
4940 set<int> rules;
4941 r = osdmap.crush->get_rules_by_class(class_name, &rules);
4942 if (r < 0) {
4943 ss << "failed to get rules by class '" << class_name << "'";
4944 goto reply;
4945 }
4946 if (f) {
4947 f->open_array_section("rules");
4948 for (auto &rule: rules) {
4949 f->dump_string("name", osdmap.crush->get_rule_name(rule));
4950 }
4951 f->close_section();
4952 f->flush(rdata);
4953 } else {
4954 ostringstream rs;
4955 for (auto &rule: rules) {
4956 rs << osdmap.crush->get_rule_name(rule) << "\n";
4957 }
4958 rdata.append(rs.str());
4959 }
7c673cae
FG
4960 } else if (prefix == "osd crush rule dump") {
4961 string name;
4962 cmd_getval(g_ceph_context, cmdmap, "name", name);
4963 string format;
4964 cmd_getval(g_ceph_context, cmdmap, "format", format);
4965 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4966 if (name == "") {
4967 f->open_array_section("rules");
4968 osdmap.crush->dump_rules(f.get());
4969 f->close_section();
4970 } else {
4971 int ruleno = osdmap.crush->get_rule_id(name);
4972 if (ruleno < 0) {
31f18b77 4973 ss << "unknown crush rule '" << name << "'";
7c673cae
FG
4974 r = ruleno;
4975 goto reply;
4976 }
4977 osdmap.crush->dump_rule(ruleno, f.get());
4978 }
4979 ostringstream rs;
4980 f->flush(rs);
4981 rs << "\n";
4982 rdata.append(rs.str());
4983 } else if (prefix == "osd crush dump") {
4984 string format;
4985 cmd_getval(g_ceph_context, cmdmap, "format", format);
4986 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4987 f->open_object_section("crush_map");
4988 osdmap.crush->dump(f.get());
4989 f->close_section();
4990 ostringstream rs;
4991 f->flush(rs);
4992 rs << "\n";
4993 rdata.append(rs.str());
4994 } else if (prefix == "osd crush show-tunables") {
4995 string format;
4996 cmd_getval(g_ceph_context, cmdmap, "format", format);
4997 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4998 f->open_object_section("crush_map_tunables");
4999 osdmap.crush->dump_tunables(f.get());
5000 f->close_section();
5001 ostringstream rs;
5002 f->flush(rs);
5003 rs << "\n";
5004 rdata.append(rs.str());
5005 } else if (prefix == "osd crush tree") {
c07f9fc5
FG
5006 string shadow;
5007 cmd_getval(g_ceph_context, cmdmap, "shadow", shadow);
5008 bool show_shadow = shadow == "--show-shadow";
5009 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5010 if (f) {
5011 osdmap.crush->dump_tree(nullptr,
5012 f.get(),
5013 osdmap.get_pool_names(),
5014 show_shadow);
5015 f->flush(rdata);
5016 } else {
5017 ostringstream ss;
5018 osdmap.crush->dump_tree(&ss,
5019 nullptr,
5020 osdmap.get_pool_names(),
5021 show_shadow);
5022 rdata.append(ss.str());
5023 }
d2e6a577
FG
5024 } else if (prefix == "osd crush ls") {
5025 string name;
5026 if (!cmd_getval(g_ceph_context, cmdmap, "node", name)) {
5027 ss << "no node specified";
5028 r = -EINVAL;
5029 goto reply;
5030 }
5031 if (!osdmap.crush->name_exists(name)) {
5032 ss << "node '" << name << "' does not exist";
5033 r = -ENOENT;
5034 goto reply;
5035 }
5036 int id = osdmap.crush->get_item_id(name);
5037 list<int> result;
5038 if (id >= 0) {
5039 result.push_back(id);
5040 } else {
5041 int num = osdmap.crush->get_bucket_size(id);
5042 for (int i = 0; i < num; ++i) {
5043 result.push_back(osdmap.crush->get_bucket_item(id, i));
5044 }
5045 }
5046 if (f) {
5047 f->open_array_section("items");
5048 for (auto i : result) {
5049 f->dump_string("item", osdmap.crush->get_item_name(i));
5050 }
5051 f->close_section();
5052 f->flush(rdata);
5053 } else {
5054 ostringstream ss;
5055 for (auto i : result) {
5056 ss << osdmap.crush->get_item_name(i) << "\n";
5057 }
5058 rdata.append(ss.str());
5059 }
5060 r = 0;
7c673cae
FG
5061 } else if (prefix == "osd crush class ls") {
5062 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5063 f->open_array_section("crush_classes");
5064 for (auto i : osdmap.crush->class_name)
5065 f->dump_string("class", i.second);
5066 f->close_section();
5067 f->flush(rdata);
224ce89b
WB
5068 } else if (prefix == "osd crush class ls-osd") {
5069 string name;
5070 cmd_getval(g_ceph_context, cmdmap, "class", name);
224ce89b
WB
5071 set<int> osds;
5072 osdmap.crush->get_devices_by_class(name, &osds);
b5b8bbf5
FG
5073 if (f) {
5074 f->open_array_section("osds");
5075 for (auto &osd: osds)
5076 f->dump_int("osd", osd);
5077 f->close_section();
5078 f->flush(rdata);
5079 } else {
5080 bool first = true;
5081 for (auto &osd : osds) {
5082 if (!first)
5083 ds << "\n";
5084 first = false;
5085 ds << osd;
5086 }
5087 rdata.append(ds);
5088 }
7c673cae
FG
5089 } else if (prefix == "osd erasure-code-profile ls") {
5090 const auto &profiles = osdmap.get_erasure_code_profiles();
5091 if (f)
5092 f->open_array_section("erasure-code-profiles");
5093 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
5094 if (f)
5095 f->dump_string("profile", i->first.c_str());
5096 else
5097 rdata.append(i->first + "\n");
5098 }
5099 if (f) {
5100 f->close_section();
5101 ostringstream rs;
5102 f->flush(rs);
5103 rs << "\n";
5104 rdata.append(rs.str());
5105 }
c07f9fc5
FG
5106 } else if (prefix == "osd crush weight-set ls") {
5107 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5108 if (f) {
5109 f->open_array_section("weight_sets");
5110 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
5111 f->dump_string("pool", "(compat)");
5112 }
5113 for (auto& i : osdmap.crush->choose_args) {
5114 if (i.first >= 0) {
5115 f->dump_string("pool", osdmap.get_pool_name(i.first));
5116 }
5117 }
5118 f->close_section();
5119 f->flush(rdata);
5120 } else {
5121 ostringstream rs;
5122 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
5123 rs << "(compat)\n";
5124 }
5125 for (auto& i : osdmap.crush->choose_args) {
5126 if (i.first >= 0) {
5127 rs << osdmap.get_pool_name(i.first) << "\n";
5128 }
5129 }
5130 rdata.append(rs.str());
5131 }
5132 } else if (prefix == "osd crush weight-set dump") {
5133 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
5134 "json-pretty"));
5135 osdmap.crush->dump_choose_args(f.get());
5136 f->flush(rdata);
7c673cae
FG
5137 } else if (prefix == "osd erasure-code-profile get") {
5138 string name;
5139 cmd_getval(g_ceph_context, cmdmap, "name", name);
5140 if (!osdmap.has_erasure_code_profile(name)) {
5141 ss << "unknown erasure code profile '" << name << "'";
5142 r = -ENOENT;
5143 goto reply;
5144 }
5145 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
5146 if (f)
5147 f->open_object_section("profile");
5148 for (map<string,string>::const_iterator i = profile.begin();
5149 i != profile.end();
5150 ++i) {
5151 if (f)
5152 f->dump_string(i->first.c_str(), i->second.c_str());
5153 else
5154 rdata.append(i->first + "=" + i->second + "\n");
5155 }
5156 if (f) {
5157 f->close_section();
5158 ostringstream rs;
5159 f->flush(rs);
5160 rs << "\n";
5161 rdata.append(rs.str());
5162 }
5163 } else {
5164 // try prepare update
5165 return false;
5166 }
5167
5168 reply:
5169 string rs;
5170 getline(ss, rs);
5171 mon->reply_command(op, r, rs, rdata, get_last_committed());
5172 return true;
5173}
5174
5175void OSDMonitor::update_pool_flags(int64_t pool_id, uint64_t flags)
5176{
5177 const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
5178 pending_inc.get_new_pool(pool_id, pool)->flags = flags;
5179}
5180
5181bool OSDMonitor::update_pools_status()
5182{
31f18b77 5183 if (!mon->pgservice->is_readable())
7c673cae
FG
5184 return false;
5185
5186 bool ret = false;
5187
5188 auto& pools = osdmap.get_pools();
5189 for (auto it = pools.begin(); it != pools.end(); ++it) {
31f18b77
FG
5190 const pool_stat_t *pstat = mon->pgservice->get_pool_stat(it->first);
5191 if (!pstat)
7c673cae 5192 continue;
31f18b77 5193 const object_stat_sum_t& sum = pstat->stats.sum;
7c673cae
FG
5194 const pg_pool_t &pool = it->second;
5195 const string& pool_name = osdmap.get_pool_name(it->first);
5196
5197 bool pool_is_full =
5198 (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
5199 (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);
5200
5201 if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
5202 if (pool_is_full)
5203 continue;
5204
5205 mon->clog->info() << "pool '" << pool_name
5206 << "' no longer full; removing FULL flag";
5207
5208 update_pool_flags(it->first, pool.get_flags() & ~pg_pool_t::FLAG_FULL);
5209 ret = true;
5210 } else {
5211 if (!pool_is_full)
5212 continue;
5213
5214 if (pool.quota_max_bytes > 0 &&
5215 (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
5216 mon->clog->warn() << "pool '" << pool_name << "' is full"
5217 << " (reached quota's max_bytes: "
5218 << si_t(pool.quota_max_bytes) << ")";
5219 }
5220 if (pool.quota_max_objects > 0 &&
5221 (uint64_t)sum.num_objects >= pool.quota_max_objects) {
5222 mon->clog->warn() << "pool '" << pool_name << "' is full"
5223 << " (reached quota's max_objects: "
5224 << pool.quota_max_objects << ")";
5225 }
5226 update_pool_flags(it->first, pool.get_flags() | pg_pool_t::FLAG_FULL);
5227 ret = true;
5228 }
5229 }
5230 return ret;
5231}
5232
7c673cae
FG
5233int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
5234{
5235 op->mark_osdmon_event(__func__);
5236 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
5237 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
5238 MonSession *session = m->get_session();
5239 if (!session)
5240 return -EPERM;
5241 string erasure_code_profile;
5242 stringstream ss;
31f18b77 5243 string rule_name;
7c673cae 5244 if (m->auid)
31f18b77 5245 return prepare_new_pool(m->name, m->auid, m->crush_rule, rule_name,
7c673cae
FG
5246 0, 0,
5247 erasure_code_profile,
5248 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, &ss);
5249 else
31f18b77 5250 return prepare_new_pool(m->name, session->auid, m->crush_rule, rule_name,
7c673cae
FG
5251 0, 0,
5252 erasure_code_profile,
5253 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, &ss);
5254}
5255
5256int OSDMonitor::crush_rename_bucket(const string& srcname,
5257 const string& dstname,
5258 ostream *ss)
5259{
5260 int ret;
5261 //
5262 // Avoid creating a pending crush if it does not already exists and
5263 // the rename would fail.
5264 //
5265 if (!_have_pending_crush()) {
5266 ret = _get_stable_crush().can_rename_bucket(srcname,
5267 dstname,
5268 ss);
5269 if (ret)
5270 return ret;
5271 }
5272
5273 CrushWrapper newcrush;
5274 _get_pending_crush(newcrush);
5275
5276 ret = newcrush.rename_bucket(srcname,
5277 dstname,
5278 ss);
5279 if (ret)
5280 return ret;
5281
5282 pending_inc.crush.clear();
5283 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
5284 *ss << "renamed bucket " << srcname << " into " << dstname;
5285 return 0;
5286}
5287
5288void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
5289{
5290 string replacement = "";
5291
5292 if (plugin == "jerasure_generic" ||
5293 plugin == "jerasure_sse3" ||
5294 plugin == "jerasure_sse4" ||
5295 plugin == "jerasure_neon") {
5296 replacement = "jerasure";
5297 } else if (plugin == "shec_generic" ||
5298 plugin == "shec_sse3" ||
5299 plugin == "shec_sse4" ||
5300 plugin == "shec_neon") {
5301 replacement = "shec";
5302 }
5303
5304 if (replacement != "") {
5305 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
5306 << plugin << " that has been deprecated. Please use "
5307 << replacement << " instead." << dendl;
5308 }
5309}
5310
5311int OSDMonitor::normalize_profile(const string& profilename,
5312 ErasureCodeProfile &profile,
5313 bool force,
5314 ostream *ss)
5315{
5316 ErasureCodeInterfaceRef erasure_code;
5317 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
5318 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
5319 check_legacy_ec_plugin(plugin->second, profilename);
5320 int err = instance.factory(plugin->second,
5321 g_conf->get_val<std::string>("erasure_code_dir"),
5322 profile, &erasure_code, ss);
5323 if (err) {
5324 return err;
5325 }
5326
5327 err = erasure_code->init(profile, ss);
5328 if (err) {
5329 return err;
5330 }
5331
5332 auto it = profile.find("stripe_unit");
5333 if (it != profile.end()) {
5334 string err_str;
5335 uint32_t stripe_unit = strict_si_cast<uint32_t>(it->second.c_str(), &err_str);
5336 if (!err_str.empty()) {
5337 *ss << "could not parse stripe_unit '" << it->second
5338 << "': " << err_str << std::endl;
5339 return -EINVAL;
5340 }
5341 uint32_t data_chunks = erasure_code->get_data_chunk_count();
5342 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
5343 if (chunk_size != stripe_unit) {
5344 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
5345 << "alignment. Would be padded to " << chunk_size
5346 << std::endl;
5347 return -EINVAL;
5348 }
5349 if ((stripe_unit % 4096) != 0 && !force) {
5350 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
5351 << "use --force to override this check" << std::endl;
5352 return -EINVAL;
5353 }
5354 }
5355 return 0;
5356}
5357
31f18b77 5358int OSDMonitor::crush_rule_create_erasure(const string &name,
7c673cae 5359 const string &profile,
31f18b77 5360 int *rule,
7c673cae
FG
5361 ostream *ss)
5362{
5363 int ruleid = osdmap.crush->get_rule_id(name);
5364 if (ruleid != -ENOENT) {
31f18b77 5365 *rule = osdmap.crush->get_rule_mask_ruleset(ruleid);
7c673cae
FG
5366 return -EEXIST;
5367 }
5368
5369 CrushWrapper newcrush;
5370 _get_pending_crush(newcrush);
5371
5372 ruleid = newcrush.get_rule_id(name);
5373 if (ruleid != -ENOENT) {
31f18b77 5374 *rule = newcrush.get_rule_mask_ruleset(ruleid);
7c673cae
FG
5375 return -EALREADY;
5376 } else {
5377 ErasureCodeInterfaceRef erasure_code;
5378 int err = get_erasure_code(profile, &erasure_code, ss);
5379 if (err) {
5380 *ss << "failed to load plugin using profile " << profile << std::endl;
5381 return err;
5382 }
5383
224ce89b 5384 err = erasure_code->create_rule(name, newcrush, ss);
7c673cae
FG
5385 erasure_code.reset();
5386 if (err < 0)
5387 return err;
31f18b77 5388 *rule = err;
7c673cae
FG
5389 pending_inc.crush.clear();
5390 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
5391 return 0;
5392 }
5393}
5394
5395int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
5396 ErasureCodeInterfaceRef *erasure_code,
5397 ostream *ss) const
5398{
5399 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
5400 return -EAGAIN;
5401 ErasureCodeProfile profile =
5402 osdmap.get_erasure_code_profile(erasure_code_profile);
5403 ErasureCodeProfile::const_iterator plugin =
5404 profile.find("plugin");
5405 if (plugin == profile.end()) {
5406 *ss << "cannot determine the erasure code plugin"
5407 << " because there is no 'plugin' entry in the erasure_code_profile "
5408 << profile << std::endl;
5409 return -EINVAL;
5410 }
5411 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
5412 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
5413 return instance.factory(plugin->second,
5414 g_conf->get_val<std::string>("erasure_code_dir"),
5415 profile, erasure_code, ss);
5416}
5417
5418int OSDMonitor::check_cluster_features(uint64_t features,
5419 stringstream &ss)
5420{
5421 stringstream unsupported_ss;
5422 int unsupported_count = 0;
5423 if ((mon->get_quorum_con_features() & features) != features) {
5424 unsupported_ss << "the monitor cluster";
5425 ++unsupported_count;
5426 }
5427
5428 set<int32_t> up_osds;
5429 osdmap.get_up_osds(up_osds);
5430 for (set<int32_t>::iterator it = up_osds.begin();
5431 it != up_osds.end(); ++it) {
5432 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
5433 if ((xi.features & features) != features) {
5434 if (unsupported_count > 0)
5435 unsupported_ss << ", ";
5436 unsupported_ss << "osd." << *it;
5437 unsupported_count ++;
5438 }
5439 }
5440
5441 if (unsupported_count > 0) {
5442 ss << "features " << features << " unsupported by: "
5443 << unsupported_ss.str();
5444 return -ENOTSUP;
5445 }
5446
5447 // check pending osd state, too!
5448 for (map<int32_t,osd_xinfo_t>::const_iterator p =
5449 pending_inc.new_xinfo.begin();
5450 p != pending_inc.new_xinfo.end(); ++p) {
5451 const osd_xinfo_t &xi = p->second;
5452 if ((xi.features & features) != features) {
5453 dout(10) << __func__ << " pending osd." << p->first
5454 << " features are insufficient; retry" << dendl;
5455 return -EAGAIN;
5456 }
5457 }
5458
5459 return 0;
5460}
5461
5462bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
5463 stringstream& ss)
5464{
5465 OSDMap::Incremental new_pending = pending_inc;
5466 ::encode(*newcrush, new_pending.crush, mon->get_quorum_con_features());
5467 OSDMap newmap;
5468 newmap.deepish_copy_from(osdmap);
5469 newmap.apply_incremental(new_pending);
5470
5471 // client compat
31f18b77 5472 if (newmap.require_min_compat_client > 0) {
7c673cae 5473 auto mv = newmap.get_min_compat_client();
31f18b77
FG
5474 if (mv > newmap.require_min_compat_client) {
5475 ss << "new crush map requires client version " << ceph_release_name(mv)
7c673cae 5476 << " but require_min_compat_client is "
31f18b77 5477 << ceph_release_name(newmap.require_min_compat_client);
7c673cae
FG
5478 return false;
5479 }
5480 }
5481
5482 // osd compat
5483 uint64_t features =
5484 newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
5485 newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
5486 stringstream features_ss;
5487 int r = check_cluster_features(features, features_ss);
5488 if (r) {
5489 ss << "Could not change CRUSH: " << features_ss.str();
5490 return false;
5491 }
5492
5493 return true;
5494}
5495
5496bool OSDMonitor::erasure_code_profile_in_use(
5497 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
5498 const string &profile,
5499 ostream *ss)
5500{
5501 bool found = false;
5502 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
5503 p != pools.end();
5504 ++p) {
5505 if (p->second.erasure_code_profile == profile) {
5506 *ss << osdmap.pool_name[p->first] << " ";
5507 found = true;
5508 }
5509 }
5510 if (found) {
5511 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
5512 }
5513 return found;
5514}
5515
5516int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
5517 map<string,string> *erasure_code_profile_map,
5518 ostream *ss)
5519{
5520 int r = get_json_str_map(g_conf->osd_pool_default_erasure_code_profile,
5521 *ss,
5522 erasure_code_profile_map);
5523 if (r)
5524 return r;
5525 assert((*erasure_code_profile_map).count("plugin"));
5526 string default_plugin = (*erasure_code_profile_map)["plugin"];
5527 map<string,string> user_map;
5528 for (vector<string>::const_iterator i = erasure_code_profile.begin();
5529 i != erasure_code_profile.end();
5530 ++i) {
5531 size_t equal = i->find('=');
5532 if (equal == string::npos) {
5533 user_map[*i] = string();
5534 (*erasure_code_profile_map)[*i] = string();
5535 } else {
5536 const string key = i->substr(0, equal);
5537 equal++;
5538 const string value = i->substr(equal);
5539 user_map[key] = value;
5540 (*erasure_code_profile_map)[key] = value;
5541 }
5542 }
5543
5544 if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
5545 (*erasure_code_profile_map) = user_map;
5546
5547 return 0;
5548}
5549
5550int OSDMonitor::prepare_pool_size(const unsigned pool_type,
5551 const string &erasure_code_profile,
5552 unsigned *size, unsigned *min_size,
5553 ostream *ss)
5554{
5555 int err = 0;
5556 switch (pool_type) {
5557 case pg_pool_t::TYPE_REPLICATED:
5558 *size = g_conf->osd_pool_default_size;
5559 *min_size = g_conf->get_osd_pool_default_min_size();
5560 break;
5561 case pg_pool_t::TYPE_ERASURE:
5562 {
5563 ErasureCodeInterfaceRef erasure_code;
5564 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
5565 if (err == 0) {
5566 *size = erasure_code->get_chunk_count();
5567 *min_size = MIN(erasure_code->get_data_chunk_count() + 1, *size);
5568 }
5569 }
5570 break;
5571 default:
5572 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
5573 err = -EINVAL;
5574 break;
5575 }
5576 return err;
5577}
5578
5579int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
5580 const string &erasure_code_profile,
5581 uint32_t *stripe_width,
5582 ostream *ss)
5583{
5584 int err = 0;
5585 switch (pool_type) {
5586 case pg_pool_t::TYPE_REPLICATED:
5587 // ignored
5588 break;
5589 case pg_pool_t::TYPE_ERASURE:
5590 {
5591 ErasureCodeProfile profile =
5592 osdmap.get_erasure_code_profile(erasure_code_profile);
5593 ErasureCodeInterfaceRef erasure_code;
5594 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
5595 if (err)
5596 break;
5597 uint32_t data_chunks = erasure_code->get_data_chunk_count();
5598 uint32_t stripe_unit = g_conf->osd_pool_erasure_code_stripe_unit;
5599 auto it = profile.find("stripe_unit");
5600 if (it != profile.end()) {
5601 string err_str;
5602 stripe_unit = strict_si_cast<uint32_t>(it->second.c_str(), &err_str);
5603 assert(err_str.empty());
5604 }
5605 *stripe_width = data_chunks *
5606 erasure_code->get_chunk_size(stripe_unit * data_chunks);
5607 }
5608 break;
5609 default:
5610 *ss << "prepare_pool_stripe_width: "
5611 << pool_type << " is not a known pool type";
5612 err = -EINVAL;
5613 break;
5614 }
5615 return err;
5616}
5617
31f18b77 5618int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
224ce89b
WB
5619 const string &erasure_code_profile,
5620 const string &rule_name,
5621 int *crush_rule,
5622 ostream *ss)
7c673cae
FG
5623{
5624
31f18b77 5625 if (*crush_rule < 0) {
7c673cae
FG
5626 switch (pool_type) {
5627 case pg_pool_t::TYPE_REPLICATED:
5628 {
31f18b77 5629 if (rule_name == "") {
224ce89b 5630 // Use default rule
31f18b77
FG
5631 *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(g_ceph_context);
5632 if (*crush_rule < 0) {
5633 // Errors may happen e.g. if no valid rule is available
5634 *ss << "No suitable CRUSH rule exists, check "
7c673cae
FG
5635 << "'osd pool default crush *' config options";
5636 return -ENOENT;
5637 }
5638 } else {
31f18b77 5639 return get_crush_rule(rule_name, crush_rule, ss);
7c673cae
FG
5640 }
5641 }
5642 break;
5643 case pg_pool_t::TYPE_ERASURE:
5644 {
31f18b77 5645 int err = crush_rule_create_erasure(rule_name,
7c673cae 5646 erasure_code_profile,
31f18b77 5647 crush_rule, ss);
7c673cae
FG
5648 switch (err) {
5649 case -EALREADY:
31f18b77
FG
5650 dout(20) << "prepare_pool_crush_rule: rule "
5651 << rule_name << " try again" << dendl;
7c673cae
FG
5652 // fall through
5653 case 0:
5654 // need to wait for the crush rule to be proposed before proceeding
5655 err = -EAGAIN;
5656 break;
5657 case -EEXIST:
5658 err = 0;
5659 break;
5660 }
5661 return err;
5662 }
5663 break;
5664 default:
31f18b77 5665 *ss << "prepare_pool_crush_rule: " << pool_type
7c673cae
FG
5666 << " is not a known pool type";
5667 return -EINVAL;
5668 break;
5669 }
5670 } else {
31f18b77
FG
5671 if (!osdmap.crush->ruleset_exists(*crush_rule)) {
5672 *ss << "CRUSH rule " << *crush_rule << " not found";
7c673cae
FG
5673 return -ENOENT;
5674 }
5675 }
5676
5677 return 0;
5678}
5679
31f18b77 5680int OSDMonitor::get_crush_rule(const string &rule_name,
224ce89b
WB
5681 int *crush_rule,
5682 ostream *ss)
7c673cae
FG
5683{
5684 int ret;
31f18b77 5685 ret = osdmap.crush->get_rule_id(rule_name);
7c673cae
FG
5686 if (ret != -ENOENT) {
5687 // found it, use it
31f18b77 5688 *crush_rule = ret;
7c673cae
FG
5689 } else {
5690 CrushWrapper newcrush;
5691 _get_pending_crush(newcrush);
5692
31f18b77 5693 ret = newcrush.get_rule_id(rule_name);
7c673cae
FG
5694 if (ret != -ENOENT) {
5695 // found it, wait for it to be proposed
31f18b77 5696 dout(20) << __func__ << ": rule " << rule_name
7c673cae
FG
5697 << " try again" << dendl;
5698 return -EAGAIN;
5699 } else {
224ce89b 5700 // Cannot find it , return error
31f18b77 5701 *ss << "specified rule " << rule_name << " doesn't exist";
7c673cae
FG
5702 return ret;
5703 }
5704 }
5705 return 0;
5706}
5707
5708/**
5709 * @param name The name of the new pool
5710 * @param auid The auid of the pool owner. Can be -1
31f18b77
FG
5711 * @param crush_rule The crush rule to use. If <0, will use the system default
 * @param crush_rule_name The crush rule to use, if crush_rule <0
7c673cae
FG
5713 * @param pg_num The pg_num to use. If set to 0, will use the system default
5714 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
5715 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
5716 * @param pool_type TYPE_ERASURE, or TYPE_REP
5717 * @param expected_num_objects expected number of objects on the pool
5718 * @param fast_read fast read type.
5719 * @param ss human readable error message, if any.
5720 *
5721 * @return 0 on success, negative errno on failure.
5722 */
5723int OSDMonitor::prepare_new_pool(string& name, uint64_t auid,
31f18b77
FG
5724 int crush_rule,
5725 const string &crush_rule_name,
7c673cae
FG
5726 unsigned pg_num, unsigned pgp_num,
5727 const string &erasure_code_profile,
5728 const unsigned pool_type,
5729 const uint64_t expected_num_objects,
5730 FastReadType fast_read,
5731 ostream *ss)
5732{
5733 if (name.length() == 0)
5734 return -EINVAL;
5735 if (pg_num == 0)
5736 pg_num = g_conf->osd_pool_default_pg_num;
5737 if (pgp_num == 0)
5738 pgp_num = g_conf->osd_pool_default_pgp_num;
5739 if (pg_num > (unsigned)g_conf->mon_max_pool_pg_num) {
5740 *ss << "'pg_num' must be greater than 0 and less than or equal to "
5741 << g_conf->mon_max_pool_pg_num
5742 << " (you may adjust 'mon max pool pg num' for higher values)";
5743 return -ERANGE;
5744 }
5745 if (pgp_num > pg_num) {
5746 *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
5747 << ", which in this case is " << pg_num;
5748 return -ERANGE;
5749 }
5750 if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
5751 *ss << "'fast_read' can only apply to erasure coding pool";
5752 return -EINVAL;
5753 }
5754 int r;
31f18b77
FG
5755 r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
5756 crush_rule_name, &crush_rule, ss);
7c673cae 5757 if (r) {
31f18b77 5758 dout(10) << " prepare_pool_crush_rule returns " << r << dendl;
7c673cae
FG
5759 return r;
5760 }
224ce89b
WB
5761 if (g_conf->mon_osd_crush_smoke_test) {
5762 CrushWrapper newcrush;
5763 _get_pending_crush(newcrush);
5764 ostringstream err;
5765 CrushTester tester(newcrush, err);
b5b8bbf5 5766 tester.set_min_x(0);
224ce89b
WB
5767 tester.set_max_x(50);
5768 tester.set_rule(crush_rule);
b5b8bbf5 5769 auto start = ceph::coarse_mono_clock::now();
224ce89b 5770 r = tester.test_with_fork(g_conf->mon_lease);
b5b8bbf5 5771 auto duration = ceph::coarse_mono_clock::now() - start;
224ce89b
WB
5772 if (r < 0) {
5773 dout(10) << " tester.test_with_fork returns " << r
5774 << ": " << err.str() << dendl;
5775 *ss << "crush test failed with " << r << ": " << err.str();
5776 return r;
5777 }
b5b8bbf5
FG
5778 dout(10) << __func__ << " crush somke test duration: "
5779 << duration << dendl;
7c673cae
FG
5780 }
5781 unsigned size, min_size;
5782 r = prepare_pool_size(pool_type, erasure_code_profile, &size, &min_size, ss);
5783 if (r) {
5784 dout(10) << " prepare_pool_size returns " << r << dendl;
5785 return r;
5786 }
5787
31f18b77 5788 if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) {
7c673cae
FG
5789 return -EINVAL;
5790 }
5791
5792 uint32_t stripe_width = 0;
5793 r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
5794 if (r) {
5795 dout(10) << " prepare_pool_stripe_width returns " << r << dendl;
5796 return r;
5797 }
5798
5799 bool fread = false;
5800 if (pool_type == pg_pool_t::TYPE_ERASURE) {
5801 switch (fast_read) {
5802 case FAST_READ_OFF:
5803 fread = false;
5804 break;
5805 case FAST_READ_ON:
5806 fread = true;
5807 break;
5808 case FAST_READ_DEFAULT:
5809 fread = g_conf->mon_osd_pool_ec_fast_read;
5810 break;
5811 default:
5812 *ss << "invalid fast_read setting: " << fast_read;
5813 return -EINVAL;
5814 }
5815 }
5816
5817 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
5818 p != pending_inc.new_pool_names.end();
5819 ++p) {
5820 if (p->second == name)
5821 return 0;
5822 }
5823
5824 if (-1 == pending_inc.new_pool_max)
5825 pending_inc.new_pool_max = osdmap.pool_max;
5826 int64_t pool = ++pending_inc.new_pool_max;
5827 pg_pool_t empty;
5828 pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
5829 pi->type = pool_type;
5830 pi->fast_read = fread;
5831 pi->flags = g_conf->osd_pool_default_flags;
5832 if (g_conf->osd_pool_default_flag_hashpspool)
5833 pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
5834 if (g_conf->osd_pool_default_flag_nodelete)
5835 pi->set_flag(pg_pool_t::FLAG_NODELETE);
5836 if (g_conf->osd_pool_default_flag_nopgchange)
5837 pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
5838 if (g_conf->osd_pool_default_flag_nosizechange)
5839 pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
5840 if (g_conf->osd_pool_use_gmt_hitset &&
5841 (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT))
5842 pi->use_gmt_hitset = true;
5843 else
5844 pi->use_gmt_hitset = false;
5845
5846 pi->size = size;
5847 pi->min_size = min_size;
31f18b77 5848 pi->crush_rule = crush_rule;
7c673cae
FG
5849 pi->expected_num_objects = expected_num_objects;
5850 pi->object_hash = CEPH_STR_HASH_RJENKINS;
5851 pi->set_pg_num(pg_num);
5852 pi->set_pgp_num(pgp_num);
5853 pi->last_change = pending_inc.epoch;
5854 pi->auid = auid;
5855 pi->erasure_code_profile = erasure_code_profile;
5856 pi->stripe_width = stripe_width;
5857 pi->cache_target_dirty_ratio_micro =
5858 g_conf->osd_pool_default_cache_target_dirty_ratio * 1000000;
5859 pi->cache_target_dirty_high_ratio_micro =
5860 g_conf->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
5861 pi->cache_target_full_ratio_micro =
5862 g_conf->osd_pool_default_cache_target_full_ratio * 1000000;
5863 pi->cache_min_flush_age = g_conf->osd_pool_default_cache_min_flush_age;
5864 pi->cache_min_evict_age = g_conf->osd_pool_default_cache_min_evict_age;
5865 pending_inc.new_pool_names[pool] = name;
5866 return 0;
5867}
5868
5869bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
5870{
5871 op->mark_osdmon_event(__func__);
5872 ostringstream ss;
5873 if (pending_inc.new_flags < 0)
5874 pending_inc.new_flags = osdmap.get_flags();
5875 pending_inc.new_flags |= flag;
5876 ss << OSDMap::get_flag_string(flag) << " is set";
5877 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
5878 get_last_committed() + 1));
5879 return true;
5880}
5881
5882bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
5883{
5884 op->mark_osdmon_event(__func__);
5885 ostringstream ss;
5886 if (pending_inc.new_flags < 0)
5887 pending_inc.new_flags = osdmap.get_flags();
5888 pending_inc.new_flags &= ~flag;
5889 ss << OSDMap::get_flag_string(flag) << " is unset";
5890 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
5891 get_last_committed() + 1));
5892 return true;
5893}
5894
7c673cae
FG
/**
 * Handle `ceph osd pool set <pool> <var> <val>`: validate the requested
 * property change and stage the updated pg_pool_t in the pending increment.
 *
 * @param cmdmap parsed command arguments ("pool", "var", "val", "force")
 * @param ss human-readable status/error message for the client
 * @return 0 on success (change staged), negative errno on rejection
 */
int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
					 stringstream& ss)
{
  string poolstr;
  cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
  int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
  if (pool < 0) {
    ss << "unrecognized pool '" << poolstr << "'";
    return -ENOENT;
  }
  string var;
  cmd_getval(g_ceph_context, cmdmap, "var", var);

  // start from the committed pool, but prefer an already-pending update so
  // multiple `pool set` commands in one proposal round compose correctly
  pg_pool_t p = *osdmap.get_pg_pool(pool);
  if (pending_inc.new_pools.count(pool))
    p = pending_inc.new_pools[pool];

  // accept val as a json string in the normal case (current
  // generation monitor).  parse out int or float values from the
  // string as needed.  however, if it is not a string, try to pull
  // out an int, in case an older monitor with an older json schema is
  // forwarding a request.
  string val;
  string interr, floaterr;   // parse-error text; empty means the parse worked
  int64_t n = 0;             // integer interpretation of val
  double f = 0;              // float interpretation of val
  int64_t uf = 0; // micro-f
  if (!cmd_getval(g_ceph_context, cmdmap, "val", val)) {
    // wasn't a string; maybe an older mon forwarded json with an int?
    if (!cmd_getval(g_ceph_context, cmdmap, "val", n))
      return -EINVAL; // no value!
  } else {
    // we got a string.  see if it contains an int.
    n = strict_strtoll(val.c_str(), 10, &interr);
    // or a float
    f = strict_strtod(val.c_str(), &floaterr);
    uf = llrintl(f * (double)1000000.0);
  }

  // these properties only make sense on a cache tier
  if (!p.is_tier() &&
      (var == "hit_set_type" || var == "hit_set_period" ||
       var == "hit_set_count" || var == "hit_set_fpp" ||
       var == "target_max_objects" || var == "target_max_bytes" ||
       var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
       var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
       var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
       var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
       var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
    return -EACCES;
  }

  if (var == "size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (p.type == pg_pool_t::TYPE_ERASURE) {
      ss << "can not change the size of an erasure-coded pool";
      return -ENOTSUP;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0 || n > 10) {
      ss << "pool size must be between 1 and 10";
      return -EINVAL;
    }
    p.size = n;
    // keep min_size consistent when size shrinks below it
    if (n < p.min_size)
      p.min_size = n;
  } else if (var == "min_size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }

    if (p.type != pg_pool_t::TYPE_ERASURE) {
      if (n < 1 || n > p.size) {
	ss << "pool min_size must be between 1 and " << (int)p.size;
	return -EINVAL;
      }
    } else {
      // for EC pools min_size may not drop below k (the data chunk count)
      ErasureCodeInterfaceRef erasure_code;
      int k;
      stringstream tmp;
      int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
      if (err == 0) {
	k = erasure_code->get_data_chunk_count();
      } else {
	ss << __func__ << " get_erasure_code failed: " << tmp.rdbuf();
	return err;
      }

      if (n < k || n > p.size) {
	ss << "pool min_size must be between " << k << " and " << (int)p.size;
	return -EINVAL;
      }
    }
    p.min_size = n;
  } else if (var == "auid") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.auid = n;
  } else if (var == "crash_replay_interval") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.crash_replay_interval = n;
  } else if (var == "pg_num") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    // pg_num can only grow; equal is treated as an idempotent no-op
    if (n <= (int)p.get_pg_num()) {
      ss << "specified pg_num " << n << " <= current " << p.get_pg_num();
      if (n < (int)p.get_pg_num())
	return -EEXIST;
      return 0;
    }
    if (n > (unsigned)g_conf->mon_max_pool_pg_num) {
      ss << "'pg_num' must be greater than 0 and less than or equal to "
         << g_conf->mon_max_pool_pg_num
         << " (you may adjust 'mon max pool pg num' for higher values)";
      return -ERANGE;
    }
    string force;
    cmd_getval(g_ceph_context,cmdmap, "force", force);
    if (p.cache_mode != pg_pool_t::CACHEMODE_NONE &&
	force != "--yes-i-really-mean-it") {
      ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling.  use --yes-i-really-mean-it to force.";
      return -EPERM;
    }
    // cap the number of new PGs per OSD this split would create
    int expected_osds = MIN(p.get_pg_num(), osdmap.get_num_osds());
    int64_t new_pgs = n - p.get_pg_num();
    if (new_pgs > g_conf->mon_osd_max_split_count * expected_osds) {
      ss << "specified pg_num " << n << " is too large (creating "
	 << new_pgs << " new PGs on ~" << expected_osds
	 << " OSDs exceeds per-OSD max of " << g_conf->mon_osd_max_split_count
	 << ')';
      return -E2BIG;
    }
    p.set_pg_num(n);
    // force pre-luminous clients to resend their ops, since they
    // don't understand that split PGs now form a new interval.
    p.last_force_op_resend_preluminous = pending_inc.epoch;
  } else if (var == "pgp_num") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
      return -EINVAL;
    }
    p.set_pgp_num(n);
  } else if (var == "crush_rule") {
    int id = osdmap.crush->get_rule_id(val);
    if (id == -ENOENT) {
      ss << "crush rule " << val << " does not exist";
      return -ENOENT;
    }
    if (id < 0) {
      ss << cpp_strerror(id);
      return -ENOENT;
    }
    if (!osdmap.crush->check_crush_rule(id, p.get_type(), p.get_size(), ss)) {
      return -EINVAL;
    }
    p.crush_rule = id;
  } else if (var == "nodelete" || var == "nopgchange" ||
	     var == "nosizechange" || var == "write_fadvise_dontneed" ||
	     var == "noscrub" || var == "nodeep-scrub") {
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hashpspool") {
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    string force;
    cmd_getval(g_ceph_context, cmdmap, "force", force);
    if (force != "--yes-i-really-mean-it") {
      ss << "are you SURE?  this will remap all placement groups in this pool,"
	    " this triggers large data movement,"
	    " pass --yes-i-really-mean-it if you really do.";
      return -EPERM;
    }
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hit_set_type") {
    if (val == "none")
      p.hit_set_params = HitSet::Params();
    else {
      // a non-trivial hit set requires cluster-wide cachepool support
      int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
      if (err)
	return err;
      if (val == "bloom") {
	BloomHitSet::Params *bsp = new BloomHitSet::Params;
	bsp->set_fpp(g_conf->osd_pool_default_hit_set_bloom_fpp);
	p.hit_set_params = HitSet::Params(bsp);
      } else if (val == "explicit_hash")
	p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
      else if (val == "explicit_object")
	p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
      else {
	ss << "unrecognized hit_set type '" << val << "'";
	return -EINVAL;
      }
    }
  } else if (var == "hit_set_period") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.hit_set_period = n;
  } else if (var == "hit_set_count") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.hit_set_count = n;
  } else if (var == "hit_set_fpp") {
    if (floaterr.length()) {
      ss << "error parsing floating point value '" << val << "': " << floaterr;
      return -EINVAL;
    }
    // fpp only applies to bloom-filter hit sets
    if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
      ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
      return -EINVAL;
    }
    BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
    bloomp->set_fpp(f);
  } else if (var == "use_gmt_hitset") {
    if (val == "true" || (interr.empty() && n == 1)) {
      // all up OSDs must understand GMT hit sets before we can enable them
      if (!(osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT)) {
	ss << "not all OSDs support GMT hit set.";
	return -EINVAL;
      }
      p.use_gmt_hitset = true;
    } else {
      ss << "expecting value 'true' or '1'";
      return -EINVAL;
    }
  } else if (var == "allow_ec_overwrites") {
    if (!p.is_erasure()) {
      ss << "ec overwrites can only be enabled for an erasure coded pool";
      return -EINVAL;
    }
    stringstream err;
    if (!g_conf->mon_debug_no_require_bluestore_for_ec_overwrites &&
	!is_pool_currently_all_bluestore(pool, p, &err)) {
      ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
      return -EINVAL;
    }
    if (val == "true" || (interr.empty() && n == 1)) {
      p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      // one-way switch: existing overwritten data can't be un-supported
      ss << "ec overwrites cannot be disabled once enabled";
      return -EINVAL;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "target_max_objects") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_objects = n;
  } else if (var == "target_max_bytes") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_bytes = n;
  } else if (var == "cache_target_dirty_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_dirty_ratio_micro = uf;
  } else if (var == "cache_target_dirty_high_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_dirty_high_ratio_micro = uf;
  } else if (var == "cache_target_full_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_full_ratio_micro = uf;
  } else if (var == "cache_min_flush_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_flush_age = n;
  } else if (var == "cache_min_evict_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_evict_age = n;
  } else if (var == "min_read_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_read_recency_for_promote = n;
  } else if (var == "hit_set_grade_decay_rate") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > 100 || n < 0) {
      ss << "value out of range,valid range is 0 - 100";
      return -EINVAL;
    }
    p.hit_set_grade_decay_rate = n;
  } else if (var == "hit_set_search_last_n") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > p.hit_set_count || n < 0) {
      ss << "value out of range,valid range is 0 - hit_set_count";
      return -EINVAL;
    }
    p.hit_set_search_last_n = n;
  } else if (var == "min_write_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_write_recency_for_promote = n;
  } else if (var == "fast_read") {
    if (p.is_replicated()) {
      ss << "fast read is not supported in replication pool";
      return -EINVAL;
    }
    if (val == "true" || (interr.empty() && n == 1)) {
      p.fast_read = true;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.fast_read = false;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (pool_opts_t::is_opt_name(var)) {
    // generic per-pool options (compression, checksums, ...); the literal
    // value "unset" removes the option
    bool unset = val == "unset";
    if (var == "compression_mode") {
      if (!unset) {
	auto cmode = Compressor::get_comp_mode_type(val);
	if (!cmode) {
	  ss << "unrecognized compression mode '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "compression_algorithm") {
      if (!unset) {
	auto alg = Compressor::get_comp_alg_type(val);
	if (!alg) {
	  ss << "unrecognized compression_algorithm '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "compression_required_ratio") {
      if (floaterr.length()) {
	ss << "error parsing float value '" << val << "': " << floaterr;
	return -EINVAL;
      }
      if (f < 0 || f > 1) {
	ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
	return -EINVAL;
      }
    } else if (var == "csum_type") {
      auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
      if (t < 0 ) {
	ss << "unrecognized csum_type '" << val << "'";
	return -EINVAL;
      }
      //preserve csum_type numeric value
      n = t;
      interr.clear();
    } else if (var == "compression_max_blob_size" ||
	       var == "compression_min_blob_size" ||
	       var == "csum_max_block" ||
	       var == "csum_min_block") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
    }

    // store (or unset) the option according to its declared type; for
    // INT/DOUBLE a zero value unsets the option
    pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
    switch (desc.type) {
    case pool_opts_t::STR:
      if (unset) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<std::string>(val));
      }
      break;
    case pool_opts_t::INT:
      if (interr.length()) {
	ss << "error parsing integer value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (n == 0) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<int>(n));
      }
      break;
    case pool_opts_t::DOUBLE:
      if (floaterr.length()) {
	ss << "error parsing floating point value '" << val << "': " << floaterr;
	return -EINVAL;
      }
      if (f == 0) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<double>(f));
      }
      break;
    default:
      assert(!"unknown type");
    }
  } else {
    ss << "unrecognized variable '" << var << "'";
    return -EINVAL;
  }
  if (val != "unset") {
    ss << "set pool " << pool << " " << var << " to " << val;
  } else {
    ss << "unset pool " << pool << " " << var;
  }
  // stage the modified pool in the pending increment
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool] = p;
  return 0;
}
6382
c07f9fc5
FG
6383int OSDMonitor::prepare_command_pool_application(const string &prefix,
6384 map<string,cmd_vartype> &cmdmap,
6385 stringstream& ss)
6386{
6387 string pool_name;
6388 cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
6389 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6390 if (pool < 0) {
6391 ss << "unrecognized pool '" << pool_name << "'";
6392 return -ENOENT;
6393 }
6394
6395 pg_pool_t p = *osdmap.get_pg_pool(pool);
6396 if (pending_inc.new_pools.count(pool)) {
6397 p = pending_inc.new_pools[pool];
6398 }
6399
6400 string app;
6401 cmd_getval(g_ceph_context, cmdmap, "app", app);
6402 bool app_exists = (p.application_metadata.count(app) > 0);
6403
6404 if (boost::algorithm::ends_with(prefix, "enable")) {
6405 if (app.empty()) {
6406 ss << "application name must be provided";
6407 return -EINVAL;
6408 }
6409
6410 if (p.is_tier()) {
6411 ss << "application must be enabled on base tier";
6412 return -EINVAL;
6413 }
6414
6415 string force;
6416 cmd_getval(g_ceph_context, cmdmap, "force", force);
6417
6418 if (!app_exists && !p.application_metadata.empty() &&
6419 force != "--yes-i-really-mean-it") {
6420 ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
6421 << "application; pass --yes-i-really-mean-it to proceed anyway";
6422 return -EPERM;
6423 }
6424
6425 if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
6426 ss << "too many enabled applications on pool '" << pool_name << "'; "
6427 << "max " << MAX_POOL_APPLICATIONS;
6428 return -EINVAL;
6429 }
6430
6431 if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
6432 ss << "application name '" << app << "' too long; max length "
6433 << MAX_POOL_APPLICATION_LENGTH;
6434 return -EINVAL;
6435 }
6436
6437 if (!app_exists) {
6438 p.application_metadata[app] = {};
6439 }
6440 ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
6441
6442 } else if (boost::algorithm::ends_with(prefix, "disable")) {
6443 string force;
6444 cmd_getval(g_ceph_context, cmdmap, "force", force);
6445
6446 if (force != "--yes-i-really-mean-it") {
6447 ss << "Are you SURE? Disabling an application within a pool might result "
6448 << "in loss of application functionality; pass "
6449 << "--yes-i-really-mean-it to proceed anyway";
6450 return -EPERM;
6451 }
6452
6453 if (!app_exists) {
6454 ss << "application '" << app << "' is not enabled on pool '" << pool_name
6455 << "'";
6456 return 0; // idempotent
6457 }
6458
6459 p.application_metadata.erase(app);
6460 ss << "disable application '" << app << "' on pool '" << pool_name << "'";
6461
6462 } else if (boost::algorithm::ends_with(prefix, "set")) {
6463 if (p.is_tier()) {
6464 ss << "application metadata must be set on base tier";
6465 return -EINVAL;
6466 }
6467
6468 if (!app_exists) {
6469 ss << "application '" << app << "' is not enabled on pool '" << pool_name
6470 << "'";
6471 return -ENOENT;
6472 }
6473
6474 string key;
6475 cmd_getval(g_ceph_context, cmdmap, "key", key);
6476
6477 if (key.empty()) {
6478 ss << "key must be provided";
6479 return -EINVAL;
6480 }
6481
6482 auto &app_keys = p.application_metadata[app];
6483 if (app_keys.count(key) == 0 &&
6484 app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
6485 ss << "too many keys set for application '" << app << "' on pool '"
6486 << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
6487 return -EINVAL;
6488 }
6489
6490 if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
6491 ss << "key '" << app << "' too long; max length "
6492 << MAX_POOL_APPLICATION_LENGTH;
6493 return -EINVAL;
6494 }
6495
6496 string value;
6497 cmd_getval(g_ceph_context, cmdmap, "value", value);
6498 if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
6499 ss << "value '" << value << "' too long; max length "
6500 << MAX_POOL_APPLICATION_LENGTH;
6501 return -EINVAL;
6502 }
6503
6504 p.application_metadata[app][key] = value;
6505 ss << "set application '" << app << "' key '" << key << "' to '"
6506 << value << "' on pool '" << pool_name << "'";
6507 } else if (boost::algorithm::ends_with(prefix, "rm")) {
6508 if (!app_exists) {
6509 ss << "application '" << app << "' is not enabled on pool '" << pool_name
6510 << "'";
6511 return -ENOENT;
6512 }
6513
6514 string key;
6515 cmd_getval(g_ceph_context, cmdmap, "key", key);
6516 auto it = p.application_metadata[app].find(key);
6517 if (it == p.application_metadata[app].end()) {
6518 ss << "application '" << app << "' on pool '" << pool_name
6519 << "' does not have key '" << key << "'";
6520 return 0; // idempotent
6521 }
6522
6523 p.application_metadata[app].erase(it);
6524 ss << "removed application '" << app << "' key '" << key << "' on pool '"
6525 << pool_name << "'";
6526 } else {
6527 assert(false);
6528 }
6529
6530 p.last_change = pending_inc.epoch;
6531 pending_inc.new_pools[pool] = p;
6532 return 0;
6533}
6534
31f18b77
FG
6535int OSDMonitor::_prepare_command_osd_crush_remove(
6536 CrushWrapper &newcrush,
6537 int32_t id,
6538 int32_t ancestor,
6539 bool has_ancestor,
6540 bool unlink_only)
6541{
6542 int err = 0;
6543
6544 if (has_ancestor) {
6545 err = newcrush.remove_item_under(g_ceph_context, id, ancestor,
6546 unlink_only);
6547 } else {
6548 err = newcrush.remove_item(g_ceph_context, id, unlink_only);
6549 }
6550 return err;
6551}
6552
// Stage an updated crush map in the pending increment: the previous pending
// encoding (if any) is discarded and replaced wholesale with 'newcrush',
// encoded with the feature bits shared by the current monitor quorum.
void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
{
  pending_inc.crush.clear();
  newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
}
6558
6559int OSDMonitor::prepare_command_osd_crush_remove(
6560 CrushWrapper &newcrush,
6561 int32_t id,
6562 int32_t ancestor,
6563 bool has_ancestor,
6564 bool unlink_only)
6565{
6566 int err = _prepare_command_osd_crush_remove(
6567 newcrush, id, ancestor,
6568 has_ancestor, unlink_only);
6569
6570 if (err < 0)
6571 return err;
6572
6573 assert(err == 0);
6574 do_osd_crush_remove(newcrush);
6575
6576 return 0;
6577}
6578
// Stage the removal of an OSD id from the map.
//
// Returns -EBUSY if the OSD is still up (it must be stopped/marked down
// before removal), 0 otherwise.
int OSDMonitor::prepare_command_osd_remove(int32_t id)
{
  if (osdmap.is_up(id)) {
    return -EBUSY;
  }

  // NOTE(review): new_state is set to the OSD's current state bits —
  // presumably the increment applies these as a toggle, clearing them;
  // confirm against OSDMap::Incremental application logic.
  pending_inc.new_state[id] = osdmap.get_state(id);
  // Zero uuid marks the id's uuid mapping for removal.
  pending_inc.new_uuid[id] = uuid_d();
  // Drop any stored and any still-pending metadata for this OSD.
  pending_metadata_rm.insert(id);
  pending_metadata.erase(id);

  return 0;
}
6592
6593int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
6594{
6595 assert(existing_id);
6596 *existing_id = -1;
6597
6598 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
6599 if (!osdmap.exists(i) &&
6600 pending_inc.new_up_client.count(i) == 0 &&
6601 (pending_inc.new_state.count(i) == 0 ||
6602 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
6603 *existing_id = i;
6604 return -1;
6605 }
6606 }
6607
6608 if (pending_inc.new_max_osd < 0) {
6609 return osdmap.get_max_osd();
6610 }
6611 return pending_inc.new_max_osd;
6612}
6613
// Stage the creation of an OSD in the pending increment, resolving which id
// to use from the (id, uuid) pair the caller supplied.
//
// @param id     requested id, or <0 to let the monitor choose
// @param uuid   OSD uuid; may be the zero uuid for legacy `osd create`
// @param new_id out: the id that was actually used
//
// Validation is assumed to have been performed already (see
// validate_osd_create()); inconsistencies here are programmer error and
// are caught by asserts.
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  assert(new_id);

  // We presume validation has been performed prior to calling this
  // function. We assert with prejudice.

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    // If the uuid already maps to an osd, this is an idempotent re-create.
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id: either a reusable hole (existing_id) or a fresh id
  // past the current max (allocated_id) -- exactly one of the two is valid
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
           << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    assert(existing_id < osdmap.get_max_osd());
    assert(allocated_id < 0);
    // reused ids start out with weight "out"
    pending_inc.new_weight[existing_id] = CEPH_OSD_OUT;
    *new_id = existing_id;

  } else if (allocated_id >= 0) {
    assert(existing_id < 0);
    // raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    assert(*new_id == allocated_id);
  } else {
    assert(0 == "unexpected condition");
  }

out:
  dout(10) << __func__ << " using id " << *new_id << dendl;
  // make sure max_osd covers the chosen id
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  // mark the id as existing and new; record its uuid when one was given
  pending_inc.new_state[*new_id] |= CEPH_OSD_EXISTS | CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
6675
// Validate an (id, uuid) pair for `osd create` / `osd new`.
//
// Return-code protocol (note the positive value!):
//   0        - creation may proceed (nothing conflicting found)
//   EEXIST   - (positive) the uuid already maps to a compatible osd;
//              *existing_id is set and the operation is idempotent
//   -EEXIST  - uuid is already in use by a *different* id
//   -EAGAIN  - the osd is about to exist in the pending increment;
//              caller should retry after the proposal commits
//   -EINVAL  - requested id exists but does not match the uuid
int OSDMonitor::validate_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const bool check_osd_exists,
    int32_t* existing_id,
    stringstream& ss)
{

  dout(10) << __func__ << " id " << id << " uuid " << uuid
           << " check_osd_exists " << check_osd_exists << dendl;

  assert(existing_id);

  if (id < 0 && uuid.is_zero()) {
    // we have nothing to validate
    *existing_id = -1;
    return 0;
  } else if (uuid.is_zero()) {
    // we have an id but we will ignore it - because that's what
    // `osd create` does.
    return 0;
  }

  /*
   * This function will be used to validate whether we are able to
   * create a new osd when the `uuid` is specified.
   *
   * It will be used by both `osd create` and `osd new`, as the checks
   * are basically the same when it pertains to osd id and uuid validation.
   * However, `osd create` presumes an `uuid` is optional, for legacy
   * reasons, while `osd new` requires the `uuid` to be provided. This
   * means that `osd create` will not be idempotent if an `uuid` is not
   * provided, but we will always guarantee the idempotency of `osd new`.
   */

  assert(!uuid.is_zero());
  if (pending_inc.identify_osd(uuid) >= 0) {
    // osd is about to exist
    return -EAGAIN;
  }

  int32_t i = osdmap.identify_osd(uuid);
  if (i >= 0) {
    // osd already exists
    if (id >= 0 && i != id) {
      ss << "uuid " << uuid << " already in use for different id " << i;
      return -EEXIST;
    }
    // return a positive errno to distinguish between a blocking error
    // and an error we consider to not be a problem (i.e., this would be
    // an idempotent operation).
    *existing_id = i;
    return EEXIST;
  }
  // i < 0
  if (id >= 0) {
    if (pending_inc.new_state.count(id)) {
      // osd is about to exist
      return -EAGAIN;
    }
    // we may not care if an osd exists if we are recreating a previously
    // destroyed osd.
    if (check_osd_exists && osdmap.exists(id)) {
      ss << "id " << id << " already in use and does not match uuid "
         << uuid;
      return -EINVAL;
    }
  }
  return 0;
}
6746
6747int OSDMonitor::prepare_command_osd_create(
6748 const int32_t id,
6749 const uuid_d& uuid,
6750 int32_t* existing_id,
6751 stringstream& ss)
6752{
6753 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
6754 assert(existing_id);
b5b8bbf5
FG
6755 if (osdmap.is_destroyed(id)) {
6756 ss << "ceph osd create has been deprecated. Please use ceph osd new "
6757 "instead.";
6758 return -EINVAL;
6759 }
31f18b77
FG
6760
6761 if (uuid.is_zero()) {
6762 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
6763 }
6764
6765 return validate_osd_create(id, uuid, true, existing_id, ss);
6766}
6767
6768int OSDMonitor::prepare_command_osd_new(
6769 MonOpRequestRef op,
6770 const map<string,cmd_vartype>& cmdmap,
6771 const map<string,string>& secrets,
6772 stringstream &ss,
6773 Formatter *f)
6774{
6775 uuid_d uuid;
6776 string uuidstr;
6777 int64_t id = -1;
6778
6779 assert(paxos->is_plugged());
6780
6781 dout(10) << __func__ << " " << op << dendl;
6782
6783 /* validate command. abort now if something's wrong. */
6784
6785 /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
6786 *
6787 * If `id` is not specified, we will identify any existing osd based
6788 * on `uuid`. Operation will be idempotent iff secrets match.
6789 *
6790 * If `id` is specified, we will identify any existing osd based on
6791 * `uuid` and match against `id`. If they match, operation will be
6792 * idempotent iff secrets match.
6793 *
6794 * `-i secrets.json` will be optional. If supplied, will be used
6795 * to check for idempotency when `id` and `uuid` match.
6796 *
6797 * If `id` is not specified, and `uuid` does not exist, an id will
6798 * be found or allocated for the osd.
6799 *
6800 * If `id` is specified, and the osd has been previously marked
6801 * as destroyed, then the `id` will be reused.
6802 */
6803 if (!cmd_getval(g_ceph_context, cmdmap, "uuid", uuidstr)) {
6804 ss << "requires the OSD's UUID to be specified.";
6805 return -EINVAL;
6806 } else if (!uuid.parse(uuidstr.c_str())) {
6807 ss << "invalid UUID value '" << uuidstr << "'.";
6808 return -EINVAL;
6809 }
6810
6811 if (cmd_getval(g_ceph_context, cmdmap, "id", id) &&
6812 (id < 0)) {
6813 ss << "invalid OSD id; must be greater or equal than zero.";
6814 return -EINVAL;
6815 }
6816
6817 // are we running an `osd create`-like command, or recreating
6818 // a previously destroyed osd?
6819
6820 bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));
6821
6822 // we will care about `id` to assess whether osd is `destroyed`, or
6823 // to create a new osd.
6824 // we will need an `id` by the time we reach auth.
6825
6826 int32_t existing_id = -1;
6827 int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
6828 &existing_id, ss);
6829
6830 bool may_be_idempotent = false;
6831 if (err == EEXIST) {
6832 // this is idempotent from the osdmon's point-of-view
6833 may_be_idempotent = true;
6834 assert(existing_id >= 0);
6835 id = existing_id;
6836 } else if (err < 0) {
6837 return err;
6838 }
6839
6840 if (!may_be_idempotent) {
6841 // idempotency is out of the window. We are either creating a new
6842 // osd or recreating a destroyed osd.
6843 //
6844 // We now need to figure out if we have an `id` (and if it's valid),
6845 // of find an `id` if we don't have one.
6846
6847 // NOTE: we need to consider the case where the `id` is specified for
6848 // `osd create`, and we must honor it. So this means checking if
6849 // the `id` is destroyed, and if so assume the destroy; otherwise,
6850 // check if it `exists` - in which case we complain about not being
6851 // `destroyed`. In the end, if nothing fails, we must allow the
6852 // creation, so that we are compatible with `create`.
6853 if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
6854 dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
6855 ss << "OSD " << id << " has not yet been destroyed";
6856 return -EINVAL;
6857 } else if (id < 0) {
6858 // find an `id`
6859 id = _allocate_osd_id(&existing_id);
6860 if (id < 0) {
6861 assert(existing_id >= 0);
6862 id = existing_id;
6863 }
6864 dout(10) << __func__ << " found id " << id << " to use" << dendl;
6865 } else if (id >= 0 && osdmap.is_destroyed(id)) {
6866 dout(10) << __func__ << " recreating osd." << id << dendl;
6867 } else {
6868 dout(10) << __func__ << " creating new osd." << id << dendl;
6869 }
6870 } else {
6871 assert(id >= 0);
6872 assert(osdmap.exists(id));
6873 }
6874
6875 // we are now able to either create a brand new osd or reuse an existing
6876 // osd that has been previously destroyed.
6877
6878 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
6879
6880 if (may_be_idempotent && secrets.empty()) {
6881 // nothing to do, really.
6882 dout(10) << __func__ << " idempotent and no secrets -- no op." << dendl;
6883 assert(id >= 0);
6884 if (f) {
6885 f->open_object_section("created_osd");
6886 f->dump_int("osdid", id);
6887 f->close_section();
6888 } else {
6889 ss << id;
6890 }
6891 return EEXIST;
6892 }
6893
6894 string cephx_secret, lockbox_secret, dmcrypt_key;
6895 bool has_lockbox = false;
6896 bool has_secrets = (!secrets.empty());
6897
6898 ConfigKeyService *svc = nullptr;
6899 AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;
6900
6901 if (has_secrets) {
6902 if (secrets.count("cephx_secret") == 0) {
6903 ss << "requires a cephx secret.";
6904 return -EINVAL;
6905 }
6906 cephx_secret = secrets.at("cephx_secret");
6907
6908 bool has_lockbox_secret = (secrets.count("cephx_lockbox_secret") > 0);
6909 bool has_dmcrypt_key = (secrets.count("dmcrypt_key") > 0);
6910
6911 dout(10) << __func__ << " has lockbox " << has_lockbox_secret
6912 << " dmcrypt " << has_dmcrypt_key << dendl;
6913
6914 if (has_lockbox_secret && has_dmcrypt_key) {
6915 has_lockbox = true;
6916 lockbox_secret = secrets.at("cephx_lockbox_secret");
6917 dmcrypt_key = secrets.at("dmcrypt_key");
6918 } else if (!has_lockbox_secret != !has_dmcrypt_key) {
6919 ss << "requires both a cephx lockbox secret and a dm-crypt key.";
6920 return -EINVAL;
6921 }
6922
6923 dout(10) << __func__ << " validate secrets using osd id " << id << dendl;
6924
6925 err = mon->authmon()->validate_osd_new(id, uuid,
6926 cephx_secret,
6927 lockbox_secret,
6928 cephx_entity,
6929 lockbox_entity,
6930 ss);
6931 if (err < 0) {
6932 return err;
6933 } else if (may_be_idempotent && err != EEXIST) {
6934 // for this to be idempotent, `id` should already be >= 0; no need
6935 // to use validate_id.
6936 assert(id >= 0);
6937 ss << "osd." << id << " exists but secrets do not match";
6938 return -EEXIST;
6939 }
6940
6941 if (has_lockbox) {
6942 svc = (ConfigKeyService*)mon->config_key_service;
6943 err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
6944 if (err < 0) {
6945 return err;
6946 } else if (may_be_idempotent && err != EEXIST) {
6947 assert(id >= 0);
6948 ss << "osd." << id << " exists but dm-crypt key does not match.";
6949 return -EEXIST;
6950 }
6951 }
6952 }
6953 assert(!has_secrets || !cephx_secret.empty());
6954 assert(!has_lockbox || !lockbox_secret.empty());
6955
6956 if (may_be_idempotent) {
6957 // we have nothing to do for either the osdmon or the authmon,
6958 // and we have no lockbox - so the config key service will not be
6959 // touched. This is therefore an idempotent operation, and we can
6960 // just return right away.
6961 dout(10) << __func__ << " idempotent -- no op." << dendl;
6962 assert(id >= 0);
6963 if (f) {
6964 f->open_object_section("created_osd");
6965 f->dump_int("osdid", id);
6966 f->close_section();
6967 } else {
6968 ss << id;
6969 }
6970 return EEXIST;
6971 }
6972 assert(!may_be_idempotent);
6973
6974 // perform updates.
6975 if (has_secrets) {
6976 assert(!cephx_secret.empty());
6977 assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
6978 (!lockbox_secret.empty() && !dmcrypt_key.empty()));
6979
6980 err = mon->authmon()->do_osd_new(cephx_entity,
6981 lockbox_entity,
6982 has_lockbox);
6983 assert(0 == err);
6984
6985 if (has_lockbox) {
6986 assert(nullptr != svc);
6987 svc->do_osd_new(uuid, dmcrypt_key);
6988 }
6989 }
6990
6991 if (is_recreate_destroyed) {
6992 assert(id >= 0);
6993 assert(osdmap.is_destroyed(id));
6994 pending_inc.new_weight[id] = CEPH_OSD_OUT;
6995 pending_inc.new_state[id] |= CEPH_OSD_DESTROYED | CEPH_OSD_NEW;
c07f9fc5
FG
6996 if (osdmap.get_state(id) & CEPH_OSD_UP) {
6997 // due to http://tracker.ceph.com/issues/20751 some clusters may
6998 // have UP set for non-existent OSDs; make sure it is cleared
6999 // for a newly created osd.
7000 pending_inc.new_state[id] |= CEPH_OSD_UP;
7001 }
31f18b77
FG
7002 pending_inc.new_uuid[id] = uuid;
7003 } else {
7004 assert(id >= 0);
7005 int32_t new_id = -1;
7006 do_osd_create(id, uuid, &new_id);
7007 assert(new_id >= 0);
7008 assert(id == new_id);
7009 }
7010
7011 if (f) {
7012 f->open_object_section("created_osd");
7013 f->dump_int("osdid", id);
7014 f->close_section();
7015 } else {
7016 ss << id;
7017 }
7018
7019 return 0;
7020}
7021
7c673cae
FG
7022bool OSDMonitor::prepare_command(MonOpRequestRef op)
7023{
7024 op->mark_osdmon_event(__func__);
7025 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
7026 stringstream ss;
7027 map<string, cmd_vartype> cmdmap;
7028 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
7029 string rs = ss.str();
7030 mon->reply_command(op, -EINVAL, rs, get_last_committed());
7031 return true;
7032 }
7033
7034 MonSession *session = m->get_session();
7035 if (!session) {
7036 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
7037 return true;
7038 }
7039
7040 return prepare_command_impl(op, cmdmap);
7041}
7042
7043static int parse_reweights(CephContext *cct,
7044 const map<string,cmd_vartype> &cmdmap,
7045 const OSDMap& osdmap,
7046 map<int32_t, uint32_t>* weights)
7047{
7048 string weights_str;
7049 if (!cmd_getval(g_ceph_context, cmdmap, "weights", weights_str)) {
7050 return -EINVAL;
7051 }
7052 std::replace(begin(weights_str), end(weights_str), '\'', '"');
7053 json_spirit::mValue json_value;
7054 if (!json_spirit::read(weights_str, json_value)) {
7055 return -EINVAL;
7056 }
7057 if (json_value.type() != json_spirit::obj_type) {
7058 return -EINVAL;
7059 }
7060 const auto obj = json_value.get_obj();
7061 try {
7062 for (auto& osd_weight : obj) {
7063 auto osd_id = std::stoi(osd_weight.first);
7064 if (!osdmap.exists(osd_id)) {
7065 return -ENOENT;
7066 }
7067 if (osd_weight.second.type() != json_spirit::str_type) {
7068 return -EINVAL;
7069 }
7070 auto weight = std::stoul(osd_weight.second.get_str());
7071 weights->insert({osd_id, weight});
7072 }
7073 } catch (const std::logic_error& e) {
7074 return -EINVAL;
7075 }
7076 return 0;
7077}
7078
31f18b77
FG
7079int OSDMonitor::prepare_command_osd_destroy(
7080 int32_t id,
7081 stringstream& ss)
7082{
7083 assert(paxos->is_plugged());
7084
7085 // we check if the osd exists for the benefit of `osd purge`, which may
7086 // have previously removed the osd. If the osd does not exist, return
7087 // -ENOENT to convey this, and let the caller deal with it.
7088 //
7089 // we presume that all auth secrets and config keys were removed prior
7090 // to this command being called. if they exist by now, we also assume
7091 // they must have been created by some other command and do not pertain
7092 // to this non-existent osd.
7093 if (!osdmap.exists(id)) {
7094 dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
7095 return -ENOENT;
7096 }
7097
7098 uuid_d uuid = osdmap.get_uuid(id);
7099 dout(10) << __func__ << " destroying osd." << id
7100 << " uuid " << uuid << dendl;
7101
7102 // if it has been destroyed, we assume our work here is done.
7103 if (osdmap.is_destroyed(id)) {
7104 ss << "destroyed osd." << id;
7105 return 0;
7106 }
7107
7108 EntityName cephx_entity, lockbox_entity;
7109 bool idempotent_auth = false, idempotent_cks = false;
7110
7111 int err = mon->authmon()->validate_osd_destroy(id, uuid,
7112 cephx_entity,
7113 lockbox_entity,
7114 ss);
7115 if (err < 0) {
7116 if (err == -ENOENT) {
7117 idempotent_auth = true;
31f18b77
FG
7118 } else {
7119 return err;
7120 }
7121 }
7122
7123 ConfigKeyService *svc = (ConfigKeyService*)mon->config_key_service;
7124 err = svc->validate_osd_destroy(id, uuid);
7125 if (err < 0) {
7126 assert(err == -ENOENT);
7127 err = 0;
7128 idempotent_cks = true;
7129 }
7130
7131 if (!idempotent_auth) {
7132 err = mon->authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
7133 assert(0 == err);
7134 }
7135
7136 if (!idempotent_cks) {
7137 svc->do_osd_destroy(id, uuid);
7138 }
7139
7140 pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
7141 pending_inc.new_uuid[id] = uuid_d();
7142
7143 // we can only propose_pending() once per service, otherwise we'll be
7144 // defying PaxosService and all laws of nature. Therefore, as we may
7145 // be used during 'osd purge', let's keep the caller responsible for
7146 // proposing.
7147 assert(err == 0);
7148 return 0;
7149}
7150
7151int OSDMonitor::prepare_command_osd_purge(
7152 int32_t id,
7153 stringstream& ss)
7154{
7155 assert(paxos->is_plugged());
7156 dout(10) << __func__ << " purging osd." << id << dendl;
7157
7158 assert(!osdmap.is_up(id));
7159
7160 /*
7161 * This may look a bit weird, but this is what's going to happen:
7162 *
7163 * 1. we make sure that removing from crush works
7164 * 2. we call `prepare_command_osd_destroy()`. If it returns an
7165 * error, then we abort the whole operation, as no updates
7166 * have been made. However, we this function will have
7167 * side-effects, thus we need to make sure that all operations
7168 * performed henceforth will *always* succeed.
7169 * 3. we call `prepare_command_osd_remove()`. Although this
7170 * function can return an error, it currently only checks if the
7171 * osd is up - and we have made sure that it is not so, so there
7172 * is no conflict, and it is effectively an update.
7173 * 4. finally, we call `do_osd_crush_remove()`, which will perform
7174 * the crush update we delayed from before.
7175 */
7176
7177 CrushWrapper newcrush;
7178 _get_pending_crush(newcrush);
7179
7180 bool may_be_idempotent = false;
7181
7182 int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
7183 if (err == -ENOENT) {
7184 err = 0;
7185 may_be_idempotent = true;
7186 } else if (err < 0) {
7187 ss << "error removing osd." << id << " from crush";
7188 return err;
7189 }
7190
7191 // no point destroying the osd again if it has already been marked destroyed
7192 if (!osdmap.is_destroyed(id)) {
7193 err = prepare_command_osd_destroy(id, ss);
7194 if (err < 0) {
7195 if (err == -ENOENT) {
7196 err = 0;
7197 } else {
7198 return err;
7199 }
7200 } else {
7201 may_be_idempotent = false;
7202 }
7203 }
7204 assert(0 == err);
7205
7206 if (may_be_idempotent && !osdmap.exists(id)) {
7207 dout(10) << __func__ << " osd." << id << " does not exist and "
7208 << "we are idempotent." << dendl;
7209 return -ENOENT;
7210 }
7211
7212 err = prepare_command_osd_remove(id);
7213 // we should not be busy, as we should have made sure this id is not up.
7214 assert(0 == err);
7215
7216 do_osd_crush_remove(newcrush);
7217 return 0;
7218}
7219
7c673cae
FG
7220bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
7221 map<string,cmd_vartype> &cmdmap)
7222{
7223 op->mark_osdmon_event(__func__);
7224 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
7225 bool ret = false;
7226 stringstream ss;
7227 string rs;
7228 bufferlist rdata;
7229 int err = 0;
7230
7231 string format;
7232 cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
7233 boost::scoped_ptr<Formatter> f(Formatter::create(format));
7234
7235 string prefix;
7236 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
7237
7238 int64_t osdid;
7239 string name;
7240 bool osdid_present = cmd_getval(g_ceph_context, cmdmap, "id", osdid);
7241 if (osdid_present) {
7242 ostringstream oss;
7243 oss << "osd." << osdid;
7244 name = oss.str();
7245 }
7246
7247 // Even if there's a pending state with changes that could affect
7248 // a command, considering that said state isn't yet committed, we
7249 // just don't care about those changes if the command currently being
7250 // handled acts as a no-op against the current committed state.
7251 // In a nutshell, we assume this command happens *before*.
7252 //
7253 // Let me make this clearer:
7254 //
7255 // - If we have only one client, and that client issues some
7256 // operation that would conflict with this operation but is
7257 // still on the pending state, then we would be sure that said
7258 // operation wouldn't have returned yet, so the client wouldn't
7259 // issue this operation (unless the client didn't wait for the
7260 // operation to finish, and that would be the client's own fault).
7261 //
7262 // - If we have more than one client, each client will observe
7263 // whatever is the state at the moment of the commit. So, if we
7264 // have two clients, one issuing an unlink and another issuing a
7265 // link, and if the link happens while the unlink is still on the
7266 // pending state, from the link's point-of-view this is a no-op.
7267 // If different clients are issuing conflicting operations and
7268 // they care about that, then the clients should make sure they
7269 // enforce some kind of concurrency mechanism -- from our
7270 // perspective that's what Douglas Adams would call an SEP.
7271 //
7272 // This should be used as a general guideline for most commands handled
7273 // in this function. Adapt as you see fit, but please bear in mind that
7274 // this is the expected behavior.
7275
7276
7277 if (prefix == "osd setcrushmap" ||
7278 (prefix == "osd crush set" && !osdid_present)) {
31f18b77
FG
7279 if (pending_inc.crush.length()) {
7280 dout(10) << __func__ << " waiting for pending crush update " << dendl;
7281 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
7282 return true;
7283 }
7c673cae
FG
7284 dout(10) << "prepare_command setting new crush map" << dendl;
7285 bufferlist data(m->get_data());
7286 CrushWrapper crush;
7287 try {
7288 bufferlist::iterator bl(data.begin());
7289 crush.decode(bl);
7290 }
7291 catch (const std::exception &e) {
7292 err = -EINVAL;
7293 ss << "Failed to parse crushmap: " << e.what();
7294 goto reply;
7295 }
31f18b77
FG
7296
7297 int64_t prior_version = 0;
7298 if (cmd_getval(g_ceph_context, cmdmap, "prior_version", prior_version)) {
7299 if (prior_version == osdmap.get_crush_version() - 1) {
7300 // see if we are a resend of the last update. this is imperfect
7301 // (multiple racing updaters may not both get reliable success)
7302 // but we expect crush updaters (via this interface) to be rare-ish.
7303 bufferlist current, proposed;
7304 osdmap.crush->encode(current, mon->get_quorum_con_features());
7305 crush.encode(proposed, mon->get_quorum_con_features());
7306 if (current.contents_equal(proposed)) {
7307 dout(10) << __func__
7308 << " proposed matches current and version equals previous"
7309 << dendl;
7310 err = 0;
7311 ss << osdmap.get_crush_version();
7312 goto reply;
7313 }
7314 }
7315 if (prior_version != osdmap.get_crush_version()) {
7316 err = -EPERM;
7317 ss << "prior_version " << prior_version << " != crush version "
7318 << osdmap.get_crush_version();
7319 goto reply;
7320 }
7321 }
7c673cae 7322
31f18b77
FG
7323 if (crush.has_legacy_rulesets()) {
7324 err = -EINVAL;
7325 ss << "crush maps with ruleset != ruleid are no longer allowed";
7326 goto reply;
7327 }
7c673cae
FG
7328 if (!validate_crush_against_features(&crush, ss)) {
7329 err = -EINVAL;
7330 goto reply;
7331 }
31f18b77 7332
7c673cae
FG
7333 const auto& osdmap_pools = osdmap.get_pools();
7334 for (auto pit = osdmap_pools.begin(); pit != osdmap_pools.end(); ++pit) {
7335 const int64_t pool_id = pit->first;
7336 const pg_pool_t &pool = pit->second;
31f18b77 7337 int ruleno = pool.get_crush_rule();
7c673cae
FG
7338 if (!crush.rule_exists(ruleno)) {
7339 ss << " the crush rule no "<< ruleno << " for pool id " << pool_id << " is in use";
7340 err = -EINVAL;
7341 goto reply;
7342 }
7343 }
7344
224ce89b
WB
7345 if (g_conf->mon_osd_crush_smoke_test) {
7346 // sanity check: test some inputs to make sure this map isn't
7347 // totally broken
7348 dout(10) << " testing map" << dendl;
7349 stringstream ess;
7350 CrushTester tester(crush, ess);
b5b8bbf5 7351 tester.set_min_x(0);
224ce89b 7352 tester.set_max_x(50);
b5b8bbf5 7353 auto start = ceph::coarse_mono_clock::now();
224ce89b 7354 int r = tester.test_with_fork(g_conf->mon_lease);
b5b8bbf5 7355 auto duration = ceph::coarse_mono_clock::now() - start;
224ce89b
WB
7356 if (r < 0) {
7357 dout(10) << " tester.test_with_fork returns " << r
7358 << ": " << ess.str() << dendl;
7359 ss << "crush smoke test failed with " << r << ": " << ess.str();
7360 err = r;
7361 goto reply;
7362 }
b5b8bbf5
FG
7363 dout(10) << __func__ << " crush somke test duration: "
7364 << duration << ", result: " << ess.str() << dendl;
7c673cae
FG
7365 }
7366
7c673cae 7367 pending_inc.crush = data;
31f18b77 7368 ss << osdmap.get_crush_version() + 1;
7c673cae
FG
7369 goto update;
7370
7371 } else if (prefix == "osd crush set-device-class") {
224ce89b
WB
7372 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
7373 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
7374 << "luminous' before using crush device classes";
7375 err = -EPERM;
7c673cae
FG
7376 goto reply;
7377 }
7378
7379 string device_class;
7380 if (!cmd_getval(g_ceph_context, cmdmap, "class", device_class)) {
7381 err = -EINVAL; // no value!
7382 goto reply;
7383 }
7384
224ce89b
WB
7385 bool stop = false;
7386 vector<string> idvec;
7387 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
7c673cae
FG
7388 CrushWrapper newcrush;
7389 _get_pending_crush(newcrush);
224ce89b
WB
7390 set<int> updated;
7391 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
7392 set<int> osds;
7393 // wildcard?
7394 if (j == 0 &&
7395 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
7396 osdmap.get_all_osds(osds);
7397 stop = true;
7398 } else {
7399 // try traditional single osd way
7400 long osd = parse_osd_id(idvec[j].c_str(), &ss);
7401 if (osd < 0) {
7402 // ss has reason for failure
7403 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
7404 err = -EINVAL;
7405 continue;
7406 }
7407 osds.insert(osd);
7408 }
7c673cae 7409
224ce89b
WB
7410 for (auto &osd : osds) {
7411 if (!osdmap.exists(osd)) {
7412 ss << "osd." << osd << " does not exist. ";
7413 continue;
7414 }
7c673cae 7415
224ce89b
WB
7416 ostringstream oss;
7417 oss << "osd." << osd;
7418 string name = oss.str();
7c673cae 7419
224ce89b
WB
7420 string action;
7421 if (newcrush.item_exists(osd)) {
7422 action = "updating";
7423 } else {
7424 action = "creating";
7425 newcrush.set_item_name(osd, name);
7426 }
7c673cae 7427
224ce89b
WB
7428 dout(5) << action << " crush item id " << osd << " name '" << name
7429 << "' device_class '" << device_class << "'"
7430 << dendl;
7431 err = newcrush.update_device_class(osd, device_class, name, &ss);
7432 if (err < 0) {
7433 goto reply;
7434 }
7435 if (err == 0 && !_have_pending_crush()) {
7436 if (!stop) {
7437 // for single osd only, wildcard makes too much noise
7438 ss << "set-device-class item id " << osd << " name '" << name
7439 << "' device_class '" << device_class << "': no change";
7440 }
7441 } else {
7442 updated.insert(osd);
7443 }
7444 }
7c673cae
FG
7445 }
7446
224ce89b
WB
7447 if (!updated.empty()) {
7448 pending_inc.crush.clear();
7449 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7450 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
7451 getline(ss, rs);
7452 wait_for_finished_proposal(op,
7453 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
7454 return true;
7455 }
7c673cae 7456
c07f9fc5
FG
7457 } else if (prefix == "osd crush rm-device-class") {
7458 bool stop = false;
7459 vector<string> idvec;
7460 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
7461 CrushWrapper newcrush;
7462 _get_pending_crush(newcrush);
7463 set<int> updated;
7464
7465 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
7466 set<int> osds;
7467
7468 // wildcard?
7469 if (j == 0 &&
7470 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
7471 osdmap.get_all_osds(osds);
7472 stop = true;
7473 } else {
7474 // try traditional single osd way
7475 long osd = parse_osd_id(idvec[j].c_str(), &ss);
7476 if (osd < 0) {
7477 // ss has reason for failure
7478 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
7479 err = -EINVAL;
7480 goto reply;
7481 }
7482 osds.insert(osd);
7483 }
7484
7485 for (auto &osd : osds) {
7486 if (!osdmap.exists(osd)) {
7487 ss << "osd." << osd << " does not exist. ";
7488 continue;
7489 }
7490
7491 auto class_name = newcrush.get_item_class(osd);
c07f9fc5
FG
7492 if (!class_name) {
7493 ss << "osd." << osd << " belongs to no class, ";
7494 continue;
7495 }
7496 // note that we do not verify if class_is_in_use here
7497 // in case the device is misclassified and user wants
7498 // to overridely reset...
7499
7500 err = newcrush.remove_device_class(g_ceph_context, osd, &ss);
7501 if (err < 0) {
7502 // ss has reason for failure
7503 goto reply;
7504 }
7505 updated.insert(osd);
7506 }
7507 }
7508
7509 if (!updated.empty()) {
7510 pending_inc.crush.clear();
7511 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7512 ss << "done removing class of osd(s): " << updated;
7513 getline(ss, rs);
7514 wait_for_finished_proposal(op,
7515 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
7516 return true;
7517 }
35e4c445
FG
7518 } else if (prefix == "osd crush class rename") {
7519 string srcname, dstname;
7520 if (!cmd_getval(g_ceph_context, cmdmap, "srcname", srcname)) {
7521 err = -EINVAL;
7522 goto reply;
7523 }
7524 if (!cmd_getval(g_ceph_context, cmdmap, "dstname", dstname)) {
7525 err = -EINVAL;
7526 goto reply;
7527 }
7528
7529 CrushWrapper newcrush;
7530 _get_pending_crush(newcrush);
7531
7532 if (!newcrush.class_exists(srcname)) {
7533 err = -ENOENT;
7534 ss << "class '" << srcname << "' does not exist";
7535 goto reply;
7536 }
7537
7538 if (newcrush.class_exists(dstname)) {
7539 err = -EEXIST;
7540 ss << "class '" << dstname << "' already exists";
7541 goto reply;
7542 }
c07f9fc5 7543
35e4c445
FG
7544 err = newcrush.rename_class(srcname, dstname);
7545 if (err < 0) {
7546 ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
7547 << cpp_strerror(err);
7548 goto reply;
7549 }
7550
7551 pending_inc.crush.clear();
7552 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7553 ss << "rename class '" << srcname << "' to '" << dstname << "'";
7554 goto update;
7c673cae
FG
7555 } else if (prefix == "osd crush add-bucket") {
7556 // os crush add-bucket <name> <type>
7557 string name, typestr;
7558 cmd_getval(g_ceph_context, cmdmap, "name", name);
7559 cmd_getval(g_ceph_context, cmdmap, "type", typestr);
7560
7561 if (!_have_pending_crush() &&
7562 _get_stable_crush().name_exists(name)) {
7563 ss << "bucket '" << name << "' already exists";
7564 goto reply;
7565 }
7566
7567 CrushWrapper newcrush;
7568 _get_pending_crush(newcrush);
7569
7570 if (newcrush.name_exists(name)) {
7571 ss << "bucket '" << name << "' already exists";
7572 goto update;
7573 }
7574 int type = newcrush.get_type_id(typestr);
7575 if (type < 0) {
7576 ss << "type '" << typestr << "' does not exist";
7577 err = -EINVAL;
7578 goto reply;
7579 }
7580 if (type == 0) {
7581 ss << "type '" << typestr << "' is for devices, not buckets";
7582 err = -EINVAL;
7583 goto reply;
7584 }
7585 int bucketno;
7586 err = newcrush.add_bucket(0, 0,
7587 CRUSH_HASH_DEFAULT, type, 0, NULL,
7588 NULL, &bucketno);
7589 if (err < 0) {
7590 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
7591 goto reply;
7592 }
7593 err = newcrush.set_item_name(bucketno, name);
7594 if (err < 0) {
7595 ss << "error setting bucket name to '" << name << "'";
7596 goto reply;
7597 }
7598
7599 pending_inc.crush.clear();
7600 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7601 ss << "added bucket " << name << " type " << typestr
7602 << " to crush map";
7603 goto update;
7604 } else if (prefix == "osd crush rename-bucket") {
7605 string srcname, dstname;
7606 cmd_getval(g_ceph_context, cmdmap, "srcname", srcname);
7607 cmd_getval(g_ceph_context, cmdmap, "dstname", dstname);
7608
7609 err = crush_rename_bucket(srcname, dstname, &ss);
7610 if (err == -EALREADY) // equivalent to success for idempotency
7611 err = 0;
7612 if (err)
7613 goto reply;
7614 else
7615 goto update;
c07f9fc5
FG
7616 } else if (prefix == "osd crush weight-set create" ||
7617 prefix == "osd crush weight-set create-compat") {
7618 CrushWrapper newcrush;
7619 _get_pending_crush(newcrush);
7620 int64_t pool;
7621 int positions;
7622 if (newcrush.has_non_straw2_buckets()) {
7623 ss << "crush map contains one or more bucket(s) that are not straw2";
224ce89b
WB
7624 err = -EPERM;
7625 goto reply;
7626 }
c07f9fc5
FG
7627 if (prefix == "osd crush weight-set create") {
7628 if (osdmap.require_min_compat_client > 0 &&
7629 osdmap.require_min_compat_client < CEPH_RELEASE_LUMINOUS) {
7630 ss << "require_min_compat_client "
7631 << ceph_release_name(osdmap.require_min_compat_client)
7632 << " < luminous, which is required for per-pool weight-sets. "
7633 << "Try 'ceph osd set-require-min-compat-client luminous' "
7634 << "before using the new interface";
7635 err = -EPERM;
7636 goto reply;
7637 }
7638 string poolname, mode;
7639 cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
7640 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
7641 if (pool < 0) {
7642 ss << "pool '" << poolname << "' not found";
7643 err = -ENOENT;
7644 goto reply;
7645 }
7646 cmd_getval(g_ceph_context, cmdmap, "mode", mode);
7647 if (mode != "flat" && mode != "positional") {
7648 ss << "unrecognized weight-set mode '" << mode << "'";
7649 err = -EINVAL;
7650 goto reply;
7651 }
7652 positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
7653 } else {
7654 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
7655 positions = 1;
224ce89b 7656 }
c07f9fc5
FG
7657 newcrush.create_choose_args(pool, positions);
7658 pending_inc.crush.clear();
7659 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7660 goto update;
224ce89b 7661
c07f9fc5
FG
7662 } else if (prefix == "osd crush weight-set rm" ||
7663 prefix == "osd crush weight-set rm-compat") {
224ce89b
WB
7664 CrushWrapper newcrush;
7665 _get_pending_crush(newcrush);
c07f9fc5
FG
7666 int64_t pool;
7667 if (prefix == "osd crush weight-set rm") {
7668 string poolname;
7669 cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
7670 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
7671 if (pool < 0) {
7672 ss << "pool '" << poolname << "' not found";
7673 err = -ENOENT;
7674 goto reply;
7675 }
7676 } else {
7677 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
224ce89b 7678 }
c07f9fc5
FG
7679 newcrush.rm_choose_args(pool);
7680 pending_inc.crush.clear();
7681 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7682 goto update;
224ce89b 7683
c07f9fc5
FG
7684 } else if (prefix == "osd crush weight-set reweight" ||
7685 prefix == "osd crush weight-set reweight-compat") {
7686 string poolname, item;
7687 vector<double> weight;
7688 cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
7689 cmd_getval(g_ceph_context, cmdmap, "item", item);
7690 cmd_getval(g_ceph_context, cmdmap, "weight", weight);
7691 CrushWrapper newcrush;
7692 _get_pending_crush(newcrush);
7693 int64_t pool;
7694 if (prefix == "osd crush weight-set reweight") {
7695 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
7696 if (pool < 0) {
7697 ss << "pool '" << poolname << "' not found";
7698 err = -ENOENT;
7699 goto reply;
7700 }
7701 if (!newcrush.have_choose_args(pool)) {
7702 ss << "no weight-set for pool '" << poolname << "'";
7703 err = -ENOENT;
7704 goto reply;
7705 }
7706 auto arg_map = newcrush.choose_args_get(pool);
7707 int positions = newcrush.get_choose_args_positions(arg_map);
7708 if (weight.size() != (size_t)positions) {
7709 ss << "must specify exact " << positions << " weight values";
7710 err = -EINVAL;
7711 goto reply;
7712 }
7713 } else {
7714 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
7715 if (!newcrush.have_choose_args(pool)) {
7716 ss << "no backward-compatible weight-set";
7717 err = -ENOENT;
7718 goto reply;
7719 }
224ce89b 7720 }
c07f9fc5
FG
7721 if (!newcrush.name_exists(item)) {
7722 ss << "item '" << item << "' does not exist";
7723 err = -ENOENT;
224ce89b
WB
7724 goto reply;
7725 }
c07f9fc5
FG
7726 err = newcrush.choose_args_adjust_item_weightf(
7727 g_ceph_context,
7728 newcrush.choose_args_get(pool),
7729 newcrush.get_item_id(item),
7730 weight,
7731 &ss);
224ce89b 7732 if (err < 0) {
224ce89b
WB
7733 goto reply;
7734 }
c07f9fc5 7735 err = 0;
224ce89b
WB
7736 pending_inc.crush.clear();
7737 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
224ce89b 7738 goto update;
7c673cae
FG
7739 } else if (osdid_present &&
7740 (prefix == "osd crush set" || prefix == "osd crush add")) {
7741 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
7742 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
7743 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
7744
7745 if (!osdmap.exists(osdid)) {
7746 err = -ENOENT;
c07f9fc5 7747 ss << name << " does not exist. Create it before updating the crush map";
7c673cae
FG
7748 goto reply;
7749 }
7750
7751 double weight;
7752 if (!cmd_getval(g_ceph_context, cmdmap, "weight", weight)) {
7753 ss << "unable to parse weight value '"
7754 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
7755 err = -EINVAL;
7756 goto reply;
7757 }
7758
7759 string args;
7760 vector<string> argvec;
7761 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
7762 map<string,string> loc;
7763 CrushWrapper::parse_loc_map(argvec, &loc);
7764
7765 if (prefix == "osd crush set"
7766 && !_get_stable_crush().item_exists(osdid)) {
7767 err = -ENOENT;
7768 ss << "unable to set item id " << osdid << " name '" << name
7769 << "' weight " << weight << " at location " << loc
7770 << ": does not exist";
7771 goto reply;
7772 }
7773
7774 dout(5) << "adding/updating crush item id " << osdid << " name '"
7775 << name << "' weight " << weight << " at location "
7776 << loc << dendl;
7777 CrushWrapper newcrush;
7778 _get_pending_crush(newcrush);
7779
7780 string action;
7781 if (prefix == "osd crush set" ||
7782 newcrush.check_item_loc(g_ceph_context, osdid, loc, (int *)NULL)) {
7783 action = "set";
7784 err = newcrush.update_item(g_ceph_context, osdid, weight, name, loc);
7785 } else {
7786 action = "add";
7787 err = newcrush.insert_item(g_ceph_context, osdid, weight, name, loc);
7788 if (err == 0)
7789 err = 1;
7790 }
7791
7792 if (err < 0)
7793 goto reply;
7794
7795 if (err == 0 && !_have_pending_crush()) {
7796 ss << action << " item id " << osdid << " name '" << name << "' weight "
7797 << weight << " at location " << loc << ": no change";
7798 goto reply;
7799 }
7800
7801 pending_inc.crush.clear();
7802 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7803 ss << action << " item id " << osdid << " name '" << name << "' weight "
7804 << weight << " at location " << loc << " to crush map";
7805 getline(ss, rs);
7806 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7807 get_last_committed() + 1));
7808 return true;
7809
7810 } else if (prefix == "osd crush create-or-move") {
7811 do {
7812 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
7813 if (!osdmap.exists(osdid)) {
7814 err = -ENOENT;
7815 ss << name << " does not exist. create it before updating the crush map";
7816 goto reply;
7817 }
7818
7819 double weight;
7820 if (!cmd_getval(g_ceph_context, cmdmap, "weight", weight)) {
7821 ss << "unable to parse weight value '"
7822 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
7823 err = -EINVAL;
7824 goto reply;
7825 }
7826
7827 string args;
7828 vector<string> argvec;
7829 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
7830 map<string,string> loc;
7831 CrushWrapper::parse_loc_map(argvec, &loc);
7832
7833 dout(0) << "create-or-move crush item name '" << name << "' initial_weight " << weight
7834 << " at location " << loc << dendl;
7835
7836 CrushWrapper newcrush;
7837 _get_pending_crush(newcrush);
7838
7839 err = newcrush.create_or_move_item(g_ceph_context, osdid, weight, name, loc);
7840 if (err == 0) {
7841 ss << "create-or-move updated item name '" << name << "' weight " << weight
7842 << " at location " << loc << " to crush map";
7843 break;
7844 }
7845 if (err > 0) {
7846 pending_inc.crush.clear();
7847 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7848 ss << "create-or-move updating item name '" << name << "' weight " << weight
7849 << " at location " << loc << " to crush map";
7850 getline(ss, rs);
7851 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7852 get_last_committed() + 1));
7853 return true;
7854 }
7855 } while (false);
7856
7857 } else if (prefix == "osd crush move") {
7858 do {
7859 // osd crush move <name> <loc1> [<loc2> ...]
7860
7861 string args;
7862 vector<string> argvec;
7863 cmd_getval(g_ceph_context, cmdmap, "name", name);
7864 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
7865 map<string,string> loc;
7866 CrushWrapper::parse_loc_map(argvec, &loc);
7867
7868 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
7869 CrushWrapper newcrush;
7870 _get_pending_crush(newcrush);
7871
7872 if (!newcrush.name_exists(name)) {
7873 err = -ENOENT;
7874 ss << "item " << name << " does not exist";
7875 break;
7876 }
7877 int id = newcrush.get_item_id(name);
7878
7879 if (!newcrush.check_item_loc(g_ceph_context, id, loc, (int *)NULL)) {
7880 if (id >= 0) {
7881 err = newcrush.create_or_move_item(g_ceph_context, id, 0, name, loc);
7882 } else {
7883 err = newcrush.move_bucket(g_ceph_context, id, loc);
7884 }
7885 if (err >= 0) {
7886 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
7887 pending_inc.crush.clear();
7888 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7889 getline(ss, rs);
7890 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7891 get_last_committed() + 1));
7892 return true;
7893 }
7894 } else {
7895 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
7896 err = 0;
7897 }
7898 } while (false);
31f18b77
FG
7899 } else if (prefix == "osd crush swap-bucket") {
7900 string source, dest, force;
7901 cmd_getval(g_ceph_context, cmdmap, "source", source);
7902 cmd_getval(g_ceph_context, cmdmap, "dest", dest);
7903 cmd_getval(g_ceph_context, cmdmap, "force", force);
7904 CrushWrapper newcrush;
7905 _get_pending_crush(newcrush);
7906 if (!newcrush.name_exists(source)) {
7907 ss << "source item " << source << " does not exist";
7908 err = -ENOENT;
7909 goto reply;
7910 }
7911 if (!newcrush.name_exists(dest)) {
7912 ss << "dest item " << dest << " does not exist";
7913 err = -ENOENT;
7914 goto reply;
7915 }
7916 int sid = newcrush.get_item_id(source);
7917 int did = newcrush.get_item_id(dest);
7918 int sparent;
7919 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 &&
7920 force != "--yes-i-really-mean-it") {
7921 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
7922 err = -EPERM;
7923 goto reply;
7924 }
7925 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
7926 force != "--yes-i-really-mean-it") {
7927 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
7928 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
7929 << "; pass --yes-i-really-mean-it to proceed anyway";
7930 err = -EPERM;
7931 goto reply;
7932 }
7933 int r = newcrush.swap_bucket(g_ceph_context, sid, did);
7934 if (r < 0) {
7935 ss << "failed to swap bucket contents: " << cpp_strerror(r);
224ce89b 7936 err = r;
31f18b77
FG
7937 goto reply;
7938 }
7939 ss << "swapped bucket of " << source << " to " << dest;
7940 pending_inc.crush.clear();
7941 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7942 wait_for_finished_proposal(op,
7943 new Monitor::C_Command(mon, op, err, ss.str(),
7944 get_last_committed() + 1));
7945 return true;
7946 } else if (prefix == "osd crush link") {
7947 // osd crush link <name> <loc1> [<loc2> ...]
7948 string name;
7949 cmd_getval(g_ceph_context, cmdmap, "name", name);
7950 vector<string> argvec;
7951 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
7952 map<string,string> loc;
7953 CrushWrapper::parse_loc_map(argvec, &loc);
7954
7955 // Need an explicit check for name_exists because get_item_id returns
7956 // 0 on unfound.
7957 int id = osdmap.crush->get_item_id(name);
7c673cae
FG
7958 if (!osdmap.crush->name_exists(name)) {
7959 err = -ENOENT;
7960 ss << "item " << name << " does not exist";
7961 goto reply;
7962 } else {
7963 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
7964 }
7965 if (osdmap.crush->check_item_loc(g_ceph_context, id, loc, (int*) NULL)) {
7966 ss << "no need to move item id " << id << " name '" << name
7967 << "' to location " << loc << " in crush map";
7968 err = 0;
7969 goto reply;
7970 }
7971
7972 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
7973 CrushWrapper newcrush;
7974 _get_pending_crush(newcrush);
7975
7976 if (!newcrush.name_exists(name)) {
7977 err = -ENOENT;
7978 ss << "item " << name << " does not exist";
7979 goto reply;
7980 } else {
7981 int id = newcrush.get_item_id(name);
7982 if (!newcrush.check_item_loc(g_ceph_context, id, loc, (int *)NULL)) {
7983 err = newcrush.link_bucket(g_ceph_context, id, loc);
7984 if (err >= 0) {
7985 ss << "linked item id " << id << " name '" << name
7986 << "' to location " << loc << " in crush map";
7987 pending_inc.crush.clear();
7988 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7989 } else {
7990 ss << "cannot link item id " << id << " name '" << name
7991 << "' to location " << loc;
7992 goto reply;
7993 }
7994 } else {
7995 ss << "no need to move item id " << id << " name '" << name
7996 << "' to location " << loc << " in crush map";
7997 err = 0;
7998 }
7999 }
8000 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
8001 get_last_committed() + 1));
8002 return true;
8003 } else if (prefix == "osd crush rm" ||
8004 prefix == "osd crush remove" ||
8005 prefix == "osd crush unlink") {
8006 do {
8007 // osd crush rm <id> [ancestor]
8008 CrushWrapper newcrush;
8009 _get_pending_crush(newcrush);
8010
8011 string name;
8012 cmd_getval(g_ceph_context, cmdmap, "name", name);
8013
8014 if (!osdmap.crush->name_exists(name)) {
8015 err = 0;
8016 ss << "device '" << name << "' does not appear in the crush map";
8017 break;
8018 }
8019 if (!newcrush.name_exists(name)) {
8020 err = 0;
8021 ss << "device '" << name << "' does not appear in the crush map";
8022 getline(ss, rs);
8023 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8024 get_last_committed() + 1));
8025 return true;
8026 }
8027 int id = newcrush.get_item_id(name);
31f18b77
FG
8028 int ancestor = 0;
8029
7c673cae
FG
8030 bool unlink_only = prefix == "osd crush unlink";
8031 string ancestor_str;
8032 if (cmd_getval(g_ceph_context, cmdmap, "ancestor", ancestor_str)) {
8033 if (!newcrush.name_exists(ancestor_str)) {
8034 err = -ENOENT;
8035 ss << "ancestor item '" << ancestor_str
8036 << "' does not appear in the crush map";
8037 break;
8038 }
31f18b77 8039 ancestor = newcrush.get_item_id(ancestor_str);
7c673cae 8040 }
31f18b77
FG
8041
8042 err = prepare_command_osd_crush_remove(
8043 newcrush,
8044 id, ancestor,
8045 (ancestor < 0), unlink_only);
8046
7c673cae
FG
8047 if (err == -ENOENT) {
8048 ss << "item " << id << " does not appear in that position";
8049 err = 0;
8050 break;
8051 }
8052 if (err == 0) {
7c673cae
FG
8053 ss << "removed item id " << id << " name '" << name << "' from crush map";
8054 getline(ss, rs);
8055 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8056 get_last_committed() + 1));
8057 return true;
8058 }
8059 } while (false);
8060
8061 } else if (prefix == "osd crush reweight-all") {
7c673cae
FG
8062 CrushWrapper newcrush;
8063 _get_pending_crush(newcrush);
8064
8065 newcrush.reweight(g_ceph_context);
8066 pending_inc.crush.clear();
8067 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8068 ss << "reweighted crush hierarchy";
8069 getline(ss, rs);
8070 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8071 get_last_committed() + 1));
8072 return true;
8073 } else if (prefix == "osd crush reweight") {
8074 // osd crush reweight <name> <weight>
8075 CrushWrapper newcrush;
8076 _get_pending_crush(newcrush);
8077
8078 string name;
8079 cmd_getval(g_ceph_context, cmdmap, "name", name);
8080 if (!newcrush.name_exists(name)) {
8081 err = -ENOENT;
8082 ss << "device '" << name << "' does not appear in the crush map";
8083 goto reply;
8084 }
8085
8086 int id = newcrush.get_item_id(name);
8087 if (id < 0) {
8088 ss << "device '" << name << "' is not a leaf in the crush map";
8089 err = -EINVAL;
8090 goto reply;
8091 }
8092 double w;
8093 if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
8094 ss << "unable to parse weight value '"
8095 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
8096 err = -EINVAL;
8097 goto reply;
8098 }
8099
8100 err = newcrush.adjust_item_weightf(g_ceph_context, id, w);
8101 if (err < 0)
8102 goto reply;
8103 pending_inc.crush.clear();
8104 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8105 ss << "reweighted item id " << id << " name '" << name << "' to " << w
8106 << " in crush map";
8107 getline(ss, rs);
8108 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8109 get_last_committed() + 1));
8110 return true;
8111 } else if (prefix == "osd crush reweight-subtree") {
8112 // osd crush reweight <name> <weight>
8113 CrushWrapper newcrush;
8114 _get_pending_crush(newcrush);
8115
8116 string name;
8117 cmd_getval(g_ceph_context, cmdmap, "name", name);
8118 if (!newcrush.name_exists(name)) {
8119 err = -ENOENT;
8120 ss << "device '" << name << "' does not appear in the crush map";
8121 goto reply;
8122 }
8123
8124 int id = newcrush.get_item_id(name);
8125 if (id >= 0) {
8126 ss << "device '" << name << "' is not a subtree in the crush map";
8127 err = -EINVAL;
8128 goto reply;
8129 }
8130 double w;
8131 if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
8132 ss << "unable to parse weight value '"
8133 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
8134 err = -EINVAL;
8135 goto reply;
8136 }
8137
8138 err = newcrush.adjust_subtree_weightf(g_ceph_context, id, w);
8139 if (err < 0)
8140 goto reply;
8141 pending_inc.crush.clear();
8142 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8143 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
8144 << " in crush map";
8145 getline(ss, rs);
8146 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8147 get_last_committed() + 1));
8148 return true;
8149 } else if (prefix == "osd crush tunables") {
8150 CrushWrapper newcrush;
8151 _get_pending_crush(newcrush);
8152
8153 err = 0;
8154 string profile;
8155 cmd_getval(g_ceph_context, cmdmap, "profile", profile);
8156 if (profile == "legacy" || profile == "argonaut") {
8157 newcrush.set_tunables_legacy();
8158 } else if (profile == "bobtail") {
8159 newcrush.set_tunables_bobtail();
8160 } else if (profile == "firefly") {
8161 newcrush.set_tunables_firefly();
8162 } else if (profile == "hammer") {
8163 newcrush.set_tunables_hammer();
8164 } else if (profile == "jewel") {
8165 newcrush.set_tunables_jewel();
8166 } else if (profile == "optimal") {
8167 newcrush.set_tunables_optimal();
8168 } else if (profile == "default") {
8169 newcrush.set_tunables_default();
8170 } else {
8171 ss << "unrecognized profile '" << profile << "'";
8172 err = -EINVAL;
8173 goto reply;
8174 }
8175
8176 if (!validate_crush_against_features(&newcrush, ss)) {
8177 err = -EINVAL;
8178 goto reply;
8179 }
8180
8181 pending_inc.crush.clear();
8182 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8183 ss << "adjusted tunables profile to " << profile;
8184 getline(ss, rs);
8185 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8186 get_last_committed() + 1));
8187 return true;
8188 } else if (prefix == "osd crush set-tunable") {
8189 CrushWrapper newcrush;
8190 _get_pending_crush(newcrush);
8191
8192 err = 0;
8193 string tunable;
8194 cmd_getval(g_ceph_context, cmdmap, "tunable", tunable);
8195
8196 int64_t value = -1;
8197 if (!cmd_getval(g_ceph_context, cmdmap, "value", value)) {
8198 err = -EINVAL;
8199 ss << "failed to parse integer value " << cmd_vartype_stringify(cmdmap["value"]);
8200 goto reply;
8201 }
8202
8203 if (tunable == "straw_calc_version") {
224ce89b 8204 if (value != 0 && value != 1) {
7c673cae
FG
8205 ss << "value must be 0 or 1; got " << value;
8206 err = -EINVAL;
8207 goto reply;
8208 }
8209 newcrush.set_straw_calc_version(value);
8210 } else {
8211 ss << "unrecognized tunable '" << tunable << "'";
8212 err = -EINVAL;
8213 goto reply;
8214 }
8215
8216 if (!validate_crush_against_features(&newcrush, ss)) {
8217 err = -EINVAL;
8218 goto reply;
8219 }
8220
8221 pending_inc.crush.clear();
8222 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8223 ss << "adjusted tunable " << tunable << " to " << value;
8224 getline(ss, rs);
8225 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8226 get_last_committed() + 1));
8227 return true;
8228
8229 } else if (prefix == "osd crush rule create-simple") {
8230 string name, root, type, mode;
8231 cmd_getval(g_ceph_context, cmdmap, "name", name);
8232 cmd_getval(g_ceph_context, cmdmap, "root", root);
8233 cmd_getval(g_ceph_context, cmdmap, "type", type);
8234 cmd_getval(g_ceph_context, cmdmap, "mode", mode);
8235 if (mode == "")
8236 mode = "firstn";
8237
8238 if (osdmap.crush->rule_exists(name)) {
31f18b77
FG
8239 // The name is uniquely associated to a ruleid and the rule it contains
8240 // From the user point of view, the rule is more meaningfull.
8241 ss << "rule " << name << " already exists";
7c673cae
FG
8242 err = 0;
8243 goto reply;
8244 }
8245
8246 CrushWrapper newcrush;
8247 _get_pending_crush(newcrush);
8248
8249 if (newcrush.rule_exists(name)) {
31f18b77
FG
8250 // The name is uniquely associated to a ruleid and the rule it contains
8251 // From the user point of view, the rule is more meaningfull.
8252 ss << "rule " << name << " already exists";
7c673cae
FG
8253 err = 0;
8254 } else {
224ce89b 8255 int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
7c673cae
FG
8256 pg_pool_t::TYPE_REPLICATED, &ss);
8257 if (ruleno < 0) {
8258 err = ruleno;
8259 goto reply;
8260 }
8261
8262 pending_inc.crush.clear();
8263 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8264 }
8265 getline(ss, rs);
8266 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8267 get_last_committed() + 1));
8268 return true;
8269
224ce89b
WB
8270 } else if (prefix == "osd crush rule create-replicated") {
8271 string name, root, type, device_class;
8272 cmd_getval(g_ceph_context, cmdmap, "name", name);
8273 cmd_getval(g_ceph_context, cmdmap, "root", root);
8274 cmd_getval(g_ceph_context, cmdmap, "type", type);
8275 cmd_getval(g_ceph_context, cmdmap, "class", device_class);
8276
8277 if (!device_class.empty()) {
8278 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
8279 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
8280 << "luminous' before using crush device classes";
8281 err = -EPERM;
8282 goto reply;
8283 }
8284 }
8285
8286 if (osdmap.crush->rule_exists(name)) {
8287 // The name is uniquely associated to a ruleid and the rule it contains
8288 // From the user point of view, the rule is more meaningfull.
8289 ss << "rule " << name << " already exists";
8290 err = 0;
8291 goto reply;
8292 }
8293
8294 CrushWrapper newcrush;
8295 _get_pending_crush(newcrush);
8296
8297 if (newcrush.rule_exists(name)) {
8298 // The name is uniquely associated to a ruleid and the rule it contains
8299 // From the user point of view, the rule is more meaningfull.
8300 ss << "rule " << name << " already exists";
8301 err = 0;
8302 } else {
8303 int ruleno = newcrush.add_simple_rule(
8304 name, root, type, device_class,
8305 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
8306 if (ruleno < 0) {
8307 err = ruleno;
8308 goto reply;
8309 }
8310
8311 pending_inc.crush.clear();
8312 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8313 }
8314 getline(ss, rs);
8315 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8316 get_last_committed() + 1));
8317 return true;
8318
7c673cae
FG
8319 } else if (prefix == "osd erasure-code-profile rm") {
8320 string name;
8321 cmd_getval(g_ceph_context, cmdmap, "name", name);
8322
8323 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
8324 goto wait;
8325
8326 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
8327 err = -EBUSY;
8328 goto reply;
8329 }
8330
8331 if (osdmap.has_erasure_code_profile(name) ||
8332 pending_inc.new_erasure_code_profiles.count(name)) {
8333 if (osdmap.has_erasure_code_profile(name)) {
8334 pending_inc.old_erasure_code_profiles.push_back(name);
8335 } else {
8336 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
8337 pending_inc.new_erasure_code_profiles.erase(name);
8338 }
8339
8340 getline(ss, rs);
8341 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8342 get_last_committed() + 1));
8343 return true;
8344 } else {
8345 ss << "erasure-code-profile " << name << " does not exist";
8346 err = 0;
8347 goto reply;
8348 }
8349
8350 } else if (prefix == "osd erasure-code-profile set") {
8351 string name;
8352 cmd_getval(g_ceph_context, cmdmap, "name", name);
8353 vector<string> profile;
8354 cmd_getval(g_ceph_context, cmdmap, "profile", profile);
8355 bool force;
8356 if (profile.size() > 0 && profile.back() == "--force") {
8357 profile.pop_back();
8358 force = true;
8359 } else {
8360 force = false;
8361 }
8362 map<string,string> profile_map;
8363 err = parse_erasure_code_profile(profile, &profile_map, &ss);
8364 if (err)
8365 goto reply;
8366 if (profile_map.find("plugin") == profile_map.end()) {
8367 ss << "erasure-code-profile " << profile_map
8368 << " must contain a plugin entry" << std::endl;
8369 err = -EINVAL;
8370 goto reply;
8371 }
8372 string plugin = profile_map["plugin"];
8373
8374 if (pending_inc.has_erasure_code_profile(name)) {
8375 dout(20) << "erasure code profile " << name << " try again" << dendl;
8376 goto wait;
8377 } else {
8378 if (plugin == "isa" || plugin == "lrc") {
8379 err = check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2, ss);
8380 if (err == -EAGAIN)
8381 goto wait;
8382 if (err)
8383 goto reply;
8384 } else if (plugin == "shec") {
8385 err = check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3, ss);
8386 if (err == -EAGAIN)
8387 goto wait;
8388 if (err)
8389 goto reply;
8390 }
8391 err = normalize_profile(name, profile_map, force, &ss);
8392 if (err)
8393 goto reply;
8394
8395 if (osdmap.has_erasure_code_profile(name)) {
8396 ErasureCodeProfile existing_profile_map =
8397 osdmap.get_erasure_code_profile(name);
8398 err = normalize_profile(name, existing_profile_map, force, &ss);
8399 if (err)
8400 goto reply;
8401
8402 if (existing_profile_map == profile_map) {
8403 err = 0;
8404 goto reply;
8405 }
8406 if (!force) {
8407 err = -EPERM;
8408 ss << "will not override erasure code profile " << name
8409 << " because the existing profile "
8410 << existing_profile_map
8411 << " is different from the proposed profile "
8412 << profile_map;
8413 goto reply;
8414 }
8415 }
8416
8417 dout(20) << "erasure code profile set " << name << "="
8418 << profile_map << dendl;
8419 pending_inc.set_erasure_code_profile(name, profile_map);
8420 }
8421
8422 getline(ss, rs);
8423 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8424 get_last_committed() + 1));
8425 return true;
8426
8427 } else if (prefix == "osd crush rule create-erasure") {
8428 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
8429 if (err == -EAGAIN)
8430 goto wait;
8431 if (err)
8432 goto reply;
8433 string name, poolstr;
8434 cmd_getval(g_ceph_context, cmdmap, "name", name);
8435 string profile;
8436 cmd_getval(g_ceph_context, cmdmap, "profile", profile);
8437 if (profile == "")
8438 profile = "default";
8439 if (profile == "default") {
8440 if (!osdmap.has_erasure_code_profile(profile)) {
8441 if (pending_inc.has_erasure_code_profile(profile)) {
8442 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
8443 goto wait;
8444 }
8445
8446 map<string,string> profile_map;
8447 err = osdmap.get_erasure_code_profile_default(g_ceph_context,
8448 profile_map,
8449 &ss);
8450 if (err)
8451 goto reply;
8452 err = normalize_profile(name, profile_map, true, &ss);
8453 if (err)
8454 goto reply;
8455 dout(20) << "erasure code profile set " << profile << "="
8456 << profile_map << dendl;
8457 pending_inc.set_erasure_code_profile(profile, profile_map);
8458 goto wait;
8459 }
8460 }
8461
31f18b77
FG
8462 int rule;
8463 err = crush_rule_create_erasure(name, profile, &rule, &ss);
7c673cae
FG
8464 if (err < 0) {
8465 switch(err) {
8466 case -EEXIST: // return immediately
8467 ss << "rule " << name << " already exists";
8468 err = 0;
8469 goto reply;
8470 break;
8471 case -EALREADY: // wait for pending to be proposed
8472 ss << "rule " << name << " already exists";
8473 err = 0;
8474 break;
8475 default: // non recoverable error
8476 goto reply;
8477 break;
8478 }
8479 } else {
31f18b77 8480 ss << "created rule " << name << " at " << rule;
7c673cae
FG
8481 }
8482
8483 getline(ss, rs);
8484 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8485 get_last_committed() + 1));
8486 return true;
8487
8488 } else if (prefix == "osd crush rule rm") {
8489 string name;
8490 cmd_getval(g_ceph_context, cmdmap, "name", name);
8491
8492 if (!osdmap.crush->rule_exists(name)) {
8493 ss << "rule " << name << " does not exist";
8494 err = 0;
8495 goto reply;
8496 }
8497
8498 CrushWrapper newcrush;
8499 _get_pending_crush(newcrush);
8500
8501 if (!newcrush.rule_exists(name)) {
8502 ss << "rule " << name << " does not exist";
8503 err = 0;
8504 } else {
8505 int ruleno = newcrush.get_rule_id(name);
8506 assert(ruleno >= 0);
8507
8508 // make sure it is not in use.
8509 // FIXME: this is ok in some situations, but let's not bother with that
8510 // complexity now.
8511 int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
8512 if (osdmap.crush_ruleset_in_use(ruleset)) {
8513 ss << "crush ruleset " << name << " " << ruleset << " is in use";
8514 err = -EBUSY;
8515 goto reply;
8516 }
8517
8518 err = newcrush.remove_rule(ruleno);
8519 if (err < 0) {
8520 goto reply;
8521 }
8522
8523 pending_inc.crush.clear();
8524 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8525 }
8526 getline(ss, rs);
8527 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8528 get_last_committed() + 1));
8529 return true;
8530
b5b8bbf5
FG
8531 } else if (prefix == "osd crush rule rename") {
8532 string srcname;
8533 string dstname;
8534 cmd_getval(g_ceph_context, cmdmap, "srcname", srcname);
8535 cmd_getval(g_ceph_context, cmdmap, "dstname", dstname);
8536 if (srcname.empty() || dstname.empty()) {
8537 ss << "must specify both source rule name and destination rule name";
8538 err = -EINVAL;
8539 goto reply;
8540 }
8541 if (srcname == dstname) {
8542 ss << "destination rule name is equal to source rule name";
8543 err = 0;
8544 goto reply;
8545 }
8546
8547 CrushWrapper newcrush;
8548 _get_pending_crush(newcrush);
8549 err = newcrush.rename_rule(srcname, dstname, &ss);
8550 if (err < 0) {
8551 // ss has reason for failure
8552 goto reply;
8553 }
8554 pending_inc.crush.clear();
8555 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8556 getline(ss, rs);
8557 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8558 get_last_committed() + 1));
8559 return true;
8560
7c673cae
FG
8561 } else if (prefix == "osd setmaxosd") {
8562 int64_t newmax;
8563 if (!cmd_getval(g_ceph_context, cmdmap, "newmax", newmax)) {
8564 ss << "unable to parse 'newmax' value '"
8565 << cmd_vartype_stringify(cmdmap["newmax"]) << "'";
8566 err = -EINVAL;
8567 goto reply;
8568 }
8569
8570 if (newmax > g_conf->mon_max_osd) {
8571 err = -ERANGE;
8572 ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
8573 << g_conf->mon_max_osd << ")";
8574 goto reply;
8575 }
8576
8577 // Don't allow shrinking OSD number as this will cause data loss
8578 // and may cause kernel crashes.
8579 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
8580 if (newmax < osdmap.get_max_osd()) {
8581 // Check if the OSDs exist between current max and new value.
8582 // If there are any OSDs exist, then don't allow shrinking number
8583 // of OSDs.
8584 for (int i = newmax; i < osdmap.get_max_osd(); i++) {
8585 if (osdmap.exists(i)) {
8586 err = -EBUSY;
8587 ss << "cannot shrink max_osd to " << newmax
8588 << " because osd." << i << " (and possibly others) still in use";
8589 goto reply;
8590 }
8591 }
8592 }
8593
8594 pending_inc.new_max_osd = newmax;
8595 ss << "set new max_osd = " << pending_inc.new_max_osd;
8596 getline(ss, rs);
8597 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8598 get_last_committed() + 1));
8599 return true;
8600
8601 } else if (prefix == "osd set-full-ratio" ||
8602 prefix == "osd set-backfillfull-ratio" ||
8603 prefix == "osd set-nearfull-ratio") {
31f18b77 8604 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
224ce89b
WB
8605 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
8606 << "luminous' before using the new interface";
7c673cae
FG
8607 err = -EPERM;
8608 goto reply;
8609 }
8610 double n;
8611 if (!cmd_getval(g_ceph_context, cmdmap, "ratio", n)) {
8612 ss << "unable to parse 'ratio' value '"
224ce89b 8613 << cmd_vartype_stringify(cmdmap["ratio"]) << "'";
7c673cae
FG
8614 err = -EINVAL;
8615 goto reply;
8616 }
8617 if (prefix == "osd set-full-ratio")
8618 pending_inc.new_full_ratio = n;
8619 else if (prefix == "osd set-backfillfull-ratio")
8620 pending_inc.new_backfillfull_ratio = n;
8621 else if (prefix == "osd set-nearfull-ratio")
8622 pending_inc.new_nearfull_ratio = n;
8623 ss << prefix << " " << n;
8624 getline(ss, rs);
8625 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8626 get_last_committed() + 1));
8627 return true;
8628 } else if (prefix == "osd set-require-min-compat-client") {
31f18b77 8629 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
224ce89b
WB
8630 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
8631 << "luminous' before using the new interface";
7c673cae
FG
8632 err = -EPERM;
8633 goto reply;
8634 }
8635 string v;
8636 cmd_getval(g_ceph_context, cmdmap, "version", v);
31f18b77
FG
8637 int vno = ceph_release_from_name(v.c_str());
8638 if (vno <= 0) {
7c673cae
FG
8639 ss << "version " << v << " is not recognized";
8640 err = -EINVAL;
8641 goto reply;
8642 }
8643 OSDMap newmap;
8644 newmap.deepish_copy_from(osdmap);
8645 newmap.apply_incremental(pending_inc);
31f18b77
FG
8646 newmap.require_min_compat_client = vno;
8647 auto mvno = newmap.get_min_compat_client();
8648 if (vno < mvno) {
8649 ss << "osdmap current utilizes features that require "
8650 << ceph_release_name(mvno)
8651 << "; cannot set require_min_compat_client below that to "
8652 << ceph_release_name(vno);
7c673cae
FG
8653 err = -EPERM;
8654 goto reply;
8655 }
31f18b77
FG
8656 string sure;
8657 cmd_getval(g_ceph_context, cmdmap, "sure", sure);
8658 if (sure != "--yes-i-really-mean-it") {
8659 FeatureMap m;
8660 mon->get_combined_feature_map(&m);
8661 uint64_t features = ceph_release_features(vno);
8662 bool first = true;
8663 bool ok = true;
8664 for (int type : {
8665 CEPH_ENTITY_TYPE_CLIENT,
8666 CEPH_ENTITY_TYPE_MDS,
8667 CEPH_ENTITY_TYPE_MGR }) {
8668 auto p = m.m.find(type);
8669 if (p == m.m.end()) {
8670 continue;
8671 }
8672 for (auto& q : p->second) {
8673 uint64_t missing = ~q.first & features;
8674 if (missing) {
8675 if (first) {
8676 ss << "cannot set require_min_compat_client to " << v << ": ";
8677 } else {
8678 ss << "; ";
8679 }
8680 first = false;
8681 ss << q.second << " connected " << ceph_entity_type_name(type)
8682 << "(s) look like " << ceph_release_name(
8683 ceph_release_from_features(q.first))
8684 << " (missing 0x" << std::hex << missing << std::dec << ")";
8685 ok = false;
8686 }
8687 }
8688 }
8689 if (!ok) {
8690 ss << "; add --yes-i-really-mean-it to do it anyway";
8691 err = -EPERM;
8692 goto reply;
8693 }
8694 }
8695 ss << "set require_min_compat_client to " << ceph_release_name(vno);
8696 pending_inc.new_require_min_compat_client = vno;
7c673cae
FG
8697 getline(ss, rs);
8698 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8699 get_last_committed() + 1));
8700 return true;
8701 } else if (prefix == "osd pause") {
8702 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
8703
8704 } else if (prefix == "osd unpause") {
8705 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
8706
8707 } else if (prefix == "osd set") {
8708 string key;
8709 cmd_getval(g_ceph_context, cmdmap, "key", key);
8710 if (key == "full")
8711 return prepare_set_flag(op, CEPH_OSDMAP_FULL);
8712 else if (key == "pause")
8713 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
8714 else if (key == "noup")
8715 return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
8716 else if (key == "nodown")
8717 return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
8718 else if (key == "noout")
8719 return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
8720 else if (key == "noin")
8721 return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
8722 else if (key == "nobackfill")
8723 return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
8724 else if (key == "norebalance")
8725 return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
8726 else if (key == "norecover")
8727 return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
8728 else if (key == "noscrub")
8729 return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
8730 else if (key == "nodeep-scrub")
8731 return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
8732 else if (key == "notieragent")
8733 return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
8734 else if (key == "sortbitwise") {
8735 if (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT) {
8736 return prepare_set_flag(op, CEPH_OSDMAP_SORTBITWISE);
8737 } else {
8738 ss << "not all up OSDs have OSD_BITWISE_HOBJ_SORT feature";
8739 err = -EPERM;
31f18b77 8740 goto reply;
7c673cae 8741 }
c07f9fc5
FG
8742 } else if (key == "recovery_deletes") {
8743 if (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_RECOVERY_DELETES)) {
8744 return prepare_set_flag(op, CEPH_OSDMAP_RECOVERY_DELETES);
8745 } else {
8746 ss << "not all up OSDs have OSD_RECOVERY_DELETES feature";
8747 err = -EPERM;
8748 goto reply;
8749 }
7c673cae
FG
8750 } else if (key == "require_jewel_osds") {
8751 if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
8752 ss << "the sortbitwise flag must be set before require_jewel_osds";
8753 err = -EPERM;
31f18b77
FG
8754 goto reply;
8755 } else if (osdmap.require_osd_release >= CEPH_RELEASE_JEWEL) {
8756 ss << "require_osd_release is already >= jewel";
8757 err = 0;
8758 goto reply;
7c673cae
FG
8759 } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_JEWEL)) {
8760 return prepare_set_flag(op, CEPH_OSDMAP_REQUIRE_JEWEL);
8761 } else {
8762 ss << "not all up OSDs have CEPH_FEATURE_SERVER_JEWEL feature";
8763 err = -EPERM;
8764 }
8765 } else if (key == "require_kraken_osds") {
8766 if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
8767 ss << "the sortbitwise flag must be set before require_kraken_osds";
8768 err = -EPERM;
31f18b77
FG
8769 goto reply;
8770 } else if (osdmap.require_osd_release >= CEPH_RELEASE_KRAKEN) {
8771 ss << "require_osd_release is already >= kraken";
8772 err = 0;
8773 goto reply;
7c673cae
FG
8774 } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_KRAKEN)) {
8775 bool r = prepare_set_flag(op, CEPH_OSDMAP_REQUIRE_KRAKEN);
8776 // ensure JEWEL is also set
8777 pending_inc.new_flags |= CEPH_OSDMAP_REQUIRE_JEWEL;
8778 return r;
8779 } else {
8780 ss << "not all up OSDs have CEPH_FEATURE_SERVER_KRAKEN feature";
8781 err = -EPERM;
8782 }
7c673cae
FG
8783 } else {
8784 ss << "unrecognized flag '" << key << "'";
8785 err = -EINVAL;
8786 }
8787
8788 } else if (prefix == "osd unset") {
8789 string key;
8790 cmd_getval(g_ceph_context, cmdmap, "key", key);
8791 if (key == "full")
8792 return prepare_unset_flag(op, CEPH_OSDMAP_FULL);
8793 else if (key == "pause")
8794 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
8795 else if (key == "noup")
8796 return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
8797 else if (key == "nodown")
8798 return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
8799 else if (key == "noout")
8800 return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
8801 else if (key == "noin")
8802 return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
8803 else if (key == "nobackfill")
8804 return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
8805 else if (key == "norebalance")
8806 return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
8807 else if (key == "norecover")
8808 return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
8809 else if (key == "noscrub")
8810 return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
8811 else if (key == "nodeep-scrub")
8812 return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
8813 else if (key == "notieragent")
8814 return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
224ce89b 8815 else {
7c673cae
FG
8816 ss << "unrecognized flag '" << key << "'";
8817 err = -EINVAL;
8818 }
8819
31f18b77
FG
8820 } else if (prefix == "osd require-osd-release") {
8821 string release;
8822 cmd_getval(g_ceph_context, cmdmap, "release", release);
8823 if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
8824 ss << "the sortbitwise flag must be set first";
8825 err = -EPERM;
8826 goto reply;
8827 }
8828 int rel = ceph_release_from_name(release.c_str());
8829 if (rel <= 0) {
8830 ss << "unrecognized release " << release;
8831 err = -EINVAL;
8832 goto reply;
8833 }
8834 if (rel < CEPH_RELEASE_LUMINOUS) {
8835 ss << "use this command only for luminous and later";
8836 err = -EINVAL;
8837 goto reply;
8838 }
d2e6a577
FG
8839 if (rel == osdmap.require_osd_release) {
8840 // idempotent
8841 err = 0;
8842 goto reply;
8843 }
31f18b77
FG
8844 if (rel == CEPH_RELEASE_LUMINOUS) {
8845 if (!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_LUMINOUS)) {
8846 ss << "not all up OSDs have CEPH_FEATURE_SERVER_LUMINOUS feature";
8847 err = -EPERM;
8848 goto reply;
8849 }
8850 } else {
8851 ss << "not supported for this release yet";
8852 err = -EPERM;
8853 goto reply;
8854 }
8855 if (rel < osdmap.require_osd_release) {
8856 ss << "require_osd_release cannot be lowered once it has been set";
8857 err = -EPERM;
8858 goto reply;
8859 }
8860 pending_inc.new_require_osd_release = rel;
c07f9fc5
FG
8861 if (rel >= CEPH_RELEASE_LUMINOUS &&
8862 !osdmap.test_flag(CEPH_OSDMAP_RECOVERY_DELETES)) {
8863 return prepare_set_flag(op, CEPH_OSDMAP_RECOVERY_DELETES);
8864 }
31f18b77 8865 goto update;
7c673cae
FG
8866 } else if (prefix == "osd cluster_snap") {
8867 // ** DISABLE THIS FOR NOW **
8868 ss << "cluster snapshot currently disabled (broken implementation)";
8869 // ** DISABLE THIS FOR NOW **
8870
8871 } else if (prefix == "osd down" ||
8872 prefix == "osd out" ||
8873 prefix == "osd in" ||
8874 prefix == "osd rm") {
8875
8876 bool any = false;
31f18b77
FG
8877 bool stop = false;
8878 bool verbose = true;
7c673cae
FG
8879
8880 vector<string> idvec;
8881 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
31f18b77
FG
8882 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
8883 set<int> osds;
8884
8885 // wildcard?
8886 if (j == 0 &&
8887 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
8888 if (prefix == "osd in") {
8889 // touch out osds only
8890 osdmap.get_out_osds(osds);
8891 } else {
8892 osdmap.get_all_osds(osds);
8893 }
8894 stop = true;
8895 verbose = false; // so the output is less noisy.
8896 } else {
8897 long osd = parse_osd_id(idvec[j].c_str(), &ss);
8898 if (osd < 0) {
8899 ss << "invalid osd id" << osd;
8900 err = -EINVAL;
8901 continue;
8902 } else if (!osdmap.exists(osd)) {
8903 ss << "osd." << osd << " does not exist. ";
8904 continue;
8905 }
8906
8907 osds.insert(osd);
7c673cae 8908 }
31f18b77
FG
8909
8910 for (auto &osd : osds) {
8911 if (prefix == "osd down") {
8912 if (osdmap.is_down(osd)) {
8913 if (verbose)
8914 ss << "osd." << osd << " is already down. ";
8915 } else {
8916 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
8917 ss << "marked down osd." << osd << ". ";
8918 any = true;
8919 }
8920 } else if (prefix == "osd out") {
8921 if (osdmap.is_out(osd)) {
8922 if (verbose)
8923 ss << "osd." << osd << " is already out. ";
8924 } else {
8925 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
8926 if (osdmap.osd_weight[osd]) {
8927 if (pending_inc.new_xinfo.count(osd) == 0) {
8928 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
8929 }
8930 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
7c673cae 8931 }
31f18b77 8932 ss << "marked out osd." << osd << ". ";
224ce89b
WB
8933 std::ostringstream msg;
8934 msg << "Client " << op->get_session()->entity_name
8935 << " marked osd." << osd << " out";
8936 if (osdmap.is_up(osd)) {
8937 msg << ", while it was still marked up";
8938 } else {
8939 msg << ", after it was down for " << int(down_pending_out[osd].sec())
8940 << " seconds";
8941 }
8942
8943 mon->clog->info() << msg.str();
31f18b77 8944 any = true;
7c673cae 8945 }
31f18b77
FG
8946 } else if (prefix == "osd in") {
8947 if (osdmap.is_in(osd)) {
8948 if (verbose)
8949 ss << "osd." << osd << " is already in. ";
8950 } else {
8951 if (osdmap.osd_xinfo[osd].old_weight > 0) {
8952 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
8953 if (pending_inc.new_xinfo.count(osd) == 0) {
8954 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
8955 }
8956 pending_inc.new_xinfo[osd].old_weight = 0;
8957 } else {
8958 pending_inc.new_weight[osd] = CEPH_OSD_IN;
7c673cae 8959 }
31f18b77
FG
8960 ss << "marked in osd." << osd << ". ";
8961 any = true;
8962 }
8963 } else if (prefix == "osd rm") {
8964 err = prepare_command_osd_remove(osd);
8965
8966 if (err == -EBUSY) {
8967 if (any)
8968 ss << ", ";
8969 ss << "osd." << osd << " is still up; must be down before removal. ";
7c673cae 8970 } else {
31f18b77
FG
8971 assert(err == 0);
8972 if (any) {
8973 ss << ", osd." << osd;
8974 } else {
8975 ss << "removed osd." << osd;
8976 }
8977 any = true;
7c673cae 8978 }
31f18b77
FG
8979 }
8980 }
8981 }
8982 if (any) {
8983 getline(ss, rs);
8984 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
8985 get_last_committed() + 1));
8986 return true;
8987 }
8988 } else if (prefix == "osd add-noup" ||
8989 prefix == "osd add-nodown" ||
8990 prefix == "osd add-noin" ||
8991 prefix == "osd add-noout") {
8992
8993 enum {
8994 OP_NOUP,
8995 OP_NODOWN,
8996 OP_NOIN,
8997 OP_NOOUT,
8998 } option;
8999
9000 if (prefix == "osd add-noup") {
9001 option = OP_NOUP;
9002 } else if (prefix == "osd add-nodown") {
9003 option = OP_NODOWN;
9004 } else if (prefix == "osd add-noin") {
9005 option = OP_NOIN;
9006 } else {
9007 option = OP_NOOUT;
9008 }
9009
9010 bool any = false;
9011 bool stop = false;
9012
9013 vector<string> idvec;
9014 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
9015 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9016
9017 set<int> osds;
9018
9019 // wildcard?
9020 if (j == 0 &&
9021 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9022 osdmap.get_all_osds(osds);
9023 stop = true;
9024 } else {
9025 // try traditional single osd way
9026
9027 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9028 if (osd < 0) {
9029 // ss has reason for failure
9030 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9031 err = -EINVAL;
9032 continue;
9033 }
9034
9035 osds.insert(osd);
9036 }
9037
9038 for (auto &osd : osds) {
9039
9040 if (!osdmap.exists(osd)) {
9041 ss << "osd." << osd << " does not exist. ";
9042 continue;
9043 }
9044
9045 switch (option) {
9046 case OP_NOUP:
9047 if (osdmap.is_up(osd)) {
9048 ss << "osd." << osd << " is already up. ";
9049 continue;
9050 }
9051
9052 if (osdmap.is_noup(osd)) {
9053 if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP))
9054 any = true;
7c673cae 9055 } else {
31f18b77
FG
9056 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
9057 any = true;
7c673cae 9058 }
31f18b77
FG
9059
9060 break;
9061
9062 case OP_NODOWN:
9063 if (osdmap.is_down(osd)) {
9064 ss << "osd." << osd << " is already down. ";
9065 continue;
9066 }
9067
9068 if (osdmap.is_nodown(osd)) {
9069 if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN))
9070 any = true;
9071 } else {
9072 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
9073 any = true;
9074 }
9075
9076 break;
9077
9078 case OP_NOIN:
9079 if (osdmap.is_in(osd)) {
9080 ss << "osd." << osd << " is already in. ";
9081 continue;
9082 }
9083
9084 if (osdmap.is_noin(osd)) {
9085 if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN))
9086 any = true;
9087 } else {
9088 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
9089 any = true;
9090 }
9091
9092 break;
9093
9094 case OP_NOOUT:
9095 if (osdmap.is_out(osd)) {
9096 ss << "osd." << osd << " is already out. ";
9097 continue;
9098 }
9099
9100 if (osdmap.is_noout(osd)) {
9101 if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT))
9102 any = true;
9103 } else {
9104 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
9105 any = true;
9106 }
9107
9108 break;
9109
9110 default:
9111 assert(0 == "invalid option");
9112 }
7c673cae
FG
9113 }
9114 }
31f18b77 9115
7c673cae
FG
9116 if (any) {
9117 getline(ss, rs);
9118 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
31f18b77
FG
9119 get_last_committed() + 1));
9120 return true;
9121 }
9122 } else if (prefix == "osd rm-noup" ||
9123 prefix == "osd rm-nodown" ||
9124 prefix == "osd rm-noin" ||
9125 prefix == "osd rm-noout") {
9126
9127 enum {
9128 OP_NOUP,
9129 OP_NODOWN,
9130 OP_NOIN,
9131 OP_NOOUT,
9132 } option;
9133
9134 if (prefix == "osd rm-noup") {
9135 option = OP_NOUP;
9136 } else if (prefix == "osd rm-nodown") {
9137 option = OP_NODOWN;
9138 } else if (prefix == "osd rm-noin") {
9139 option = OP_NOIN;
9140 } else {
9141 option = OP_NOOUT;
9142 }
9143
9144 bool any = false;
9145 bool stop = false;
9146
9147 vector<string> idvec;
9148 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
9149
9150 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9151
9152 vector<int> osds;
9153
9154 // wildcard?
9155 if (j == 0 &&
9156 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9157
9158 // touch previous noup/nodown/noin/noout osds only
9159 switch (option) {
9160 case OP_NOUP:
9161 osdmap.get_noup_osds(&osds);
9162 break;
9163 case OP_NODOWN:
9164 osdmap.get_nodown_osds(&osds);
9165 break;
9166 case OP_NOIN:
9167 osdmap.get_noin_osds(&osds);
9168 break;
9169 case OP_NOOUT:
9170 osdmap.get_noout_osds(&osds);
9171 break;
9172 default:
9173 assert(0 == "invalid option");
9174 }
9175
9176 // cancel any pending noup/nodown/noin/noout requests too
9177 vector<int> pending_state_osds;
9178 (void) pending_inc.get_pending_state_osds(&pending_state_osds);
9179 for (auto &p : pending_state_osds) {
9180
9181 switch (option) {
9182 case OP_NOUP:
9183 if (!osdmap.is_noup(p) &&
9184 pending_inc.pending_osd_state_clear(p, CEPH_OSD_NOUP)) {
9185 any = true;
9186 }
9187 break;
9188
9189 case OP_NODOWN:
9190 if (!osdmap.is_nodown(p) &&
9191 pending_inc.pending_osd_state_clear(p, CEPH_OSD_NODOWN)) {
9192 any = true;
9193 }
9194 break;
9195
9196 case OP_NOIN:
9197 if (!osdmap.is_noin(p) &&
9198 pending_inc.pending_osd_state_clear(p, CEPH_OSD_NOIN)) {
9199 any = true;
9200 }
9201 break;
9202
9203 case OP_NOOUT:
9204 if (!osdmap.is_noout(p) &&
9205 pending_inc.pending_osd_state_clear(p, CEPH_OSD_NOOUT)) {
9206 any = true;
9207 }
9208 break;
9209
9210 default:
9211 assert(0 == "invalid option");
9212 }
9213 }
9214
9215 stop = true;
9216 } else {
9217 // try traditional single osd way
9218
9219 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9220 if (osd < 0) {
9221 // ss has reason for failure
9222 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9223 err = -EINVAL;
9224 continue;
9225 }
9226
9227 osds.push_back(osd);
9228 }
9229
9230 for (auto &osd : osds) {
9231
9232 if (!osdmap.exists(osd)) {
9233 ss << "osd." << osd << " does not exist. ";
9234 continue;
9235 }
9236
9237 switch (option) {
9238 case OP_NOUP:
9239 if (osdmap.is_noup(osd)) {
9240 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
9241 any = true;
9242 } else if (pending_inc.pending_osd_state_clear(
9243 osd, CEPH_OSD_NOUP)) {
9244 any = true;
9245 }
9246 break;
9247
9248 case OP_NODOWN:
9249 if (osdmap.is_nodown(osd)) {
9250 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
9251 any = true;
9252 } else if (pending_inc.pending_osd_state_clear(
9253 osd, CEPH_OSD_NODOWN)) {
9254 any = true;
9255 }
9256 break;
9257
9258 case OP_NOIN:
9259 if (osdmap.is_noin(osd)) {
9260 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
9261 any = true;
9262 } else if (pending_inc.pending_osd_state_clear(
9263 osd, CEPH_OSD_NOIN)) {
9264 any = true;
9265 }
9266 break;
9267
9268 case OP_NOOUT:
9269 if (osdmap.is_noout(osd)) {
9270 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
9271 any = true;
9272 } else if (pending_inc.pending_osd_state_clear(
9273 osd, CEPH_OSD_NOOUT)) {
9274 any = true;
9275 }
9276 break;
9277
9278 default:
9279 assert(0 == "invalid option");
9280 }
9281 }
9282 }
9283
9284 if (any) {
9285 getline(ss, rs);
9286 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
9287 get_last_committed() + 1));
7c673cae
FG
9288 return true;
9289 }
9290 } else if (prefix == "osd pg-temp") {
9291 string pgidstr;
9292 if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
9293 ss << "unable to parse 'pgid' value '"
9294 << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
9295 err = -EINVAL;
9296 goto reply;
9297 }
9298 pg_t pgid;
9299 if (!pgid.parse(pgidstr.c_str())) {
9300 ss << "invalid pgid '" << pgidstr << "'";
9301 err = -EINVAL;
9302 goto reply;
9303 }
9304 if (!osdmap.pg_exists(pgid)) {
9305 ss << "pg " << pgid << " does not exist";
9306 err = -ENOENT;
9307 goto reply;
9308 }
9309 if (pending_inc.new_pg_temp.count(pgid)) {
9310 dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
9311 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9312 return true;
9313 }
9314
9315 vector<int64_t> id_vec;
9316 vector<int32_t> new_pg_temp;
9317 if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
9318 ss << "unable to parse 'id' value(s) '"
9319 << cmd_vartype_stringify(cmdmap["id"]) << "'";
9320 err = -EINVAL;
9321 goto reply;
9322 }
9323 for (auto osd : id_vec) {
9324 if (!osdmap.exists(osd)) {
9325 ss << "osd." << osd << " does not exist";
9326 err = -ENOENT;
9327 goto reply;
9328 }
9329 new_pg_temp.push_back(osd);
9330 }
9331
224ce89b
WB
9332 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
9333 if ((int)new_pg_temp.size() < pool_min_size) {
9334 ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
9335 << pool_min_size << ")";
9336 err = -EINVAL;
9337 goto reply;
9338 }
9339
9340 int pool_size = osdmap.get_pg_pool_size(pgid);
9341 if ((int)new_pg_temp.size() > pool_size) {
9342 ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
9343 << pool_size << ")";
9344 err = -EINVAL;
9345 goto reply;
9346 }
9347
7c673cae
FG
9348 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
9349 new_pg_temp.begin(), new_pg_temp.end());
9350 ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
9351 goto update;
9352 } else if (prefix == "osd primary-temp") {
9353 string pgidstr;
9354 if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
9355 ss << "unable to parse 'pgid' value '"
9356 << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
9357 err = -EINVAL;
9358 goto reply;
9359 }
9360 pg_t pgid;
9361 if (!pgid.parse(pgidstr.c_str())) {
9362 ss << "invalid pgid '" << pgidstr << "'";
9363 err = -EINVAL;
9364 goto reply;
9365 }
9366 if (!osdmap.pg_exists(pgid)) {
9367 ss << "pg " << pgid << " does not exist";
9368 err = -ENOENT;
9369 goto reply;
9370 }
9371
9372 int64_t osd;
9373 if (!cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
9374 ss << "unable to parse 'id' value '"
9375 << cmd_vartype_stringify(cmdmap["id"]) << "'";
9376 err = -EINVAL;
9377 goto reply;
9378 }
9379 if (osd != -1 && !osdmap.exists(osd)) {
9380 ss << "osd." << osd << " does not exist";
9381 err = -ENOENT;
9382 goto reply;
9383 }
9384
31f18b77
FG
9385 if (osdmap.require_min_compat_client > 0 &&
9386 osdmap.require_min_compat_client < CEPH_RELEASE_FIREFLY) {
9387 ss << "require_min_compat_client "
9388 << ceph_release_name(osdmap.require_min_compat_client)
7c673cae
FG
9389 << " < firefly, which is required for primary-temp";
9390 err = -EPERM;
9391 goto reply;
9392 } else if (!g_conf->mon_osd_allow_primary_temp) {
9393 ss << "you must enable 'mon osd allow primary temp = true' on the mons before you can set primary_temp mappings. note that this is for developers only: older clients/OSDs will break and there is no feature bit infrastructure in place.";
9394 err = -EPERM;
9395 goto reply;
9396 }
9397
9398 pending_inc.new_primary_temp[pgid] = osd;
9399 ss << "set " << pgid << " primary_temp mapping to " << osd;
9400 goto update;
224ce89b
WB
9401 } else if (prefix == "osd pg-upmap" ||
9402 prefix == "osd rm-pg-upmap" ||
9403 prefix == "osd pg-upmap-items" ||
9404 prefix == "osd rm-pg-upmap-items") {
31f18b77 9405 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
224ce89b
WB
9406 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
9407 << "luminous' before using the new interface";
7c673cae
FG
9408 err = -EPERM;
9409 goto reply;
9410 }
31f18b77
FG
9411 if (osdmap.require_min_compat_client < CEPH_RELEASE_LUMINOUS) {
9412 ss << "min_compat_client "
9413 << ceph_release_name(osdmap.require_min_compat_client)
224ce89b
WB
9414 << " < luminous, which is required for pg-upmap. "
9415 << "Try 'ceph osd set-require-min-compat-client luminous' "
9416 << "before using the new interface";
7c673cae
FG
9417 err = -EPERM;
9418 goto reply;
9419 }
9420 err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
9421 if (err == -EAGAIN)
9422 goto wait;
9423 if (err < 0)
9424 goto reply;
9425 string pgidstr;
9426 if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
9427 ss << "unable to parse 'pgid' value '"
9428 << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
9429 err = -EINVAL;
9430 goto reply;
9431 }
9432 pg_t pgid;
9433 if (!pgid.parse(pgidstr.c_str())) {
9434 ss << "invalid pgid '" << pgidstr << "'";
9435 err = -EINVAL;
9436 goto reply;
9437 }
9438 if (!osdmap.pg_exists(pgid)) {
9439 ss << "pg " << pgid << " does not exist";
9440 err = -ENOENT;
9441 goto reply;
9442 }
224ce89b
WB
9443
9444 enum {
9445 OP_PG_UPMAP,
9446 OP_RM_PG_UPMAP,
9447 OP_PG_UPMAP_ITEMS,
9448 OP_RM_PG_UPMAP_ITEMS,
9449 } option;
9450
9451 if (prefix == "osd pg-upmap") {
9452 option = OP_PG_UPMAP;
9453 } else if (prefix == "osd rm-pg-upmap") {
9454 option = OP_RM_PG_UPMAP;
9455 } else if (prefix == "osd pg-upmap-items") {
9456 option = OP_PG_UPMAP_ITEMS;
9457 } else {
9458 option = OP_RM_PG_UPMAP_ITEMS;
7c673cae 9459 }
224ce89b
WB
9460
9461 // check pending upmap changes
9462 switch (option) {
9463 case OP_PG_UPMAP: // fall through
9464 case OP_RM_PG_UPMAP:
9465 if (pending_inc.new_pg_upmap.count(pgid) ||
9466 pending_inc.old_pg_upmap.count(pgid)) {
9467 dout(10) << __func__ << " waiting for pending update on "
9468 << pgid << dendl;
9469 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9470 return true;
7c673cae 9471 }
224ce89b 9472 break;
7c673cae 9473
224ce89b
WB
9474 case OP_PG_UPMAP_ITEMS: // fall through
9475 case OP_RM_PG_UPMAP_ITEMS:
9476 if (pending_inc.new_pg_upmap_items.count(pgid) ||
9477 pending_inc.old_pg_upmap_items.count(pgid)) {
9478 dout(10) << __func__ << " waiting for pending update on "
9479 << pgid << dendl;
9480 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9481 return true;
9482 }
9483 break;
7c673cae 9484
224ce89b
WB
9485 default:
9486 assert(0 == "invalid option");
7c673cae 9487 }
224ce89b
WB
9488
9489 switch (option) {
9490 case OP_PG_UPMAP:
9491 {
9492 vector<int64_t> id_vec;
9493 if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
9494 ss << "unable to parse 'id' value(s) '"
9495 << cmd_vartype_stringify(cmdmap["id"]) << "'";
9496 err = -EINVAL;
9497 goto reply;
9498 }
9499
9500 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
9501 if ((int)id_vec.size() < pool_min_size) {
9502 ss << "num of osds (" << id_vec.size() <<") < pool min size ("
9503 << pool_min_size << ")";
9504 err = -EINVAL;
9505 goto reply;
9506 }
9507
9508 int pool_size = osdmap.get_pg_pool_size(pgid);
9509 if ((int)id_vec.size() > pool_size) {
9510 ss << "num of osds (" << id_vec.size() <<") > pool size ("
9511 << pool_size << ")";
9512 err = -EINVAL;
9513 goto reply;
9514 }
9515
9516 vector<int32_t> new_pg_upmap;
9517 for (auto osd : id_vec) {
9518 if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
9519 ss << "osd." << osd << " does not exist";
9520 err = -ENOENT;
9521 goto reply;
9522 }
9523 auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
9524 if (it != new_pg_upmap.end()) {
9525 ss << "osd." << osd << " already exists, ";
9526 continue;
9527 }
9528 new_pg_upmap.push_back(osd);
9529 }
9530
9531 if (new_pg_upmap.empty()) {
9532 ss << "no valid upmap items(pairs) is specified";
9533 err = -EINVAL;
9534 goto reply;
9535 }
9536
9537 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
9538 new_pg_upmap.begin(), new_pg_upmap.end());
9539 ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
7c673cae 9540 }
224ce89b
WB
9541 break;
9542
9543 case OP_RM_PG_UPMAP:
9544 {
9545 pending_inc.old_pg_upmap.insert(pgid);
9546 ss << "clear " << pgid << " pg_upmap mapping";
7c673cae 9547 }
224ce89b 9548 break;
7c673cae 9549
224ce89b
WB
9550 case OP_PG_UPMAP_ITEMS:
9551 {
9552 vector<int64_t> id_vec;
9553 if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
9554 ss << "unable to parse 'id' value(s) '"
9555 << cmd_vartype_stringify(cmdmap["id"]) << "'";
9556 err = -EINVAL;
9557 goto reply;
9558 }
9559
9560 if (id_vec.size() % 2) {
9561 ss << "you must specify pairs of osd ids to be remapped";
9562 err = -EINVAL;
9563 goto reply;
9564 }
9565
9566 int pool_size = osdmap.get_pg_pool_size(pgid);
9567 if ((int)(id_vec.size() / 2) > pool_size) {
9568 ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
9569 << pool_size << ")";
9570 err = -EINVAL;
9571 goto reply;
9572 }
9573
9574 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
9575 ostringstream items;
9576 items << "[";
9577 for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
9578 int from = *p++;
9579 int to = *p;
9580 if (from == to) {
9581 ss << "from osd." << from << " == to osd." << to << ", ";
9582 continue;
9583 }
9584 if (!osdmap.exists(from)) {
9585 ss << "osd." << from << " does not exist";
9586 err = -ENOENT;
9587 goto reply;
9588 }
9589 if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
9590 ss << "osd." << to << " does not exist";
9591 err = -ENOENT;
9592 goto reply;
9593 }
c07f9fc5
FG
9594 pair<int32_t,int32_t> entry = make_pair(from, to);
9595 auto it = std::find(new_pg_upmap_items.begin(),
9596 new_pg_upmap_items.end(), entry);
9597 if (it != new_pg_upmap_items.end()) {
9598 ss << "osd." << from << " -> osd." << to << " already exists, ";
9599 continue;
9600 }
9601 new_pg_upmap_items.push_back(entry);
224ce89b
WB
9602 items << from << "->" << to << ",";
9603 }
9604 string out(items.str());
9605 out.resize(out.size() - 1); // drop last ','
9606 out += "]";
9607
9608 if (new_pg_upmap_items.empty()) {
9609 ss << "no valid upmap items(pairs) is specified";
9610 err = -EINVAL;
9611 goto reply;
9612 }
9613
9614 pending_inc.new_pg_upmap_items[pgid] =
9615 mempool::osdmap::vector<pair<int32_t,int32_t>>(
9616 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
9617 ss << "set " << pgid << " pg_upmap_items mapping to " << out;
9618 }
9619 break;
9620
9621 case OP_RM_PG_UPMAP_ITEMS:
9622 {
9623 pending_inc.old_pg_upmap_items.insert(pgid);
9624 ss << "clear " << pgid << " pg_upmap_items mapping";
9625 }
9626 break;
9627
9628 default:
9629 assert(0 == "invalid option");
7c673cae
FG
9630 }
9631
7c673cae
FG
9632 goto update;
9633 } else if (prefix == "osd primary-affinity") {
9634 int64_t id;
9635 if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
9636 ss << "invalid osd id value '"
9637 << cmd_vartype_stringify(cmdmap["id"]) << "'";
9638 err = -EINVAL;
9639 goto reply;
9640 }
9641 double w;
9642 if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
9643 ss << "unable to parse 'weight' value '"
9644 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
9645 err = -EINVAL;
9646 goto reply;
9647 }
9648 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
9649 if (ww < 0L) {
9650 ss << "weight must be >= 0";
9651 err = -EINVAL;
9652 goto reply;
9653 }
31f18b77
FG
9654 if (osdmap.require_min_compat_client > 0 &&
9655 osdmap.require_min_compat_client < CEPH_RELEASE_FIREFLY) {
9656 ss << "require_min_compat_client "
9657 << ceph_release_name(osdmap.require_min_compat_client)
7c673cae
FG
9658 << " < firefly, which is required for primary-affinity";
9659 err = -EPERM;
9660 goto reply;
9661 } else if (!g_conf->mon_osd_allow_primary_affinity) {
9662 ss << "you must enable 'mon osd allow primary affinity = true' on the mons before you can adjust primary-affinity. note that older clients will no longer be able to communicate with the cluster.";
9663 err = -EPERM;
9664 goto reply;
9665 }
9666 err = check_cluster_features(CEPH_FEATURE_OSD_PRIMARY_AFFINITY, ss);
9667 if (err == -EAGAIN)
9668 goto wait;
9669 if (err < 0)
9670 goto reply;
9671 if (osdmap.exists(id)) {
9672 pending_inc.new_primary_affinity[id] = ww;
9673 ss << "set osd." << id << " primary-affinity to " << w << " (" << ios::hex << ww << ios::dec << ")";
9674 getline(ss, rs);
9675 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9676 get_last_committed() + 1));
9677 return true;
9678 } else {
9679 ss << "osd." << id << " does not exist";
9680 err = -ENOENT;
9681 goto reply;
9682 }
9683 } else if (prefix == "osd reweight") {
9684 int64_t id;
9685 if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
9686 ss << "unable to parse osd id value '"
9687 << cmd_vartype_stringify(cmdmap["id"]) << "'";
9688 err = -EINVAL;
9689 goto reply;
9690 }
9691 double w;
9692 if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
9693 ss << "unable to parse weight value '"
9694 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
9695 err = -EINVAL;
9696 goto reply;
9697 }
9698 long ww = (int)((double)CEPH_OSD_IN*w);
9699 if (ww < 0L) {
9700 ss << "weight must be >= 0";
9701 err = -EINVAL;
9702 goto reply;
9703 }
9704 if (osdmap.exists(id)) {
9705 pending_inc.new_weight[id] = ww;
9706 ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
9707 getline(ss, rs);
9708 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9709 get_last_committed() + 1));
9710 return true;
9711 } else {
9712 ss << "osd." << id << " does not exist";
9713 err = -ENOENT;
9714 goto reply;
9715 }
9716 } else if (prefix == "osd reweightn") {
9717 map<int32_t, uint32_t> weights;
9718 err = parse_reweights(g_ceph_context, cmdmap, osdmap, &weights);
9719 if (err) {
9720 ss << "unable to parse 'weights' value '"
9721 << cmd_vartype_stringify(cmdmap["weights"]) << "'";
9722 goto reply;
9723 }
9724 pending_inc.new_weight.insert(weights.begin(), weights.end());
9725 wait_for_finished_proposal(
9726 op,
9727 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
224ce89b 9728 return true;
7c673cae
FG
9729 } else if (prefix == "osd lost") {
9730 int64_t id;
9731 if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
9732 ss << "unable to parse osd id value '"
9733 << cmd_vartype_stringify(cmdmap["id"]) << "'";
9734 err = -EINVAL;
9735 goto reply;
9736 }
9737 string sure;
9738 if (!cmd_getval(g_ceph_context, cmdmap, "sure", sure) || sure != "--yes-i-really-mean-it") {
9739 ss << "are you SURE? this might mean real, permanent data loss. pass "
9740 "--yes-i-really-mean-it if you really do.";
9741 err = -EPERM;
9742 goto reply;
9743 } else if (!osdmap.exists(id)) {
9744 ss << "osd." << id << " does not exist";
9745 err = -ENOENT;
9746 goto reply;
9747 } else if (!osdmap.is_down(id)) {
9748 ss << "osd." << id << " is not down";
9749 err = -EBUSY;
9750 goto reply;
9751 } else {
9752 epoch_t e = osdmap.get_info(id).down_at;
9753 pending_inc.new_lost[id] = e;
9754 ss << "marked osd lost in epoch " << e;
9755 getline(ss, rs);
9756 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9757 get_last_committed() + 1));
9758 return true;
9759 }
9760
31f18b77
FG
9761 } else if (prefix == "osd destroy" || prefix == "osd purge") {
9762 /* Destroying an OSD means that we don't expect to further make use of
9763 * the OSDs data (which may even become unreadable after this operation),
9764 * and that we are okay with scrubbing all its cephx keys and config-key
9765 * data (which may include lockbox keys, thus rendering the osd's data
9766 * unreadable).
9767 *
9768 * The OSD will not be removed. Instead, we will mark it as destroyed,
9769 * such that a subsequent call to `create` will not reuse the osd id.
9770 * This will play into being able to recreate the OSD, at the same
9771 * crush location, with minimal data movement.
9772 */
9773
9774 // make sure authmon is writeable.
9775 if (!mon->authmon()->is_writeable()) {
9776 dout(10) << __func__ << " waiting for auth mon to be writeable for "
9777 << "osd destroy" << dendl;
9778 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
9779 return false;
9780 }
9781
9782 int64_t id;
9783 if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
9784 ss << "unable to parse osd id value '"
9785 << cmd_vartype_stringify(cmdmap["id"]) << "";
9786 err = -EINVAL;
9787 goto reply;
9788 }
9789
9790 bool is_destroy = (prefix == "osd destroy");
9791 if (!is_destroy) {
9792 assert("osd purge" == prefix);
9793 }
9794
9795 string sure;
9796 if (!cmd_getval(g_ceph_context, cmdmap, "sure", sure) ||
9797 sure != "--yes-i-really-mean-it") {
9798 ss << "Are you SURE? This will mean real, permanent data loss, as well "
9799 << "as cephx and lockbox keys. Pass --yes-i-really-mean-it if you "
9800 << "really do.";
9801 err = -EPERM;
9802 goto reply;
d2e6a577 9803 } else if (!osdmap.exists(id)) {
31f18b77 9804 ss << "osd." << id << " does not exist";
d2e6a577 9805 err = 0; // idempotent
31f18b77
FG
9806 goto reply;
9807 } else if (osdmap.is_up(id)) {
9808 ss << "osd." << id << " is not `down`.";
9809 err = -EBUSY;
9810 goto reply;
9811 } else if (is_destroy && osdmap.is_destroyed(id)) {
9812 ss << "destroyed osd." << id;
9813 err = 0;
9814 goto reply;
9815 }
9816
9817 bool goto_reply = false;
9818
9819 paxos->plug();
9820 if (is_destroy) {
9821 err = prepare_command_osd_destroy(id, ss);
9822 // we checked above that it should exist.
9823 assert(err != -ENOENT);
9824 } else {
9825 err = prepare_command_osd_purge(id, ss);
9826 if (err == -ENOENT) {
9827 err = 0;
9828 ss << "osd." << id << " does not exist.";
9829 goto_reply = true;
9830 }
9831 }
9832 paxos->unplug();
9833
9834 if (err < 0 || goto_reply) {
9835 goto reply;
9836 }
9837
9838 if (is_destroy) {
9839 ss << "destroyed osd." << id;
9840 } else {
9841 ss << "purged osd." << id;
9842 }
9843
9844 getline(ss, rs);
9845 wait_for_finished_proposal(op,
9846 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
9847 force_immediate_propose();
9848 return true;
9849
9850 } else if (prefix == "osd new") {
9851
9852 // make sure authmon is writeable.
9853 if (!mon->authmon()->is_writeable()) {
9854 dout(10) << __func__ << " waiting for auth mon to be writeable for "
224ce89b 9855 << "osd new" << dendl;
31f18b77
FG
9856 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
9857 return false;
9858 }
9859
9860 map<string,string> secrets_map;
9861
9862 bufferlist bl = m->get_data();
9863 string secrets_json = bl.to_str();
9864 dout(20) << __func__ << " osd new json = " << secrets_json << dendl;
9865
9866 err = get_json_str_map(secrets_json, ss, &secrets_map);
9867 if (err < 0)
9868 goto reply;
9869
9870 dout(20) << __func__ << " osd new secrets " << secrets_map << dendl;
9871
9872 paxos->plug();
9873 err = prepare_command_osd_new(op, cmdmap, secrets_map, ss, f.get());
9874 paxos->unplug();
9875
9876 if (err < 0) {
9877 goto reply;
9878 }
9879
9880 if (f) {
9881 f->flush(rdata);
9882 } else {
9883 rdata.append(ss);
9884 }
9885
9886 if (err == EEXIST) {
9887 // idempotent operation
9888 err = 0;
9889 goto reply;
9890 }
9891
9892 wait_for_finished_proposal(op,
9893 new Monitor::C_Command(mon, op, 0, rs, rdata,
9894 get_last_committed() + 1));
9895 force_immediate_propose();
9896 return true;
9897
7c673cae 9898 } else if (prefix == "osd create") {
7c673cae
FG
9899
9900 // optional id provided?
31f18b77
FG
9901 int64_t id = -1, cmd_id = -1;
9902 if (cmd_getval(g_ceph_context, cmdmap, "id", cmd_id)) {
9903 if (cmd_id < 0) {
9904 ss << "invalid osd id value '" << cmd_id << "'";
7c673cae
FG
9905 err = -EINVAL;
9906 goto reply;
9907 }
31f18b77 9908 dout(10) << " osd create got id " << cmd_id << dendl;
7c673cae
FG
9909 }
9910
7c673cae
FG
9911 uuid_d uuid;
9912 string uuidstr;
9913 if (cmd_getval(g_ceph_context, cmdmap, "uuid", uuidstr)) {
9914 if (!uuid.parse(uuidstr.c_str())) {
31f18b77
FG
9915 ss << "invalid uuid value '" << uuidstr << "'";
9916 err = -EINVAL;
9917 goto reply;
7c673cae 9918 }
31f18b77
FG
9919 // we only care about the id if we also have the uuid, to
9920 // ensure the operation's idempotency.
9921 id = cmd_id;
7c673cae
FG
9922 }
9923
31f18b77
FG
9924 int32_t new_id = -1;
9925 err = prepare_command_osd_create(id, uuid, &new_id, ss);
9926 if (err < 0) {
9927 if (err == -EAGAIN) {
9928 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9929 return true;
9930 }
9931 // a check has failed; reply to the user.
9932 goto reply;
9933
9934 } else if (err == EEXIST) {
9935 // this is an idempotent operation; we can go ahead and reply.
9936 if (f) {
9937 f->open_object_section("created_osd");
9938 f->dump_int("osdid", new_id);
9939 f->close_section();
9940 f->flush(rdata);
9941 } else {
9942 ss << new_id;
9943 rdata.append(ss);
7c673cae 9944 }
31f18b77
FG
9945 err = 0;
9946 goto reply;
7c673cae
FG
9947 }
9948
31f18b77
FG
9949 do_osd_create(id, uuid, &new_id);
9950
7c673cae
FG
9951 if (f) {
9952 f->open_object_section("created_osd");
31f18b77 9953 f->dump_int("osdid", new_id);
7c673cae
FG
9954 f->close_section();
9955 f->flush(rdata);
9956 } else {
31f18b77 9957 ss << new_id;
7c673cae
FG
9958 rdata.append(ss);
9959 }
31f18b77
FG
9960 wait_for_finished_proposal(op,
9961 new Monitor::C_Command(mon, op, 0, rs, rdata,
9962 get_last_committed() + 1));
7c673cae
FG
9963 return true;
9964
9965 } else if (prefix == "osd blacklist clear") {
9966 pending_inc.new_blacklist.clear();
9967 std::list<std::pair<entity_addr_t,utime_t > > blacklist;
9968 osdmap.get_blacklist(&blacklist);
9969 for (const auto &entry : blacklist) {
9970 pending_inc.old_blacklist.push_back(entry.first);
9971 }
9972 ss << " removed all blacklist entries";
9973 getline(ss, rs);
9974 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9975 get_last_committed() + 1));
9976 return true;
9977 } else if (prefix == "osd blacklist") {
9978 string addrstr;
9979 cmd_getval(g_ceph_context, cmdmap, "addr", addrstr);
9980 entity_addr_t addr;
9981 if (!addr.parse(addrstr.c_str(), 0)) {
9982 ss << "unable to parse address " << addrstr;
9983 err = -EINVAL;
9984 goto reply;
9985 }
9986 else {
9987 string blacklistop;
9988 cmd_getval(g_ceph_context, cmdmap, "blacklistop", blacklistop);
9989 if (blacklistop == "add") {
9990 utime_t expires = ceph_clock_now();
9991 double d;
9992 // default one hour
224ce89b
WB
9993 cmd_getval(g_ceph_context, cmdmap, "expire", d,
9994 g_conf->mon_osd_blacklist_default_expire);
7c673cae
FG
9995 expires += d;
9996
9997 pending_inc.new_blacklist[addr] = expires;
224ce89b
WB
9998
9999 {
10000 // cancel any pending un-blacklisting request too
10001 auto it = std::find(pending_inc.old_blacklist.begin(),
10002 pending_inc.old_blacklist.end(), addr);
10003 if (it != pending_inc.old_blacklist.end()) {
10004 pending_inc.old_blacklist.erase(it);
10005 }
10006 }
10007
7c673cae
FG
10008 ss << "blacklisting " << addr << " until " << expires << " (" << d << " sec)";
10009 getline(ss, rs);
10010 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10011 get_last_committed() + 1));
10012 return true;
10013 } else if (blacklistop == "rm") {
10014 if (osdmap.is_blacklisted(addr) ||
10015 pending_inc.new_blacklist.count(addr)) {
10016 if (osdmap.is_blacklisted(addr))
10017 pending_inc.old_blacklist.push_back(addr);
10018 else
10019 pending_inc.new_blacklist.erase(addr);
10020 ss << "un-blacklisting " << addr;
10021 getline(ss, rs);
10022 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10023 get_last_committed() + 1));
10024 return true;
10025 }
10026 ss << addr << " isn't blacklisted";
10027 err = 0;
10028 goto reply;
10029 }
10030 }
10031 } else if (prefix == "osd pool mksnap") {
10032 string poolstr;
10033 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10034 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
10035 if (pool < 0) {
10036 ss << "unrecognized pool '" << poolstr << "'";
10037 err = -ENOENT;
10038 goto reply;
10039 }
10040 string snapname;
10041 cmd_getval(g_ceph_context, cmdmap, "snap", snapname);
10042 const pg_pool_t *p = osdmap.get_pg_pool(pool);
10043 if (p->is_unmanaged_snaps_mode()) {
10044 ss << "pool " << poolstr << " is in unmanaged snaps mode";
10045 err = -EINVAL;
10046 goto reply;
10047 } else if (p->snap_exists(snapname.c_str())) {
10048 ss << "pool " << poolstr << " snap " << snapname << " already exists";
10049 err = 0;
10050 goto reply;
10051 } else if (p->is_tier()) {
10052 ss << "pool " << poolstr << " is a cache tier";
10053 err = -EINVAL;
10054 goto reply;
10055 }
10056 pg_pool_t *pp = 0;
10057 if (pending_inc.new_pools.count(pool))
10058 pp = &pending_inc.new_pools[pool];
10059 if (!pp) {
10060 pp = &pending_inc.new_pools[pool];
10061 *pp = *p;
10062 }
10063 if (pp->snap_exists(snapname.c_str())) {
10064 ss << "pool " << poolstr << " snap " << snapname << " already exists";
10065 } else {
10066 pp->add_snap(snapname.c_str(), ceph_clock_now());
10067 pp->set_snap_epoch(pending_inc.epoch);
10068 ss << "created pool " << poolstr << " snap " << snapname;
10069 }
10070 getline(ss, rs);
10071 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10072 get_last_committed() + 1));
10073 return true;
10074 } else if (prefix == "osd pool rmsnap") {
10075 string poolstr;
10076 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10077 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
10078 if (pool < 0) {
10079 ss << "unrecognized pool '" << poolstr << "'";
10080 err = -ENOENT;
10081 goto reply;
10082 }
10083 string snapname;
10084 cmd_getval(g_ceph_context, cmdmap, "snap", snapname);
10085 const pg_pool_t *p = osdmap.get_pg_pool(pool);
10086 if (p->is_unmanaged_snaps_mode()) {
10087 ss << "pool " << poolstr << " is in unmanaged snaps mode";
10088 err = -EINVAL;
10089 goto reply;
10090 } else if (!p->snap_exists(snapname.c_str())) {
10091 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
10092 err = 0;
10093 goto reply;
10094 }
10095 pg_pool_t *pp = 0;
10096 if (pending_inc.new_pools.count(pool))
10097 pp = &pending_inc.new_pools[pool];
10098 if (!pp) {
10099 pp = &pending_inc.new_pools[pool];
10100 *pp = *p;
10101 }
10102 snapid_t sn = pp->snap_exists(snapname.c_str());
10103 if (sn) {
10104 pp->remove_snap(sn);
10105 pp->set_snap_epoch(pending_inc.epoch);
10106 ss << "removed pool " << poolstr << " snap " << snapname;
10107 } else {
10108 ss << "already removed pool " << poolstr << " snap " << snapname;
10109 }
10110 getline(ss, rs);
10111 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10112 get_last_committed() + 1));
10113 return true;
10114 } else if (prefix == "osd pool create") {
10115 int64_t pg_num;
10116 int64_t pgp_num;
10117 cmd_getval(g_ceph_context, cmdmap, "pg_num", pg_num, int64_t(0));
10118 cmd_getval(g_ceph_context, cmdmap, "pgp_num", pgp_num, pg_num);
10119
10120 string pool_type_str;
10121 cmd_getval(g_ceph_context, cmdmap, "pool_type", pool_type_str);
10122 if (pool_type_str.empty())
224ce89b 10123 pool_type_str = g_conf->osd_pool_default_type;
7c673cae
FG
10124
10125 string poolstr;
10126 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10127 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10128 if (pool_id >= 0) {
10129 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10130 if (pool_type_str != p->get_type_name()) {
10131 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
10132 err = -EINVAL;
10133 } else {
10134 ss << "pool '" << poolstr << "' already exists";
10135 err = 0;
10136 }
10137 goto reply;
10138 }
10139
10140 int pool_type;
10141 if (pool_type_str == "replicated") {
10142 pool_type = pg_pool_t::TYPE_REPLICATED;
10143 } else if (pool_type_str == "erasure") {
10144 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2 |
10145 CEPH_FEATURE_OSD_ERASURE_CODES,
10146 ss);
10147 if (err == -EAGAIN)
10148 goto wait;
10149 if (err)
10150 goto reply;
10151 pool_type = pg_pool_t::TYPE_ERASURE;
10152 } else {
10153 ss << "unknown pool type '" << pool_type_str << "'";
10154 err = -EINVAL;
10155 goto reply;
10156 }
10157
31f18b77
FG
10158 bool implicit_rule_creation = false;
10159 string rule_name;
10160 cmd_getval(g_ceph_context, cmdmap, "rule", rule_name);
7c673cae
FG
10161 string erasure_code_profile;
10162 cmd_getval(g_ceph_context, cmdmap, "erasure_code_profile", erasure_code_profile);
10163
10164 if (pool_type == pg_pool_t::TYPE_ERASURE) {
10165 if (erasure_code_profile == "")
10166 erasure_code_profile = "default";
10167 //handle the erasure code profile
10168 if (erasure_code_profile == "default") {
10169 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
10170 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
10171 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
10172 goto wait;
10173 }
10174
10175 map<string,string> profile_map;
10176 err = osdmap.get_erasure_code_profile_default(g_ceph_context,
10177 profile_map,
10178 &ss);
10179 if (err)
10180 goto reply;
10181 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
10182 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
10183 goto wait;
10184 }
10185 }
31f18b77
FG
10186 if (rule_name == "") {
10187 implicit_rule_creation = true;
7c673cae 10188 if (erasure_code_profile == "default") {
31f18b77 10189 rule_name = "erasure-code";
7c673cae 10190 } else {
31f18b77 10191 dout(1) << "implicitly use rule named after the pool: "
7c673cae 10192 << poolstr << dendl;
31f18b77 10193 rule_name = poolstr;
7c673cae
FG
10194 }
10195 }
10196 } else {
31f18b77
FG
10197 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
10198 rule_name = erasure_code_profile;
7c673cae
FG
10199 }
10200
31f18b77
FG
10201 if (!implicit_rule_creation && rule_name != "") {
10202 int rule;
10203 err = get_crush_rule(rule_name, &rule, &ss);
7c673cae
FG
10204 if (err == -EAGAIN) {
10205 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10206 return true;
10207 }
10208 if (err)
10209 goto reply;
10210 }
10211
10212 int64_t expected_num_objects;
10213 cmd_getval(g_ceph_context, cmdmap, "expected_num_objects", expected_num_objects, int64_t(0));
10214 if (expected_num_objects < 0) {
10215 ss << "'expected_num_objects' must be non-negative";
10216 err = -EINVAL;
10217 goto reply;
10218 }
10219
10220 int64_t fast_read_param;
10221 cmd_getval(g_ceph_context, cmdmap, "fast_read", fast_read_param, int64_t(-1));
10222 FastReadType fast_read = FAST_READ_DEFAULT;
10223 if (fast_read_param == 0)
10224 fast_read = FAST_READ_OFF;
10225 else if (fast_read_param > 0)
10226 fast_read = FAST_READ_ON;
10227
10228 err = prepare_new_pool(poolstr, 0, // auid=0 for admin created pool
10229 -1, // default crush rule
31f18b77 10230 rule_name,
7c673cae
FG
10231 pg_num, pgp_num,
10232 erasure_code_profile, pool_type,
10233 (uint64_t)expected_num_objects,
10234 fast_read,
10235 &ss);
10236 if (err < 0) {
10237 switch(err) {
10238 case -EEXIST:
10239 ss << "pool '" << poolstr << "' already exists";
10240 break;
10241 case -EAGAIN:
10242 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10243 return true;
10244 case -ERANGE:
10245 goto reply;
10246 default:
10247 goto reply;
10248 break;
10249 }
10250 } else {
10251 ss << "pool '" << poolstr << "' created";
10252 }
10253 getline(ss, rs);
10254 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10255 get_last_committed() + 1));
10256 return true;
10257
10258 } else if (prefix == "osd pool delete" ||
10259 prefix == "osd pool rm") {
10260 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
10261 string poolstr, poolstr2, sure;
10262 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10263 cmd_getval(g_ceph_context, cmdmap, "pool2", poolstr2);
10264 cmd_getval(g_ceph_context, cmdmap, "sure", sure);
10265 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
10266 if (pool < 0) {
10267 ss << "pool '" << poolstr << "' does not exist";
10268 err = 0;
10269 goto reply;
10270 }
10271
10272 bool force_no_fake = sure == "--yes-i-really-really-mean-it-not-faking";
10273 if (poolstr2 != poolstr ||
10274 (sure != "--yes-i-really-really-mean-it" && !force_no_fake)) {
10275 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
10276 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
10277 << "followed by --yes-i-really-really-mean-it.";
10278 err = -EPERM;
10279 goto reply;
10280 }
10281 err = _prepare_remove_pool(pool, &ss, force_no_fake);
10282 if (err == -EAGAIN) {
10283 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10284 return true;
10285 }
10286 if (err < 0)
10287 goto reply;
10288 goto update;
10289 } else if (prefix == "osd pool rename") {
10290 string srcpoolstr, destpoolstr;
10291 cmd_getval(g_ceph_context, cmdmap, "srcpool", srcpoolstr);
10292 cmd_getval(g_ceph_context, cmdmap, "destpool", destpoolstr);
10293 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
10294 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
10295
10296 if (pool_src < 0) {
10297 if (pool_dst >= 0) {
10298 // src pool doesn't exist, dst pool does exist: to ensure idempotency
10299 // of operations, assume this rename succeeded, as it is not changing
10300 // the current state. Make sure we output something understandable
10301 // for whoever is issuing the command, if they are paying attention,
10302 // in case it was not intentional; or to avoid a "wtf?" and a bug
10303 // report in case it was intentional, while expecting a failure.
10304 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
10305 << destpoolstr << "' does -- assuming successful rename";
10306 err = 0;
10307 } else {
10308 ss << "unrecognized pool '" << srcpoolstr << "'";
10309 err = -ENOENT;
10310 }
10311 goto reply;
10312 } else if (pool_dst >= 0) {
10313 // source pool exists and so does the destination pool
10314 ss << "pool '" << destpoolstr << "' already exists";
10315 err = -EEXIST;
10316 goto reply;
10317 }
10318
10319 int ret = _prepare_rename_pool(pool_src, destpoolstr);
10320 if (ret == 0) {
10321 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
10322 } else {
10323 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
10324 << cpp_strerror(ret);
10325 }
10326 getline(ss, rs);
10327 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
10328 get_last_committed() + 1));
10329 return true;
10330
10331 } else if (prefix == "osd pool set") {
10332 err = prepare_command_pool_set(cmdmap, ss);
10333 if (err == -EAGAIN)
10334 goto wait;
10335 if (err < 0)
10336 goto reply;
10337
10338 getline(ss, rs);
10339 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10340 get_last_committed() + 1));
10341 return true;
10342 } else if (prefix == "osd tier add") {
10343 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
10344 if (err == -EAGAIN)
10345 goto wait;
10346 if (err)
10347 goto reply;
10348 string poolstr;
10349 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10350 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10351 if (pool_id < 0) {
10352 ss << "unrecognized pool '" << poolstr << "'";
10353 err = -ENOENT;
10354 goto reply;
10355 }
10356 string tierpoolstr;
10357 cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
10358 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
10359 if (tierpool_id < 0) {
10360 ss << "unrecognized pool '" << tierpoolstr << "'";
10361 err = -ENOENT;
10362 goto reply;
10363 }
10364 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10365 assert(p);
10366 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
10367 assert(tp);
10368
10369 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
10370 goto reply;
10371 }
10372
10373 // make sure new tier is empty
10374 string force_nonempty;
10375 cmd_getval(g_ceph_context, cmdmap, "force_nonempty", force_nonempty);
31f18b77
FG
10376 const pool_stat_t *pstats = mon->pgservice->get_pool_stat(tierpool_id);
10377 if (pstats && pstats->stats.sum.num_objects != 0 &&
7c673cae
FG
10378 force_nonempty != "--force-nonempty") {
10379 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
10380 err = -ENOTEMPTY;
10381 goto reply;
10382 }
10383 if (tp->ec_pool()) {
10384 ss << "tier pool '" << tierpoolstr
10385 << "' is an ec pool, which cannot be a tier";
10386 err = -ENOTSUP;
10387 goto reply;
10388 }
10389 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
10390 ((force_nonempty != "--force-nonempty") ||
10391 (!g_conf->mon_debug_unsafe_allow_tier_with_nonempty_snaps))) {
10392 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
10393 err = -ENOTEMPTY;
10394 goto reply;
10395 }
10396 // go
10397 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
10398 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
10399 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
10400 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10401 return true;
10402 }
10403 np->tiers.insert(tierpool_id);
10404 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
10405 ntp->tier_of = pool_id;
10406 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
10407 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
10408 get_last_committed() + 1));
10409 return true;
10410 } else if (prefix == "osd tier remove" ||
10411 prefix == "osd tier rm") {
10412 string poolstr;
10413 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10414 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10415 if (pool_id < 0) {
10416 ss << "unrecognized pool '" << poolstr << "'";
10417 err = -ENOENT;
10418 goto reply;
10419 }
10420 string tierpoolstr;
10421 cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
10422 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
10423 if (tierpool_id < 0) {
10424 ss << "unrecognized pool '" << tierpoolstr << "'";
10425 err = -ENOENT;
10426 goto reply;
10427 }
10428 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10429 assert(p);
10430 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
10431 assert(tp);
10432
10433 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
10434 goto reply;
10435 }
10436
10437 if (p->tiers.count(tierpool_id) == 0) {
10438 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
10439 err = 0;
10440 goto reply;
10441 }
10442 if (tp->tier_of != pool_id) {
10443 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
10444 << osdmap.get_pool_name(tp->tier_of) << "': "
10445 // be scary about it; this is an inconsistency and bells must go off
10446 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
10447 err = -EINVAL;
10448 goto reply;
10449 }
10450 if (p->read_tier == tierpool_id) {
10451 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
10452 err = -EBUSY;
10453 goto reply;
10454 }
10455 // go
10456 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
10457 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
10458 if (np->tiers.count(tierpool_id) == 0 ||
10459 ntp->tier_of != pool_id ||
10460 np->read_tier == tierpool_id) {
10461 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10462 return true;
10463 }
10464 np->tiers.erase(tierpool_id);
10465 ntp->clear_tier();
10466 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
10467 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
10468 get_last_committed() + 1));
10469 return true;
10470 } else if (prefix == "osd tier set-overlay") {
10471 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
10472 if (err == -EAGAIN)
10473 goto wait;
10474 if (err)
10475 goto reply;
10476 string poolstr;
10477 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10478 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10479 if (pool_id < 0) {
10480 ss << "unrecognized pool '" << poolstr << "'";
10481 err = -ENOENT;
10482 goto reply;
10483 }
10484 string overlaypoolstr;
10485 cmd_getval(g_ceph_context, cmdmap, "overlaypool", overlaypoolstr);
10486 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
10487 if (overlaypool_id < 0) {
10488 ss << "unrecognized pool '" << overlaypoolstr << "'";
10489 err = -ENOENT;
10490 goto reply;
10491 }
10492 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10493 assert(p);
10494 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
10495 assert(overlay_p);
10496 if (p->tiers.count(overlaypool_id) == 0) {
10497 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
10498 err = -EINVAL;
10499 goto reply;
10500 }
10501 if (p->read_tier == overlaypool_id) {
10502 err = 0;
10503 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
10504 goto reply;
10505 }
10506 if (p->has_read_tier()) {
10507 ss << "pool '" << poolstr << "' has overlay '"
10508 << osdmap.get_pool_name(p->read_tier)
10509 << "'; please remove-overlay first";
10510 err = -EINVAL;
10511 goto reply;
10512 }
10513
10514 // go
10515 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
10516 np->read_tier = overlaypool_id;
10517 np->write_tier = overlaypool_id;
10518 np->set_last_force_op_resend(pending_inc.epoch);
10519 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
10520 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
10521 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
10522 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
10523 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
10524 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
10525 get_last_committed() + 1));
10526 return true;
10527 } else if (prefix == "osd tier remove-overlay" ||
10528 prefix == "osd tier rm-overlay") {
10529 string poolstr;
10530 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10531 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10532 if (pool_id < 0) {
10533 ss << "unrecognized pool '" << poolstr << "'";
10534 err = -ENOENT;
10535 goto reply;
10536 }
10537 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10538 assert(p);
10539 if (!p->has_read_tier()) {
10540 err = 0;
10541 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
10542 goto reply;
10543 }
10544
10545 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
10546 goto reply;
10547 }
10548
10549 // go
10550 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
10551 if (np->has_read_tier()) {
10552 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
10553 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
10554 nop->set_last_force_op_resend(pending_inc.epoch);
10555 }
10556 if (np->has_write_tier()) {
10557 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
10558 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
10559 nop->set_last_force_op_resend(pending_inc.epoch);
10560 }
10561 np->clear_read_tier();
10562 np->clear_write_tier();
10563 np->set_last_force_op_resend(pending_inc.epoch);
10564 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
10565 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
10566 get_last_committed() + 1));
10567 return true;
10568 } else if (prefix == "osd tier cache-mode") {
10569 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
10570 if (err == -EAGAIN)
10571 goto wait;
10572 if (err)
10573 goto reply;
10574 string poolstr;
10575 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10576 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10577 if (pool_id < 0) {
10578 ss << "unrecognized pool '" << poolstr << "'";
10579 err = -ENOENT;
10580 goto reply;
10581 }
10582 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10583 assert(p);
10584 if (!p->is_tier()) {
10585 ss << "pool '" << poolstr << "' is not a tier";
10586 err = -EINVAL;
10587 goto reply;
10588 }
10589 string modestr;
10590 cmd_getval(g_ceph_context, cmdmap, "mode", modestr);
10591 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
10592 if (mode < 0) {
10593 ss << "'" << modestr << "' is not a valid cache mode";
10594 err = -EINVAL;
10595 goto reply;
10596 }
10597
10598 string sure;
10599 cmd_getval(g_ceph_context, cmdmap, "sure", sure);
10600 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
10601 mode != pg_pool_t::CACHEMODE_NONE &&
10602 mode != pg_pool_t::CACHEMODE_PROXY &&
10603 mode != pg_pool_t::CACHEMODE_READPROXY) &&
10604 sure != "--yes-i-really-mean-it") {
10605 ss << "'" << modestr << "' is not a well-supported cache mode and may "
10606 << "corrupt your data. pass --yes-i-really-mean-it to force.";
10607 err = -EPERM;
10608 goto reply;
10609 }
10610
10611 // pool already has this cache-mode set and there are no pending changes
10612 if (p->cache_mode == mode &&
10613 (pending_inc.new_pools.count(pool_id) == 0 ||
10614 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
10615 ss << "set cache-mode for pool '" << poolstr << "'"
10616 << " to " << pg_pool_t::get_cache_mode_name(mode);
10617 err = 0;
10618 goto reply;
10619 }
10620
10621 /* Mode description:
10622 *
10623 * none: No cache-mode defined
10624 * forward: Forward all reads and writes to base pool
10625 * writeback: Cache writes, promote reads from base pool
10626 * readonly: Forward writes to base pool
10627 * readforward: Writes are in writeback mode, Reads are in forward mode
10628 * proxy: Proxy all reads and writes to base pool
10629 * readproxy: Writes are in writeback mode, Reads are in proxy mode
10630 *
10631 * Hence, these are the allowed transitions:
10632 *
10633 * none -> any
10634 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
10635 * proxy -> forward || readforward || readproxy || writeback || any IF num_objects_dirty == 0
10636 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
10637 * readproxy -> forward || proxy || readforward || writeback || any IF num_objects_dirty == 0
10638 * writeback -> readforward || readproxy || forward || proxy
10639 * readonly -> any
10640 */
10641
10642 // We check if the transition is valid against the current pool mode, as
10643 // it is the only committed state thus far. We will blantly squash
10644 // whatever mode is on the pending state.
10645
10646 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
10647 (mode != pg_pool_t::CACHEMODE_FORWARD &&
10648 mode != pg_pool_t::CACHEMODE_PROXY &&
10649 mode != pg_pool_t::CACHEMODE_READFORWARD &&
10650 mode != pg_pool_t::CACHEMODE_READPROXY)) {
10651 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
10652 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
10653 << "' pool; only '"
10654 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_FORWARD)
10655 << "','"
10656 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY)
10657 << "','"
10658 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READFORWARD)
10659 << "','"
10660 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
10661 << "' allowed.";
10662 err = -EINVAL;
10663 goto reply;
10664 }
10665 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
10666 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
10667 mode != pg_pool_t::CACHEMODE_FORWARD &&
10668 mode != pg_pool_t::CACHEMODE_PROXY &&
10669 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
10670
10671 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
10672 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
10673 mode != pg_pool_t::CACHEMODE_FORWARD &&
10674 mode != pg_pool_t::CACHEMODE_READFORWARD &&
10675 mode != pg_pool_t::CACHEMODE_PROXY)) ||
10676
10677 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
10678 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
10679 mode != pg_pool_t::CACHEMODE_FORWARD &&
10680 mode != pg_pool_t::CACHEMODE_READFORWARD &&
10681 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
10682
10683 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
10684 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
10685 mode != pg_pool_t::CACHEMODE_READFORWARD &&
10686 mode != pg_pool_t::CACHEMODE_PROXY &&
10687 mode != pg_pool_t::CACHEMODE_READPROXY))) {
10688
31f18b77
FG
10689 const pool_stat_t* pstats =
10690 mon->pgservice->get_pool_stat(pool_id);
7c673cae 10691
31f18b77 10692 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
7c673cae
FG
10693 ss << "unable to set cache-mode '"
10694 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
10695 << "': dirty objects found";
10696 err = -EBUSY;
10697 goto reply;
10698 }
10699 }
10700 // go
10701 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
10702 np->cache_mode = mode;
10703 // set this both when moving to and from cache_mode NONE. this is to
10704 // capture legacy pools that were set up before this flag existed.
10705 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
10706 ss << "set cache-mode for pool '" << poolstr
10707 << "' to " << pg_pool_t::get_cache_mode_name(mode);
10708 if (mode == pg_pool_t::CACHEMODE_NONE) {
10709 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
10710 assert(base_pool);
10711 if (base_pool->read_tier == pool_id ||
10712 base_pool->write_tier == pool_id)
10713 ss <<" (WARNING: pool is still configured as read or write tier)";
10714 }
10715 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
10716 get_last_committed() + 1));
10717 return true;
10718 } else if (prefix == "osd tier add-cache") {
10719 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
10720 if (err == -EAGAIN)
10721 goto wait;
10722 if (err)
10723 goto reply;
10724 string poolstr;
10725 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10726 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10727 if (pool_id < 0) {
10728 ss << "unrecognized pool '" << poolstr << "'";
10729 err = -ENOENT;
10730 goto reply;
10731 }
10732 string tierpoolstr;
10733 cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
10734 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
10735 if (tierpool_id < 0) {
10736 ss << "unrecognized pool '" << tierpoolstr << "'";
10737 err = -ENOENT;
10738 goto reply;
10739 }
10740 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10741 assert(p);
10742 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
10743 assert(tp);
10744
10745 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
10746 goto reply;
10747 }
10748
10749 int64_t size = 0;
10750 if (!cmd_getval(g_ceph_context, cmdmap, "size", size)) {
10751 ss << "unable to parse 'size' value '"
10752 << cmd_vartype_stringify(cmdmap["size"]) << "'";
10753 err = -EINVAL;
10754 goto reply;
10755 }
10756 // make sure new tier is empty
31f18b77
FG
10757 const pool_stat_t *pstats =
10758 mon->pgservice->get_pool_stat(tierpool_id);
10759 if (pstats && pstats->stats.sum.num_objects != 0) {
7c673cae
FG
10760 ss << "tier pool '" << tierpoolstr << "' is not empty";
10761 err = -ENOTEMPTY;
10762 goto reply;
10763 }
10764 string modestr = g_conf->osd_tier_default_cache_mode;
10765 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
10766 if (mode < 0) {
10767 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
10768 err = -EINVAL;
10769 goto reply;
10770 }
10771 HitSet::Params hsp;
10772 if (g_conf->osd_tier_default_cache_hit_set_type == "bloom") {
10773 BloomHitSet::Params *bsp = new BloomHitSet::Params;
10774 bsp->set_fpp(g_conf->osd_pool_default_hit_set_bloom_fpp);
10775 hsp = HitSet::Params(bsp);
10776 } else if (g_conf->osd_tier_default_cache_hit_set_type == "explicit_hash") {
10777 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
10778 }
10779 else if (g_conf->osd_tier_default_cache_hit_set_type == "explicit_object") {
10780 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
10781 } else {
10782 ss << "osd tier cache default hit set type '" <<
10783 g_conf->osd_tier_default_cache_hit_set_type << "' is not a known type";
10784 err = -EINVAL;
10785 goto reply;
10786 }
10787 // go
10788 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
10789 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
10790 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
10791 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10792 return true;
10793 }
10794 np->tiers.insert(tierpool_id);
10795 np->read_tier = np->write_tier = tierpool_id;
10796 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
10797 np->set_last_force_op_resend(pending_inc.epoch);
10798 ntp->set_last_force_op_resend(pending_inc.epoch);
10799 ntp->tier_of = pool_id;
10800 ntp->cache_mode = mode;
10801 ntp->hit_set_count = g_conf->osd_tier_default_cache_hit_set_count;
10802 ntp->hit_set_period = g_conf->osd_tier_default_cache_hit_set_period;
10803 ntp->min_read_recency_for_promote = g_conf->osd_tier_default_cache_min_read_recency_for_promote;
10804 ntp->min_write_recency_for_promote = g_conf->osd_tier_default_cache_min_write_recency_for_promote;
10805 ntp->hit_set_grade_decay_rate = g_conf->osd_tier_default_cache_hit_set_grade_decay_rate;
10806 ntp->hit_set_search_last_n = g_conf->osd_tier_default_cache_hit_set_search_last_n;
10807 ntp->hit_set_params = hsp;
10808 ntp->target_max_bytes = size;
10809 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
10810 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
10811 get_last_committed() + 1));
10812 return true;
10813 } else if (prefix == "osd pool set-quota") {
10814 string poolstr;
10815 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10816 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10817 if (pool_id < 0) {
10818 ss << "unrecognized pool '" << poolstr << "'";
10819 err = -ENOENT;
10820 goto reply;
10821 }
10822
10823 string field;
10824 cmd_getval(g_ceph_context, cmdmap, "field", field);
10825 if (field != "max_objects" && field != "max_bytes") {
10826 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
10827 err = -EINVAL;
10828 goto reply;
10829 }
10830
10831 // val could contain unit designations, so we treat as a string
10832 string val;
10833 cmd_getval(g_ceph_context, cmdmap, "val", val);
10834 stringstream tss;
10835 int64_t value = unit_to_bytesize(val, &tss);
10836 if (value < 0) {
10837 ss << "error parsing value '" << value << "': " << tss.str();
10838 err = value;
10839 goto reply;
10840 }
10841
10842 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
10843 if (field == "max_objects") {
10844 pi->quota_max_objects = value;
10845 } else if (field == "max_bytes") {
10846 pi->quota_max_bytes = value;
10847 } else {
10848 assert(0 == "unrecognized option");
10849 }
10850 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
10851 rs = ss.str();
10852 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10853 get_last_committed() + 1));
10854 return true;
c07f9fc5
FG
10855 } else if (prefix == "osd pool application enable" ||
10856 prefix == "osd pool application disable" ||
10857 prefix == "osd pool application set" ||
10858 prefix == "osd pool application rm") {
10859 err = prepare_command_pool_application(prefix, cmdmap, ss);
10860 if (err == -EAGAIN)
10861 goto wait;
10862 if (err < 0)
10863 goto reply;
7c673cae 10864
c07f9fc5
FG
10865 getline(ss, rs);
10866 wait_for_finished_proposal(
10867 op, new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
10868 return true;
7c673cae
FG
10869 } else if (prefix == "osd reweight-by-pg" ||
10870 prefix == "osd reweight-by-utilization" ||
10871 prefix == "osd test-reweight-by-pg" ||
10872 prefix == "osd test-reweight-by-utilization") {
10873 bool by_pg =
10874 prefix == "osd reweight-by-pg" || prefix == "osd test-reweight-by-pg";
10875 bool dry_run =
10876 prefix == "osd test-reweight-by-pg" ||
10877 prefix == "osd test-reweight-by-utilization";
10878 int64_t oload;
10879 cmd_getval(g_ceph_context, cmdmap, "oload", oload, int64_t(120));
10880 set<int64_t> pools;
10881 vector<string> poolnamevec;
10882 cmd_getval(g_ceph_context, cmdmap, "pools", poolnamevec);
10883 for (unsigned j = 0; j < poolnamevec.size(); j++) {
10884 int64_t pool = osdmap.lookup_pg_pool_name(poolnamevec[j]);
10885 if (pool < 0) {
10886 ss << "pool '" << poolnamevec[j] << "' does not exist";
10887 err = -ENOENT;
10888 goto reply;
10889 }
10890 pools.insert(pool);
10891 }
10892 double max_change = g_conf->mon_reweight_max_change;
10893 cmd_getval(g_ceph_context, cmdmap, "max_change", max_change);
10894 if (max_change <= 0.0) {
10895 ss << "max_change " << max_change << " must be positive";
10896 err = -EINVAL;
10897 goto reply;
10898 }
10899 int64_t max_osds = g_conf->mon_reweight_max_osds;
10900 cmd_getval(g_ceph_context, cmdmap, "max_osds", max_osds);
10901 if (max_osds <= 0) {
10902 ss << "max_osds " << max_osds << " must be positive";
10903 err = -EINVAL;
10904 goto reply;
10905 }
10906 string no_increasing;
10907 cmd_getval(g_ceph_context, cmdmap, "no_increasing", no_increasing);
10908 string out_str;
10909 mempool::osdmap::map<int32_t, uint32_t> new_weights;
31f18b77
FG
10910 err = mon->pgservice->reweight_by_utilization(osdmap,
10911 oload,
10912 max_change,
10913 max_osds,
10914 by_pg,
10915 pools.empty() ? NULL : &pools,
10916 no_increasing == "--no-increasing",
10917 &new_weights,
10918 &ss, &out_str, f.get());
7c673cae
FG
10919 if (err >= 0) {
10920 dout(10) << "reweight::by_utilization: finished with " << out_str << dendl;
10921 }
10922 if (f)
10923 f->flush(rdata);
10924 else
10925 rdata.append(out_str);
10926 if (err < 0) {
10927 ss << "FAILED reweight-by-pg";
10928 } else if (err == 0 || dry_run) {
10929 ss << "no change";
10930 } else {
10931 ss << "SUCCESSFUL reweight-by-pg";
10932 pending_inc.new_weight = std::move(new_weights);
10933 wait_for_finished_proposal(
10934 op,
10935 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
10936 return true;
10937 }
c07f9fc5
FG
10938 } else if (prefix == "osd force-create-pg") {
10939 pg_t pgid;
10940 string pgidstr;
10941 cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr);
10942 if (!pgid.parse(pgidstr.c_str())) {
10943 ss << "invalid pgid '" << pgidstr << "'";
10944 err = -EINVAL;
10945 goto reply;
10946 }
10947 bool creating_now;
10948 {
10949 std::lock_guard<std::mutex> l(creating_pgs_lock);
10950 auto emplaced = creating_pgs.pgs.emplace(pgid,
10951 make_pair(osdmap.get_epoch(),
10952 ceph_clock_now()));
10953 creating_now = emplaced.second;
10954 }
10955 if (creating_now) {
10956 ss << "pg " << pgidstr << " now creating, ok";
10957 err = 0;
10958 goto update;
10959 } else {
10960 ss << "pg " << pgid << " already creating";
10961 err = 0;
10962 goto reply;
10963 }
7c673cae
FG
10964 } else {
10965 err = -EINVAL;
10966 }
10967
10968 reply:
10969 getline(ss, rs);
10970 if (err < 0 && rs.length() == 0)
10971 rs = cpp_strerror(err);
10972 mon->reply_command(op, err, rs, rdata, get_last_committed());
10973 return ret;
10974
10975 update:
10976 getline(ss, rs);
10977 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10978 get_last_committed() + 1));
10979 return true;
10980
10981 wait:
10982 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10983 return true;
10984}
10985
10986bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
10987{
10988 op->mark_osdmon_event(__func__);
10989 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
10990
10991 if (m->fsid != mon->monmap->fsid) {
10992 dout(0) << __func__ << " drop message on fsid " << m->fsid
10993 << " != " << mon->monmap->fsid << " for " << *m << dendl;
10994 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
10995 return true;
10996 }
10997
10998 if (m->op == POOL_OP_CREATE)
10999 return preprocess_pool_op_create(op);
11000
11001 if (!osdmap.get_pg_pool(m->pool)) {
11002 dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
11003 _pool_op_reply(op, 0, osdmap.get_epoch());
11004 return true;
11005 }
11006
11007 // check if the snap and snapname exist
11008 bool snap_exists = false;
11009 const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
11010 if (p->snap_exists(m->name.c_str()))
11011 snap_exists = true;
11012
11013 switch (m->op) {
11014 case POOL_OP_CREATE_SNAP:
11015 if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
11016 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
11017 return true;
11018 }
11019 if (snap_exists) {
11020 _pool_op_reply(op, 0, osdmap.get_epoch());
11021 return true;
11022 }
11023 return false;
11024 case POOL_OP_CREATE_UNMANAGED_SNAP:
11025 if (p->is_pool_snaps_mode()) {
11026 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
11027 return true;
11028 }
11029 return false;
11030 case POOL_OP_DELETE_SNAP:
11031 if (p->is_unmanaged_snaps_mode()) {
11032 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
11033 return true;
11034 }
11035 if (!snap_exists) {
11036 _pool_op_reply(op, 0, osdmap.get_epoch());
11037 return true;
11038 }
11039 return false;
11040 case POOL_OP_DELETE_UNMANAGED_SNAP:
11041 if (p->is_pool_snaps_mode()) {
11042 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
11043 return true;
11044 }
11045 if (p->is_removed_snap(m->snapid)) {
11046 _pool_op_reply(op, 0, osdmap.get_epoch());
11047 return true;
11048 }
11049 return false;
11050 case POOL_OP_DELETE:
11051 if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
11052 _pool_op_reply(op, 0, osdmap.get_epoch());
11053 return true;
11054 }
11055 return false;
11056 case POOL_OP_AUID_CHANGE:
11057 return false;
11058 default:
11059 ceph_abort();
11060 break;
11061 }
11062
11063 return false;
11064}
11065
11066bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
11067{
11068 op->mark_osdmon_event(__func__);
11069 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
11070 MonSession *session = m->get_session();
11071 if (!session) {
11072 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
11073 return true;
11074 }
11075 if (!session->is_capable("osd", MON_CAP_W)) {
11076 dout(5) << "attempt to create new pool without sufficient auid privileges!"
11077 << "message: " << *m << std::endl
11078 << "caps: " << session->caps << dendl;
11079 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
11080 return true;
11081 }
11082
11083 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
11084 if (pool >= 0) {
11085 _pool_op_reply(op, 0, osdmap.get_epoch());
11086 return true;
11087 }
11088
11089 return false;
11090}
11091
// Apply a pool op (snap create/delete, unmanaged snap create/delete,
// auid change) to the pending map.  Creates and deletes are delegated;
// everything else is first validated against the *committed* pool, then
// applied to a *projected* copy that folds in any already-pending
// changes.  Returns true when a proposal was queued, false when the op
// was answered immediately.
// NOTE(review): the case fallthroughs below are deliberate; do not
// reorder cases.
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
  dout(10) << "prepare_pool_op " << *m << dendl;
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  // First pass: reject or short-circuit ops against the committed pool
  // state (idempotent snap ops, wrong snap mode, snaps on tiers).
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (pool->is_tier()) {
      ret = -EINVAL;
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;
    }  // else, fall through
  case POOL_OP_DELETE_SNAP:
    if (!pool->is_unmanaged_snaps_mode()) {
      bool snap_exists = pool->snap_exists(m->name.c_str());
      // create-existing and delete-missing are idempotent successes;
      // otherwise break out and actually apply the change below.
      if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
	|| (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
	ret = 0;
      } else {
	break;
      }
    } else {
      ret = -EINVAL;
    }
    _pool_op_reply(op, ret, osdmap.get_epoch());
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // we won't allow removal of an unmanaged snapshot from a pool
    // not in unmanaged snaps mode.
    if (!pool->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
      return false;
    }
    /* fall-thru */
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // but we will allow creating an unmanaged snapshot on any pool
    // as long as it is not in 'pool' snaps mode.
    if (pool->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return false;
    }
  }

  // projected pool info
  // (committed state plus any change already staged in pending_inc)
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // pool snaps vs unmanaged snaps are mutually exclusive
  // (re-checked against the projected state, which may differ from the
  // committed state validated above)
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // Second pass: mutate the projected pool.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
	pp.remove_snap(s);
	changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      // snapid is an out-param filled by add_unmanaged_snap() and
      // returned to the client in the reply payload
      uint64_t snapid;
      pp.add_unmanaged_snap(snapid);
      ::encode(snapid, reply_data);
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (!pp.is_removed_snap(m->snapid)) {
      pp.remove_unmanaged_snap(m->snapid);
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    if (pp.auid != m->auid) {
      pp.auid = m->auid;
      changed = true;
    }
    break;

  default:
    ceph_abort();
    break;
  }

  if (changed) {
    // bump the snap epoch and stage the projected pool for commit
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
11235
11236bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
11237{
11238 op->mark_osdmon_event(__func__);
11239 int err = prepare_new_pool(op);
11240 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
11241 return true;
11242}
11243
11244int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
11245 ostream *ss)
11246{
11247 const string& poolstr = osdmap.get_pool_name(pool_id);
11248
11249 // If the Pool is in use by CephFS, refuse to delete it
11250 FSMap const &pending_fsmap = mon->mdsmon()->get_pending();
11251 if (pending_fsmap.pool_in_use(pool_id)) {
11252 *ss << "pool '" << poolstr << "' is in use by CephFS";
11253 return -EBUSY;
11254 }
11255
11256 if (pool.tier_of >= 0) {
11257 *ss << "pool '" << poolstr << "' is a tier of '"
11258 << osdmap.get_pool_name(pool.tier_of) << "'";
11259 return -EBUSY;
11260 }
11261 if (!pool.tiers.empty()) {
11262 *ss << "pool '" << poolstr << "' has tiers";
11263 for(auto tier : pool.tiers) {
11264 *ss << " " << osdmap.get_pool_name(tier);
11265 }
11266 return -EBUSY;
11267 }
11268
11269 if (!g_conf->mon_allow_pool_delete) {
11270 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
11271 return -EPERM;
11272 }
11273
11274 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
11275 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
11276 return -EPERM;
11277 }
11278
11279 *ss << "pool '" << poolstr << "' removed";
11280 return 0;
11281}
11282
11283/**
11284 * Check if it is safe to add a tier to a base pool
11285 *
11286 * @return
11287 * True if the operation should proceed, false if we should abort here
11288 * (abort doesn't necessarily mean error, could be idempotency)
11289 */
11290bool OSDMonitor::_check_become_tier(
11291 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
11292 const int64_t base_pool_id, const pg_pool_t *base_pool,
11293 int *err,
11294 ostream *ss) const
11295{
11296 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
11297 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
11298
11299 const FSMap &pending_fsmap = mon->mdsmon()->get_pending();
11300 if (pending_fsmap.pool_in_use(tier_pool_id)) {
11301 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
11302 *err = -EBUSY;
11303 return false;
11304 }
11305
11306 if (base_pool->tiers.count(tier_pool_id)) {
11307 assert(tier_pool->tier_of == base_pool_id);
11308 *err = 0;
11309 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
11310 << base_pool_name << "'";
11311 return false;
11312 }
11313
11314 if (base_pool->is_tier()) {
11315 *ss << "pool '" << base_pool_name << "' is already a tier of '"
11316 << osdmap.get_pool_name(base_pool->tier_of) << "', "
11317 << "multiple tiers are not yet supported.";
11318 *err = -EINVAL;
11319 return false;
11320 }
11321
11322 if (tier_pool->has_tiers()) {
11323 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
11324 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
11325 it != tier_pool->tiers.end(); ++it)
11326 *ss << "'" << osdmap.get_pool_name(*it) << "',";
11327 *ss << " multiple tiers are not yet supported.";
11328 *err = -EINVAL;
11329 return false;
11330 }
11331
11332 if (tier_pool->is_tier()) {
11333 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
11334 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
11335 *err = -EINVAL;
11336 return false;
11337 }
11338
11339 *err = 0;
11340 return true;
11341}
11342
11343
11344/**
11345 * Check if it is safe to remove a tier from this base pool
11346 *
11347 * @return
11348 * True if the operation should proceed, false if we should abort here
11349 * (abort doesn't necessarily mean error, could be idempotency)
11350 */
11351bool OSDMonitor::_check_remove_tier(
11352 const int64_t base_pool_id, const pg_pool_t *base_pool,
11353 const pg_pool_t *tier_pool,
11354 int *err, ostream *ss) const
11355{
11356 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
11357
11358 // Apply CephFS-specific checks
11359 const FSMap &pending_fsmap = mon->mdsmon()->get_pending();
11360 if (pending_fsmap.pool_in_use(base_pool_id)) {
11361 if (base_pool->type != pg_pool_t::TYPE_REPLICATED) {
11362 // If the underlying pool is erasure coded, we can't permit the
11363 // removal of the replicated tier that CephFS relies on to access it
11364 *ss << "pool '" << base_pool_name << "' is in use by CephFS via its tier";
11365 *err = -EBUSY;
11366 return false;
11367 }
11368
11369 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
11370 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
11371 "tier is still in use as a writeback cache. Change the cache "
11372 "mode and flush the cache before removing it";
11373 *err = -EBUSY;
11374 return false;
11375 }
11376 }
11377
11378 *err = 0;
11379 return true;
11380}
11381
11382int OSDMonitor::_prepare_remove_pool(
11383 int64_t pool, ostream *ss, bool no_fake)
11384{
224ce89b 11385 dout(10) << __func__ << " " << pool << dendl;
7c673cae
FG
11386 const pg_pool_t *p = osdmap.get_pg_pool(pool);
11387 int r = _check_remove_pool(pool, *p, ss);
11388 if (r < 0)
11389 return r;
11390
11391 auto new_pool = pending_inc.new_pools.find(pool);
11392 if (new_pool != pending_inc.new_pools.end()) {
11393 // if there is a problem with the pending info, wait and retry
11394 // this op.
11395 const auto& p = new_pool->second;
11396 int r = _check_remove_pool(pool, p, ss);
11397 if (r < 0)
11398 return -EAGAIN;
11399 }
11400
11401 if (pending_inc.old_pools.count(pool)) {
224ce89b 11402 dout(10) << __func__ << " " << pool << " already pending removal"
7c673cae
FG
11403 << dendl;
11404 return 0;
11405 }
11406
11407 if (g_conf->mon_fake_pool_delete && !no_fake) {
11408 string old_name = osdmap.get_pool_name(pool);
11409 string new_name = old_name + "." + stringify(pool) + ".DELETED";
11410 dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
11411 << old_name << " -> " << new_name << dendl;
11412 pending_inc.new_pool_names[pool] = new_name;
11413 return 0;
11414 }
11415
11416 // remove
11417 pending_inc.old_pools.insert(pool);
11418
224ce89b 11419 // remove any pg_temp mappings for this pool
7c673cae
FG
11420 for (auto p = osdmap.pg_temp->begin();
11421 p != osdmap.pg_temp->end();
11422 ++p) {
11423 if (p->first.pool() == (uint64_t)pool) {
224ce89b 11424 dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
7c673cae
FG
11425 << p->first << dendl;
11426 pending_inc.new_pg_temp[p->first].clear();
11427 }
11428 }
224ce89b 11429 // remove any primary_temp mappings for this pool
7c673cae
FG
11430 for (auto p = osdmap.primary_temp->begin();
11431 p != osdmap.primary_temp->end();
11432 ++p) {
11433 if (p->first.pool() == (uint64_t)pool) {
224ce89b 11434 dout(10) << __func__ << " " << pool
7c673cae
FG
11435 << " removing obsolete primary_temp" << p->first << dendl;
11436 pending_inc.new_primary_temp[p->first] = -1;
11437 }
11438 }
224ce89b
WB
11439 // remove any pg_upmap mappings for this pool
11440 for (auto& p : osdmap.pg_upmap) {
11441 if (p.first.pool() == (uint64_t)pool) {
11442 dout(10) << __func__ << " " << pool
11443 << " removing obsolete pg_upmap "
11444 << p.first << dendl;
11445 pending_inc.old_pg_upmap.insert(p.first);
11446 }
11447 }
11448 // remove any pg_upmap_items mappings for this pool
11449 for (auto& p : osdmap.pg_upmap_items) {
11450 if (p.first.pool() == (uint64_t)pool) {
11451 dout(10) << __func__ << " " << pool
11452 << " removing obsolete pg_upmap_items " << p.first
11453 << dendl;
11454 pending_inc.old_pg_upmap_items.insert(p.first);
11455 }
11456 }
35e4c445
FG
11457
11458 // remove any choose_args for this pool
11459 CrushWrapper newcrush;
11460 _get_pending_crush(newcrush);
11461 if (newcrush.have_choose_args(pool)) {
11462 dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
11463 newcrush.rm_choose_args(pool);
11464 pending_inc.crush.clear();
11465 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
11466 }
7c673cae
FG
11467 return 0;
11468}
11469
11470int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
11471{
11472 dout(10) << "_prepare_rename_pool " << pool << dendl;
11473 if (pending_inc.old_pools.count(pool)) {
11474 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
11475 return -ENOENT;
11476 }
11477 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
11478 p != pending_inc.new_pool_names.end();
11479 ++p) {
11480 if (p->second == newname && p->first != pool) {
11481 return -EEXIST;
11482 }
11483 }
11484
11485 pending_inc.new_pool_names[pool] = newname;
11486 return 0;
11487}
11488
11489bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
11490{
11491 op->mark_osdmon_event(__func__);
11492 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
11493 ostringstream ss;
11494 int ret = _prepare_remove_pool(m->pool, &ss, false);
11495 if (ret == -EAGAIN) {
11496 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11497 return true;
11498 }
11499 if (ret < 0)
11500 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
11501 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
11502 pending_inc.epoch));
11503 return true;
11504}
11505
11506void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
11507 int ret, epoch_t epoch, bufferlist *blp)
11508{
11509 op->mark_osdmon_event(__func__);
11510 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
11511 dout(20) << "_pool_op_reply " << ret << dendl;
11512 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
11513 ret, epoch, get_last_committed(), blp);
11514 mon->send_reply(op, reply);
11515}