]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/OSDMonitor.cc
bump version to 12.2.5-pve1
[ceph.git] / ceph / src / mon / OSDMonitor.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19#include <algorithm>
224ce89b
WB
20#include <boost/algorithm/string.hpp>
21#include <locale>
7c673cae
FG
22#include <sstream>
23
31f18b77
FG
24#include "mon/OSDMonitor.h"
25#include "mon/Monitor.h"
26#include "mon/MDSMonitor.h"
27#include "mon/PGMonitor.h"
28#include "mon/MgrStatMonitor.h"
29#include "mon/AuthMonitor.h"
30#include "mon/ConfigKeyService.h"
7c673cae 31
31f18b77
FG
32#include "mon/MonitorDBStore.h"
33#include "mon/Session.h"
7c673cae
FG
34
35#include "crush/CrushWrapper.h"
36#include "crush/CrushTester.h"
37#include "crush/CrushTreeDumper.h"
38
39#include "messages/MOSDBeacon.h"
40#include "messages/MOSDFailure.h"
41#include "messages/MOSDMarkMeDown.h"
42#include "messages/MOSDFull.h"
43#include "messages/MOSDMap.h"
44#include "messages/MMonGetOSDMap.h"
45#include "messages/MOSDBoot.h"
46#include "messages/MOSDAlive.h"
47#include "messages/MPoolOp.h"
48#include "messages/MPoolOpReply.h"
49#include "messages/MOSDPGCreate.h"
50#include "messages/MOSDPGCreated.h"
51#include "messages/MOSDPGTemp.h"
52#include "messages/MMonCommand.h"
53#include "messages/MRemoveSnaps.h"
54#include "messages/MOSDScrub.h"
55#include "messages/MRoute.h"
56
57#include "common/TextTable.h"
58#include "common/Timer.h"
59#include "common/ceph_argparse.h"
60#include "common/perf_counters.h"
61#include "common/strtol.h"
62
63#include "common/config.h"
64#include "common/errno.h"
65
66#include "erasure-code/ErasureCodePlugin.h"
67#include "compressor/Compressor.h"
68#include "common/Checksummer.h"
69
70#include "include/compat.h"
71#include "include/assert.h"
72#include "include/stringify.h"
73#include "include/util.h"
74#include "common/cmdparse.h"
75#include "include/str_list.h"
76#include "include/str_map.h"
224ce89b 77#include "include/scope_guard.h"
7c673cae
FG
78
79#include "json_spirit/json_spirit_reader.h"
80
c07f9fc5
FG
81#include <boost/algorithm/string/predicate.hpp>
82
7c673cae 83#define dout_subsys ceph_subsys_mon
3efd9988
FG
84static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
85static const string OSD_METADATA_PREFIX("osd_metadata");
7c673cae 86
c07f9fc5
FG
87namespace {
88
89const uint32_t MAX_POOL_APPLICATIONS = 4;
90const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
91const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
92
93} // anonymous namespace
94
7c673cae
FG
95void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean)
96{
97 if (epoch_by_pg.size() <= ps) {
98 epoch_by_pg.resize(ps + 1, 0);
99 }
100 const auto old_lec = epoch_by_pg[ps];
101 if (old_lec >= last_epoch_clean) {
102 // stale lec
103 return;
104 }
105 epoch_by_pg[ps] = last_epoch_clean;
106 if (last_epoch_clean < floor) {
107 floor = last_epoch_clean;
108 } else if (last_epoch_clean > floor) {
109 if (old_lec == floor) {
110 // probably should increase floor?
111 auto new_floor = std::min_element(std::begin(epoch_by_pg),
112 std::end(epoch_by_pg));
113 floor = *new_floor;
114 }
115 }
116 if (ps != next_missing) {
117 return;
118 }
119 for (; next_missing < epoch_by_pg.size(); next_missing++) {
120 if (epoch_by_pg[next_missing] == 0) {
121 break;
122 }
123 }
124}
125
126void LastEpochClean::remove_pool(uint64_t pool)
127{
128 report_by_pool.erase(pool);
129}
130
131void LastEpochClean::report(const pg_t& pg, epoch_t last_epoch_clean)
132{
133 auto& lec = report_by_pool[pg.pool()];
134 return lec.report(pg.ps(), last_epoch_clean);
135}
136
137epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
138{
139 auto floor = latest.get_epoch();
140 for (auto& pool : latest.get_pools()) {
141 auto reported = report_by_pool.find(pool.first);
142 if (reported == report_by_pool.end()) {
143 return 0;
144 }
145 if (reported->second.next_missing < pool.second.get_pg_num()) {
146 return 0;
147 }
148 if (reported->second.floor < floor) {
149 floor = reported->second.floor;
150 }
151 }
152 return floor;
153}
154
155
156struct C_UpdateCreatingPGs : public Context {
157 OSDMonitor *osdmon;
158 utime_t start;
159 epoch_t epoch;
160 C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
161 osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
162 void finish(int r) override {
163 if (r >= 0) {
164 utime_t end = ceph_clock_now();
165 dout(10) << "osdmap epoch " << epoch << " mapping took "
166 << (end - start) << " seconds" << dendl;
167 osdmon->update_creating_pgs();
168 osdmon->check_pg_creates_subs();
169 }
170 }
171};
172
173#undef dout_prefix
174#define dout_prefix _prefix(_dout, mon, osdmap)
175static ostream& _prefix(std::ostream *_dout, Monitor *mon, const OSDMap& osdmap) {
176 return *_dout << "mon." << mon->name << "@" << mon->rank
177 << "(" << mon->get_state_name()
178 << ").osd e" << osdmap.get_epoch() << " ";
179}
180
181OSDMonitor::OSDMonitor(
182 CephContext *cct,
183 Monitor *mn,
184 Paxos *p,
185 const string& service_name)
186 : PaxosService(mn, p, service_name),
187 cct(cct),
188 inc_osd_cache(g_conf->mon_osd_cache_size),
189 full_osd_cache(g_conf->mon_osd_cache_size),
190 last_attempted_minwait_time(utime_t()),
191 mapper(mn->cct, &mn->cpu_tp),
192 op_tracker(cct, true, 1)
193{}
194
195bool OSDMonitor::_have_pending_crush()
196{
197 return pending_inc.crush.length() > 0;
198}
199
200CrushWrapper &OSDMonitor::_get_stable_crush()
201{
202 return *osdmap.crush;
203}
204
205void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush)
206{
207 bufferlist bl;
208 if (pending_inc.crush.length())
209 bl = pending_inc.crush;
210 else
211 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
212
213 bufferlist::iterator p = bl.begin();
214 newcrush.decode(p);
215}
216
217void OSDMonitor::create_initial()
218{
219 dout(10) << "create_initial for " << mon->monmap->fsid << dendl;
220
221 OSDMap newmap;
222
223 bufferlist bl;
224 mon->store->get("mkfs", "osdmap", bl);
225
226 if (bl.length()) {
227 newmap.decode(bl);
228 newmap.set_fsid(mon->monmap->fsid);
229 } else {
224ce89b 230 newmap.build_simple(g_ceph_context, 0, mon->monmap->fsid, 0);
7c673cae
FG
231 }
232 newmap.set_epoch(1);
233 newmap.created = newmap.modified = ceph_clock_now();
234
235 // new clusters should sort bitwise by default.
236 newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);
237
238 // new cluster should require latest by default
31f18b77
FG
239 if (g_conf->mon_debug_no_require_luminous) {
240 newmap.require_osd_release = CEPH_RELEASE_KRAKEN;
241 derr << __func__ << " mon_debug_no_require_luminous=true" << dendl;
242 } else {
243 newmap.require_osd_release = CEPH_RELEASE_LUMINOUS;
181888fb
FG
244 newmap.flags |=
245 CEPH_OSDMAP_RECOVERY_DELETES |
246 CEPH_OSDMAP_PURGED_SNAPDIRS;
7c673cae
FG
247 newmap.full_ratio = g_conf->mon_osd_full_ratio;
248 if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
249 newmap.backfillfull_ratio = g_conf->mon_osd_backfillfull_ratio;
250 if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
251 newmap.nearfull_ratio = g_conf->mon_osd_nearfull_ratio;
252 if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;
31f18b77
FG
253 int r = ceph_release_from_name(
254 g_conf->mon_osd_initial_require_min_compat_client.c_str());
255 if (r <= 0) {
256 assert(0 == "mon_osd_initial_require_min_compat_client is not valid");
257 }
258 newmap.require_min_compat_client = r;
7c673cae
FG
259 }
260
261 // encode into pending incremental
262 newmap.encode(pending_inc.fullmap,
263 mon->get_quorum_con_features() | CEPH_FEATURE_RESERVED);
264 pending_inc.full_crc = newmap.get_crc();
265 dout(20) << " full crc " << pending_inc.full_crc << dendl;
266}
267
268void OSDMonitor::get_store_prefixes(std::set<string>& s)
269{
270 s.insert(service_name);
271 s.insert(OSD_PG_CREATING_PREFIX);
3efd9988 272 s.insert(OSD_METADATA_PREFIX);
7c673cae
FG
273}
274
275void OSDMonitor::update_from_paxos(bool *need_bootstrap)
276{
277 version_t version = get_last_committed();
278 if (version == osdmap.epoch)
279 return;
280 assert(version > osdmap.epoch);
281
282 dout(15) << "update_from_paxos paxos e " << version
283 << ", my e " << osdmap.epoch << dendl;
284
31f18b77
FG
285 if (mapping_job) {
286 if (!mapping_job->is_done()) {
287 dout(1) << __func__ << " mapping job "
288 << mapping_job.get() << " did not complete, "
289 << mapping_job->shards << " left, canceling" << dendl;
290 mapping_job->abort();
291 }
292 mapping_job.reset();
293 }
7c673cae 294
224ce89b
WB
295 load_health();
296
7c673cae
FG
297 /*
298 * We will possibly have a stashed latest that *we* wrote, and we will
299 * always be sure to have the oldest full map in the first..last range
300 * due to encode_trim_extra(), which includes the oldest full map in the trim
301 * transaction.
302 *
303 * encode_trim_extra() does not however write the full map's
304 * version to 'full_latest'. This is only done when we are building the
305 * full maps from the incremental versions. But don't panic! We make sure
306 * that the following conditions find whichever full map version is newer.
307 */
308 version_t latest_full = get_version_latest_full();
309 if (latest_full == 0 && get_first_committed() > 1)
310 latest_full = get_first_committed();
311
312 if (get_first_committed() > 1 &&
313 latest_full < get_first_committed()) {
314 // the monitor could be just sync'ed with its peer, and the latest_full key
315 // is not encoded in the paxos commits in encode_pending(), so we need to
316 // make sure we get it pointing to a proper version.
317 version_t lc = get_last_committed();
318 version_t fc = get_first_committed();
319
320 dout(10) << __func__ << " looking for valid full map in interval"
321 << " [" << fc << ", " << lc << "]" << dendl;
322
323 latest_full = 0;
324 for (version_t v = lc; v >= fc; v--) {
325 string full_key = "full_" + stringify(v);
326 if (mon->store->exists(get_service_name(), full_key)) {
327 dout(10) << __func__ << " found latest full map v " << v << dendl;
328 latest_full = v;
329 break;
330 }
331 }
332
333 assert(latest_full > 0);
334 auto t(std::make_shared<MonitorDBStore::Transaction>());
335 put_version_latest_full(t, latest_full);
336 mon->store->apply_transaction(t);
337 dout(10) << __func__ << " updated the on-disk full map version to "
338 << latest_full << dendl;
339 }
340
341 if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
342 bufferlist latest_bl;
343 get_version_full(latest_full, latest_bl);
344 assert(latest_bl.length() != 0);
345 dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
346 osdmap.decode(latest_bl);
347 }
348
349 if (mon->monmap->get_required_features().contains_all(
350 ceph::features::mon::FEATURE_LUMINOUS)) {
351 bufferlist bl;
352 if (!mon->store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
353 auto p = bl.begin();
354 std::lock_guard<std::mutex> l(creating_pgs_lock);
355 creating_pgs.decode(p);
356 dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
357 << creating_pgs.last_scan_epoch
358 << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
359 } else {
360 dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
361 << dendl;
362 }
363 }
364
31f18b77
FG
365 // make sure we're using the right pg service.. remove me post-luminous!
366 if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
367 dout(10) << __func__ << " pgservice is mgrstat" << dendl;
368 mon->pgservice = mon->mgrstatmon()->get_pg_stat_service();
369 } else {
370 dout(10) << __func__ << " pgservice is pg" << dendl;
371 mon->pgservice = mon->pgmon()->get_pg_stat_service();
372 }
373
7c673cae
FG
374 // walk through incrementals
375 MonitorDBStore::TransactionRef t;
376 size_t tx_size = 0;
377 while (version > osdmap.epoch) {
378 bufferlist inc_bl;
379 int err = get_version(osdmap.epoch+1, inc_bl);
380 assert(err == 0);
381 assert(inc_bl.length());
382
383 dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
384 << dendl;
385 OSDMap::Incremental inc(inc_bl);
386 err = osdmap.apply_incremental(inc);
387 assert(err == 0);
388
389 if (!t)
390 t.reset(new MonitorDBStore::Transaction);
391
392 // Write out the full map for all past epochs. Encode the full
393 // map with the same features as the incremental. If we don't
394 // know, use the quorum features. If we don't know those either,
395 // encode with all features.
396 uint64_t f = inc.encode_features;
397 if (!f)
398 f = mon->get_quorum_con_features();
399 if (!f)
400 f = -1;
401 bufferlist full_bl;
402 osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
403 tx_size += full_bl.length();
404
405 bufferlist orig_full_bl;
406 get_version_full(osdmap.epoch, orig_full_bl);
407 if (orig_full_bl.length()) {
408 // the primary provided the full map
409 assert(inc.have_crc);
410 if (inc.full_crc != osdmap.crc) {
411 // This will happen if the mons were running mixed versions in
412 // the past or some other circumstance made the full encoded
413 // maps divergent. Reloading here will bring us back into
414 // sync with the primary for this and all future maps. OSDs
415 // will also be brought back into sync when they discover the
416 // crc mismatch and request a full map from a mon.
417 derr << __func__ << " full map CRC mismatch, resetting to canonical"
418 << dendl;
419 osdmap = OSDMap();
420 osdmap.decode(orig_full_bl);
421 }
422 } else {
423 assert(!inc.have_crc);
424 put_version_full(t, osdmap.epoch, full_bl);
425 }
426 put_version_latest_full(t, osdmap.epoch);
427
428 // share
429 dout(1) << osdmap << dendl;
430
431 if (osdmap.epoch == 1) {
432 t->erase("mkfs", "osdmap");
433 }
434
31f18b77
FG
435 // make sure we're using the right pg service.. remove me post-luminous!
436 if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
437 dout(10) << __func__ << " pgservice is mgrstat" << dendl;
438 mon->pgservice = mon->mgrstatmon()->get_pg_stat_service();
439 } else {
440 dout(10) << __func__ << " pgservice is pg" << dendl;
441 mon->pgservice = mon->pgmon()->get_pg_stat_service();
442 }
443
7c673cae
FG
444 if (tx_size > g_conf->mon_sync_max_payload_size*2) {
445 mon->store->apply_transaction(t);
446 t = MonitorDBStore::TransactionRef();
447 tx_size = 0;
448 }
449 if (mon->monmap->get_required_features().contains_all(
450 ceph::features::mon::FEATURE_LUMINOUS)) {
7c673cae
FG
451 for (const auto &osd_state : inc.new_state) {
452 if (osd_state.second & CEPH_OSD_UP) {
453 // could be marked up *or* down, but we're too lazy to check which
454 last_osd_report.erase(osd_state.first);
455 }
456 if (osd_state.second & CEPH_OSD_EXISTS) {
457 // could be created *or* destroyed, but we can safely drop it
458 osd_epochs.erase(osd_state.first);
459 }
460 }
461 }
462 }
463
464 if (t) {
465 mon->store->apply_transaction(t);
466 }
467
468 for (int o = 0; o < osdmap.get_max_osd(); o++) {
469 if (osdmap.is_out(o))
470 continue;
471 auto found = down_pending_out.find(o);
472 if (osdmap.is_down(o)) {
473 // populate down -> out map
474 if (found == down_pending_out.end()) {
475 dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
476 down_pending_out[o] = ceph_clock_now();
477 }
478 } else {
479 if (found != down_pending_out.end()) {
480 dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
481 down_pending_out.erase(found);
482 }
483 }
484 }
485 // XXX: need to trim MonSession connected with a osd whose id > max_osd?
486
487 if (mon->is_leader()) {
488 // kick pgmon, make sure it's seen the latest map
489 mon->pgmon()->check_osd_map(osdmap.epoch);
490 }
491
492 check_osdmap_subs();
493 check_pg_creates_subs();
494
495 share_map_with_random_osd();
496 update_logger();
497
498 process_failures();
499
500 // make sure our feature bits reflect the latest map
501 update_msgr_features();
502
503 if (!mon->is_leader()) {
504 // will be called by on_active() on the leader, avoid doing so twice
505 start_mapping();
506 }
507}
508
509void OSDMonitor::start_mapping()
510{
511 // initiate mapping job
512 if (mapping_job) {
513 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
514 << dendl;
515 mapping_job->abort();
516 }
224ce89b
WB
517 if (!osdmap.get_pools().empty()) {
518 auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
519 mapping_job = mapping.start_update(osdmap, mapper,
520 g_conf->mon_osd_mapping_pgs_per_chunk);
521 dout(10) << __func__ << " started mapping job " << mapping_job.get()
522 << " at " << fin->start << dendl;
523 mapping_job->set_finish_event(fin);
524 } else {
525 dout(10) << __func__ << " no pools, no mapping job" << dendl;
526 mapping_job = nullptr;
527 }
7c673cae
FG
528}
529
530void OSDMonitor::update_msgr_features()
531{
532 set<int> types;
533 types.insert((int)entity_name_t::TYPE_OSD);
534 types.insert((int)entity_name_t::TYPE_CLIENT);
535 types.insert((int)entity_name_t::TYPE_MDS);
536 types.insert((int)entity_name_t::TYPE_MON);
537 for (set<int>::iterator q = types.begin(); q != types.end(); ++q) {
538 uint64_t mask;
539 uint64_t features = osdmap.get_features(*q, &mask);
540 if ((mon->messenger->get_policy(*q).features_required & mask) != features) {
541 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
542 Messenger::Policy p = mon->messenger->get_policy(*q);
543 p.features_required = (p.features_required & ~mask) | features;
544 mon->messenger->set_policy(*q, p);
545 }
546 }
547}
548
549void OSDMonitor::on_active()
550{
551 update_logger();
552
553 if (mon->is_leader()) {
224ce89b 554 mon->clog->debug() << "osdmap " << osdmap;
7c673cae
FG
555 } else {
556 list<MonOpRequestRef> ls;
557 take_all_failures(ls);
558 while (!ls.empty()) {
559 MonOpRequestRef op = ls.front();
560 op->mark_osdmon_event(__func__);
561 dispatch(op);
562 ls.pop_front();
563 }
564 }
565 start_mapping();
566}
567
568void OSDMonitor::on_restart()
569{
570 last_osd_report.clear();
571}
572
573void OSDMonitor::on_shutdown()
574{
575 dout(10) << __func__ << dendl;
576 if (mapping_job) {
577 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
578 << dendl;
579 mapping_job->abort();
580 }
581
582 // discard failure info, waiters
583 list<MonOpRequestRef> ls;
584 take_all_failures(ls);
585 ls.clear();
586}
587
588void OSDMonitor::update_logger()
589{
590 dout(10) << "update_logger" << dendl;
591
592 mon->cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
593 mon->cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
594 mon->cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
595 mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
596}
597
7c673cae
FG
598void OSDMonitor::create_pending()
599{
600 pending_inc = OSDMap::Incremental(osdmap.epoch+1);
601 pending_inc.fsid = mon->monmap->fsid;
602
603 dout(10) << "create_pending e " << pending_inc.epoch << dendl;
604
605 // clean up pg_temp, primary_temp
606 OSDMap::clean_temps(g_ceph_context, osdmap, &pending_inc);
607 dout(10) << "create_pending did clean_temps" << dendl;
608
609 // On upgrade OSDMap has new field set by mon_osd_backfillfull_ratio config
610 // instead of osd_backfill_full_ratio config
611 if (osdmap.backfillfull_ratio <= 0) {
612 pending_inc.new_backfillfull_ratio = g_conf->mon_osd_backfillfull_ratio;
613 if (pending_inc.new_backfillfull_ratio > 1.0)
614 pending_inc.new_backfillfull_ratio /= 100;
615 dout(1) << __func__ << " setting backfillfull_ratio = "
616 << pending_inc.new_backfillfull_ratio << dendl;
617 }
31f18b77
FG
618 if (osdmap.get_epoch() > 0 &&
619 osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
7c673cae 620 // transition full ratios from PGMap to OSDMap (on upgrade)
31f18b77
FG
621 float full_ratio = mon->pgservice->get_full_ratio();
622 float nearfull_ratio = mon->pgservice->get_nearfull_ratio();
623 if (osdmap.full_ratio != full_ratio) {
7c673cae 624 dout(10) << __func__ << " full_ratio " << osdmap.full_ratio
31f18b77
FG
625 << " -> " << full_ratio << " (from pgmap)" << dendl;
626 pending_inc.new_full_ratio = full_ratio;
7c673cae 627 }
31f18b77 628 if (osdmap.nearfull_ratio != nearfull_ratio) {
7c673cae 629 dout(10) << __func__ << " nearfull_ratio " << osdmap.nearfull_ratio
31f18b77
FG
630 << " -> " << nearfull_ratio << " (from pgmap)" << dendl;
631 pending_inc.new_nearfull_ratio = nearfull_ratio;
7c673cae
FG
632 }
633 } else {
634 // safety check (this shouldn't really happen)
635 if (osdmap.full_ratio <= 0) {
636 pending_inc.new_full_ratio = g_conf->mon_osd_full_ratio;
637 if (pending_inc.new_full_ratio > 1.0)
638 pending_inc.new_full_ratio /= 100;
639 dout(1) << __func__ << " setting full_ratio = "
640 << pending_inc.new_full_ratio << dendl;
641 }
642 if (osdmap.nearfull_ratio <= 0) {
643 pending_inc.new_nearfull_ratio = g_conf->mon_osd_nearfull_ratio;
644 if (pending_inc.new_nearfull_ratio > 1.0)
645 pending_inc.new_nearfull_ratio /= 100;
646 dout(1) << __func__ << " setting nearfull_ratio = "
647 << pending_inc.new_nearfull_ratio << dendl;
648 }
649 }
3efd9988
FG
650
651 // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
652 // structure.
653 if (osdmap.crush->has_legacy_rule_ids()) {
654 CrushWrapper newcrush;
655 _get_pending_crush(newcrush);
656
657 // First, for all pools, work out which rule they really used
658 // by resolving ruleset to rule.
659 for (const auto &i : osdmap.get_pools()) {
660 const auto pool_id = i.first;
661 const auto &pool = i.second;
662 int new_rule_id = newcrush.find_rule(pool.crush_rule,
663 pool.type, pool.size);
664
665 dout(1) << __func__ << " rewriting pool "
666 << osdmap.get_pool_name(pool_id) << " crush ruleset "
667 << pool.crush_rule << " -> rule id " << new_rule_id << dendl;
668 if (pending_inc.new_pools.count(pool_id) == 0) {
669 pending_inc.new_pools[pool_id] = pool;
670 }
671 pending_inc.new_pools[pool_id].crush_rule = new_rule_id;
672 }
673
674 // Now, go ahead and renumber all the rules so that their
675 // rule_id field corresponds to their position in the array
676 auto old_to_new = newcrush.renumber_rules();
677 dout(1) << __func__ << " Rewrote " << old_to_new << " crush IDs:" << dendl;
678 for (const auto &i : old_to_new) {
679 dout(1) << __func__ << " " << i.first << " -> " << i.second << dendl;
680 }
681 pending_inc.crush.clear();
682 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
683 }
7c673cae
FG
684}
685
686creating_pgs_t
94b18763
FG
687OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
688 const OSDMap& nextmap)
7c673cae 689{
31f18b77 690 dout(10) << __func__ << dendl;
7c673cae
FG
691 creating_pgs_t pending_creatings;
692 {
693 std::lock_guard<std::mutex> l(creating_pgs_lock);
694 pending_creatings = creating_pgs;
695 }
31f18b77
FG
696 // check for new or old pools
697 if (pending_creatings.last_scan_epoch < inc.epoch) {
698 if (osdmap.get_epoch() &&
699 osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
700 auto added =
701 mon->pgservice->maybe_add_creating_pgs(creating_pgs.last_scan_epoch,
702 osdmap.get_pools(),
703 &pending_creatings);
704 dout(7) << __func__ << " " << added << " pgs added from pgmap" << dendl;
705 }
706 unsigned queued = 0;
707 queued += scan_for_creating_pgs(osdmap.get_pools(),
708 inc.old_pools,
709 inc.modified,
710 &pending_creatings);
711 queued += scan_for_creating_pgs(inc.new_pools,
712 inc.old_pools,
713 inc.modified,
714 &pending_creatings);
715 dout(10) << __func__ << " " << queued << " pools queued" << dendl;
716 for (auto deleted_pool : inc.old_pools) {
717 auto removed = pending_creatings.remove_pool(deleted_pool);
718 dout(10) << __func__ << " " << removed
719 << " pg removed because containing pool deleted: "
720 << deleted_pool << dendl;
721 last_epoch_clean.remove_pool(deleted_pool);
722 }
723 // pgmon updates its creating_pgs in check_osd_map() which is called by
724 // on_active() and check_osd_map() could be delayed if lease expires, so its
725 // creating_pgs could be stale in comparison with the one of osdmon. let's
726 // trim them here. otherwise, they will be added back after being erased.
727 unsigned removed = 0;
728 for (auto& pg : pending_created_pgs) {
729 dout(20) << __func__ << " noting created pg " << pg << dendl;
730 pending_creatings.created_pools.insert(pg.pool());
731 removed += pending_creatings.pgs.erase(pg);
732 }
733 pending_created_pgs.clear();
734 dout(10) << __func__ << " " << removed
735 << " pgs removed because they're created" << dendl;
736 pending_creatings.last_scan_epoch = osdmap.get_epoch();
737 }
738
94b18763
FG
739 // filter out any pgs that shouldn't exist.
740 {
741 auto i = pending_creatings.pgs.begin();
742 while (i != pending_creatings.pgs.end()) {
743 if (!nextmap.pg_exists(i->first)) {
744 dout(10) << __func__ << " removing pg " << i->first
745 << " which should not exist" << dendl;
746 i = pending_creatings.pgs.erase(i);
747 } else {
748 ++i;
749 }
750 }
751 }
752
31f18b77
FG
753 // process queue
754 unsigned max = MAX(1, g_conf->mon_osd_max_creating_pgs);
755 const auto total = pending_creatings.pgs.size();
756 while (pending_creatings.pgs.size() < max &&
757 !pending_creatings.queue.empty()) {
758 auto p = pending_creatings.queue.begin();
759 int64_t poolid = p->first;
760 dout(10) << __func__ << " pool " << poolid
761 << " created " << p->second.created
762 << " modified " << p->second.modified
763 << " [" << p->second.start << "-" << p->second.end << ")"
764 << dendl;
765 int n = MIN(max - pending_creatings.pgs.size(),
766 p->second.end - p->second.start);
767 ps_t first = p->second.start;
768 ps_t end = first + n;
769 for (ps_t ps = first; ps < end; ++ps) {
770 const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
771 // NOTE: use the *current* epoch as the PG creation epoch so that the
772 // OSD does not have to generate a long set of PastIntervals.
773 pending_creatings.pgs.emplace(pgid, make_pair(inc.epoch,
774 p->second.modified));
775 dout(10) << __func__ << " adding " << pgid << dendl;
776 }
777 p->second.start = end;
778 if (p->second.done()) {
779 dout(10) << __func__ << " done with queue for " << poolid << dendl;
780 pending_creatings.queue.erase(p);
781 } else {
782 dout(10) << __func__ << " pool " << poolid
783 << " now [" << p->second.start << "-" << p->second.end << ")"
784 << dendl;
785 }
786 }
787 dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
788 << " pools" << dendl;
c07f9fc5
FG
789 dout(10) << __func__
790 << " " << (pending_creatings.pgs.size() - total)
791 << "/" << pending_creatings.pgs.size()
31f18b77 792 << " pgs added from queued pools" << dendl;
7c673cae
FG
793 return pending_creatings;
794}
795
796void OSDMonitor::maybe_prime_pg_temp()
797{
798 bool all = false;
799 if (pending_inc.crush.length()) {
800 dout(10) << __func__ << " new crush map, all" << dendl;
801 all = true;
802 }
803
804 if (!pending_inc.new_up_client.empty()) {
805 dout(10) << __func__ << " new up osds, all" << dendl;
806 all = true;
807 }
808
809 // check for interesting OSDs
810 set<int> osds;
31f18b77 811 for (auto p = pending_inc.new_state.begin();
7c673cae
FG
812 !all && p != pending_inc.new_state.end();
813 ++p) {
814 if ((p->second & CEPH_OSD_UP) &&
815 osdmap.is_up(p->first)) {
816 osds.insert(p->first);
817 }
818 }
819 for (map<int32_t,uint32_t>::iterator p = pending_inc.new_weight.begin();
820 !all && p != pending_inc.new_weight.end();
821 ++p) {
822 if (p->second < osdmap.get_weight(p->first)) {
823 // weight reduction
824 osds.insert(p->first);
825 } else {
826 dout(10) << __func__ << " osd." << p->first << " weight increase, all"
827 << dendl;
828 all = true;
829 }
830 }
831
832 if (!all && osds.empty())
833 return;
834
835 if (!all) {
836 unsigned estimate =
837 mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
838 if (estimate > mapping.get_num_pgs() *
839 g_conf->mon_osd_prime_pg_temp_max_estimate) {
840 dout(10) << __func__ << " estimate " << estimate << " pgs on "
841 << osds.size() << " osds >= "
842 << g_conf->mon_osd_prime_pg_temp_max_estimate << " of total "
843 << mapping.get_num_pgs() << " pgs, all"
844 << dendl;
845 all = true;
846 } else {
847 dout(10) << __func__ << " estimate " << estimate << " pgs on "
848 << osds.size() << " osds" << dendl;
849 }
850 }
851
852 OSDMap next;
853 next.deepish_copy_from(osdmap);
854 next.apply_incremental(pending_inc);
855
224ce89b
WB
856 if (next.get_pools().empty()) {
857 dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
858 } else if (all) {
7c673cae
FG
859 PrimeTempJob job(next, this);
860 mapper.queue(&job, g_conf->mon_osd_mapping_pgs_per_chunk);
861 if (job.wait_for(g_conf->mon_osd_prime_pg_temp_max_time)) {
862 dout(10) << __func__ << " done in " << job.get_duration() << dendl;
863 } else {
864 dout(10) << __func__ << " did not finish in "
865 << g_conf->mon_osd_prime_pg_temp_max_time
866 << ", stopping" << dendl;
867 job.abort();
868 }
869 } else {
870 dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
871 utime_t stop = ceph_clock_now();
872 stop += g_conf->mon_osd_prime_pg_temp_max_time;
873 const int chunk = 1000;
874 int n = chunk;
875 std::unordered_set<pg_t> did_pgs;
876 for (auto osd : osds) {
877 auto& pgs = mapping.get_osd_acting_pgs(osd);
878 dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
879 for (auto pgid : pgs) {
880 if (!did_pgs.insert(pgid).second) {
881 continue;
882 }
883 prime_pg_temp(next, pgid);
884 if (--n <= 0) {
885 n = chunk;
886 if (ceph_clock_now() > stop) {
887 dout(10) << __func__ << " consumed more than "
888 << g_conf->mon_osd_prime_pg_temp_max_time
889 << " seconds, stopping"
890 << dendl;
891 return;
892 }
893 }
894 }
895 }
896 }
897}
898
899void OSDMonitor::prime_pg_temp(
900 const OSDMap& next,
901 pg_t pgid)
902{
903 if (mon->monmap->get_required_features().contains_all(
904 ceph::features::mon::FEATURE_LUMINOUS)) {
31f18b77 905 // TODO: remove this creating_pgs direct access?
7c673cae
FG
906 if (creating_pgs.pgs.count(pgid)) {
907 return;
908 }
909 } else {
31f18b77 910 if (mon->pgservice->is_creating_pg(pgid)) {
7c673cae
FG
911 return;
912 }
913 }
914 if (!osdmap.pg_exists(pgid)) {
915 return;
916 }
917
918 vector<int> up, acting;
919 mapping.get(pgid, &up, nullptr, &acting, nullptr);
920
921 vector<int> next_up, next_acting;
922 int next_up_primary, next_acting_primary;
923 next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
924 &next_acting, &next_acting_primary);
c07f9fc5 925 if (acting == next_acting && next_up != next_acting)
7c673cae
FG
926 return; // no change since last epoch
927
928 if (acting.empty())
929 return; // if previously empty now we can be no worse off
930 const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
931 if (pool && acting.size() < pool->min_size)
932 return; // can be no worse off than before
933
c07f9fc5
FG
934 if (next_up == next_acting) {
935 acting.clear();
936 dout(20) << __func__ << "next_up === next_acting now, clear pg_temp"
937 << dendl;
938 }
939
7c673cae
FG
940 dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
941 << " -> " << next_up << "/" << next_acting
942 << ", priming " << acting
943 << dendl;
944 {
945 Mutex::Locker l(prime_pg_temp_lock);
946 // do not touch a mapping if a change is pending
947 pending_inc.new_pg_temp.emplace(
948 pgid,
949 mempool::osdmap::vector<int>(acting.begin(), acting.end()));
950 }
951}
952
953/**
954 * @note receiving a transaction in this function gives a fair amount of
955 * freedom to the service implementation if it does need it. It shouldn't.
956 */
957void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
958{
959 dout(10) << "encode_pending e " << pending_inc.epoch
960 << dendl;
961
962 // finalize up pending_inc
963 pending_inc.modified = ceph_clock_now();
964
965 int r = pending_inc.propagate_snaps_to_tiers(g_ceph_context, osdmap);
966 assert(r == 0);
967
968 if (mapping_job) {
969 if (!mapping_job->is_done()) {
970 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
971 << mapping_job.get() << " did not complete, "
972 << mapping_job->shards << " left" << dendl;
973 mapping_job->abort();
974 } else if (mapping.get_epoch() < osdmap.get_epoch()) {
975 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
976 << mapping_job.get() << " is prior epoch "
977 << mapping.get_epoch() << dendl;
978 } else {
979 if (g_conf->mon_osd_prime_pg_temp) {
980 maybe_prime_pg_temp();
981 }
982 }
983 } else if (g_conf->mon_osd_prime_pg_temp) {
984 dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
985 << dendl;
986 }
987 mapping_job.reset();
988
c07f9fc5
FG
989 // ensure we don't have blank new_state updates. these are interrpeted as
990 // CEPH_OSD_UP (and almost certainly not what we want!).
991 auto p = pending_inc.new_state.begin();
992 while (p != pending_inc.new_state.end()) {
993 if (p->second == 0) {
994 dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
995 p = pending_inc.new_state.erase(p);
996 } else {
997 ++p;
998 }
999 }
1000
7c673cae
FG
1001 bufferlist bl;
1002
1003 {
1004 OSDMap tmp;
1005 tmp.deepish_copy_from(osdmap);
1006 tmp.apply_incremental(pending_inc);
1007
31f18b77 1008 if (tmp.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
3efd9988
FG
1009 // remove any legacy osdmap nearfull/full flags
1010 {
1011 if (tmp.test_flag(CEPH_OSDMAP_FULL | CEPH_OSDMAP_NEARFULL)) {
1012 dout(10) << __func__ << " clearing legacy osdmap nearfull/full flag"
1013 << dendl;
1014 remove_flag(CEPH_OSDMAP_NEARFULL);
1015 remove_flag(CEPH_OSDMAP_FULL);
1016 }
1017 }
1018 // collect which pools are currently affected by
1019 // the near/backfill/full osd(s),
1020 // and set per-pool near/backfill/full flag instead
1021 set<int64_t> full_pool_ids;
1022 set<int64_t> backfillfull_pool_ids;
1023 set<int64_t> nearfull_pool_ids;
1024 tmp.get_full_pools(g_ceph_context,
1025 &full_pool_ids,
1026 &backfillfull_pool_ids,
1027 &nearfull_pool_ids);
1028 if (full_pool_ids.empty() ||
1029 backfillfull_pool_ids.empty() ||
1030 nearfull_pool_ids.empty()) {
1031 // normal case - no nearfull, backfillfull or full osds
1032 // try cancel any improper nearfull/backfillfull/full pool
1033 // flags first
1034 for (auto &pool: tmp.get_pools()) {
1035 auto p = pool.first;
1036 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
1037 nearfull_pool_ids.empty()) {
1038 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1039 << "'s nearfull flag" << dendl;
1040 if (pending_inc.new_pools.count(p) == 0) {
1041 // load original pool info first!
1042 pending_inc.new_pools[p] = pool.second;
1043 }
1044 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1045 }
1046 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
1047 backfillfull_pool_ids.empty()) {
1048 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1049 << "'s backfillfull flag" << dendl;
1050 if (pending_inc.new_pools.count(p) == 0) {
1051 pending_inc.new_pools[p] = pool.second;
1052 }
1053 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1054 }
1055 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
1056 full_pool_ids.empty()) {
1057 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
1058 // set by EQUOTA, skipping
1059 continue;
1060 }
1061 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1062 << "'s full flag" << dendl;
1063 if (pending_inc.new_pools.count(p) == 0) {
1064 pending_inc.new_pools[p] = pool.second;
1065 }
1066 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1067 }
1068 }
1069 }
1070 if (!full_pool_ids.empty()) {
1071 dout(10) << __func__ << " marking pool(s) " << full_pool_ids
1072 << " as full" << dendl;
1073 for (auto &p: full_pool_ids) {
1074 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
1075 continue;
1076 }
1077 if (pending_inc.new_pools.count(p) == 0) {
1078 pending_inc.new_pools[p] = tmp.pools[p];
1079 }
1080 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
1081 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1082 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1083 }
1084 // cancel FLAG_FULL for pools which are no longer full too
1085 for (auto &pool: tmp.get_pools()) {
1086 auto p = pool.first;
1087 if (full_pool_ids.count(p)) {
1088 // skip pools we have just marked as full above
1089 continue;
1090 }
1091 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
1092 tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
1093 // don't touch if currently is not full
1094 // or is running out of quota (and hence considered as full)
1095 continue;
1096 }
1097 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1098 << "'s full flag" << dendl;
1099 if (pending_inc.new_pools.count(p) == 0) {
1100 pending_inc.new_pools[p] = pool.second;
1101 }
1102 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1103 }
1104 }
1105 if (!backfillfull_pool_ids.empty()) {
1106 for (auto &p: backfillfull_pool_ids) {
1107 if (full_pool_ids.count(p)) {
1108 // skip pools we have already considered as full above
1109 continue;
1110 }
1111 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
1112 // make sure FLAG_FULL is truly set, so we are safe not
1113 // to set a extra (redundant) FLAG_BACKFILLFULL flag
1114 assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1115 continue;
1116 }
1117 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1118 // don't bother if pool is already marked as backfillfull
1119 continue;
1120 }
1121 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1122 << "'s as backfillfull" << dendl;
1123 if (pending_inc.new_pools.count(p) == 0) {
1124 pending_inc.new_pools[p] = tmp.pools[p];
1125 }
1126 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
1127 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1128 }
1129 // cancel FLAG_BACKFILLFULL for pools
1130 // which are no longer backfillfull too
1131 for (auto &pool: tmp.get_pools()) {
1132 auto p = pool.first;
1133 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1134 // skip pools we have just marked as backfillfull/full above
1135 continue;
1136 }
1137 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1138 // and don't touch if currently is not backfillfull
1139 continue;
1140 }
1141 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1142 << "'s backfillfull flag" << dendl;
1143 if (pending_inc.new_pools.count(p) == 0) {
1144 pending_inc.new_pools[p] = pool.second;
1145 }
1146 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1147 }
1148 }
1149 if (!nearfull_pool_ids.empty()) {
1150 for (auto &p: nearfull_pool_ids) {
1151 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1152 continue;
1153 }
1154 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
1155 // make sure FLAG_FULL is truly set, so we are safe not
1156 // to set a extra (redundant) FLAG_NEARFULL flag
1157 assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1158 continue;
1159 }
1160 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1161 // don't bother if pool is already marked as nearfull
1162 continue;
1163 }
1164 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1165 << "'s as nearfull" << dendl;
1166 if (pending_inc.new_pools.count(p) == 0) {
1167 pending_inc.new_pools[p] = tmp.pools[p];
1168 }
1169 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
1170 }
1171 // cancel FLAG_NEARFULL for pools
1172 // which are no longer nearfull too
1173 for (auto &pool: tmp.get_pools()) {
1174 auto p = pool.first;
1175 if (full_pool_ids.count(p) ||
1176 backfillfull_pool_ids.count(p) ||
1177 nearfull_pool_ids.count(p)) {
1178 // skip pools we have just marked as
1179 // nearfull/backfillfull/full above
1180 continue;
1181 }
1182 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1183 // and don't touch if currently is not nearfull
1184 continue;
1185 }
1186 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1187 << "'s nearfull flag" << dendl;
1188 if (pending_inc.new_pools.count(p) == 0) {
1189 pending_inc.new_pools[p] = pool.second;
1190 }
1191 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1192 }
7c673cae
FG
1193 }
1194
1195 // min_compat_client?
31f18b77 1196 if (tmp.require_min_compat_client == 0) {
7c673cae 1197 auto mv = tmp.get_min_compat_client();
31f18b77
FG
1198 dout(1) << __func__ << " setting require_min_compat_client to currently "
1199 << "required " << ceph_release_name(mv) << dendl;
1200 mon->clog->info() << "setting require_min_compat_client to currently "
1201 << "required " << ceph_release_name(mv);
1202 pending_inc.new_require_min_compat_client = mv;
7c673cae 1203 }
224ce89b
WB
1204
1205 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
1206 // convert ec profile ruleset-* -> crush-*
1207 for (auto& p : tmp.erasure_code_profiles) {
1208 bool changed = false;
1209 map<string,string> newprofile;
1210 for (auto& q : p.second) {
1211 if (q.first.find("ruleset-") == 0) {
1212 string key = "crush-";
1213 key += q.first.substr(8);
1214 newprofile[key] = q.second;
1215 changed = true;
1216 dout(20) << " updating ec profile " << p.first
1217 << " key " << q.first << " -> " << key << dendl;
1218 } else {
1219 newprofile[q.first] = q.second;
1220 }
1221 }
1222 if (changed) {
1223 dout(10) << " updated ec profile " << p.first << ": "
1224 << newprofile << dendl;
1225 pending_inc.new_erasure_code_profiles[p.first] = newprofile;
1226 }
1227 }
c07f9fc5
FG
1228
1229 // auto-enable pool applications upon upgrade
1230 // NOTE: this can be removed post-Luminous assuming upgrades need to
1231 // proceed through Luminous
1232 for (auto &pool_pair : tmp.pools) {
1233 int64_t pool_id = pool_pair.first;
1234 pg_pool_t pg_pool = pool_pair.second;
1235 if (pg_pool.is_tier()) {
1236 continue;
1237 }
1238
1239 std::string pool_name = tmp.get_pool_name(pool_id);
1240 uint32_t match_count = 0;
1241
1242 // CephFS
1243 FSMap const &pending_fsmap = mon->mdsmon()->get_pending();
1244 if (pending_fsmap.pool_in_use(pool_id)) {
1245 dout(10) << __func__ << " auto-enabling CephFS on pool '"
1246 << pool_name << "'" << dendl;
1247 pg_pool.application_metadata.insert(
1248 {pg_pool_t::APPLICATION_NAME_CEPHFS, {}});
1249 ++match_count;
1250 }
1251
1252 // RBD heuristics (default OpenStack pool names from docs and
1253 // ceph-ansible)
1254 if (boost::algorithm::contains(pool_name, "rbd") ||
1255 pool_name == "images" || pool_name == "volumes" ||
1256 pool_name == "backups" || pool_name == "vms") {
1257 dout(10) << __func__ << " auto-enabling RBD on pool '"
1258 << pool_name << "'" << dendl;
1259 pg_pool.application_metadata.insert(
1260 {pg_pool_t::APPLICATION_NAME_RBD, {}});
1261 ++match_count;
1262 }
1263
1264 // RGW heuristics
1265 if (boost::algorithm::contains(pool_name, ".rgw") ||
1266 boost::algorithm::contains(pool_name, ".log") ||
1267 boost::algorithm::contains(pool_name, ".intent-log") ||
1268 boost::algorithm::contains(pool_name, ".usage") ||
1269 boost::algorithm::contains(pool_name, ".users")) {
1270 dout(10) << __func__ << " auto-enabling RGW on pool '"
1271 << pool_name << "'" << dendl;
1272 pg_pool.application_metadata.insert(
1273 {pg_pool_t::APPLICATION_NAME_RGW, {}});
1274 ++match_count;
1275 }
1276
1277 // OpenStack gnocchi (from ceph-ansible)
1278 if (pool_name == "metrics" && match_count == 0) {
1279 dout(10) << __func__ << " auto-enabling OpenStack Gnocchi on pool '"
1280 << pool_name << "'" << dendl;
1281 pg_pool.application_metadata.insert({"openstack_gnocchi", {}});
1282 ++match_count;
1283 }
1284
1285 if (match_count == 1) {
1286 pg_pool.last_change = pending_inc.epoch;
1287 pending_inc.new_pools[pool_id] = pg_pool;
1288 } else if (match_count > 1) {
1289 auto pstat = mon->pgservice->get_pool_stat(pool_id);
1290 if (pstat != nullptr && pstat->stats.sum.num_objects > 0) {
1291 mon->clog->info() << "unable to auto-enable application for pool "
1292 << "'" << pool_name << "'";
1293 }
1294 }
1295 }
224ce89b 1296 }
7c673cae
FG
1297 }
1298 }
1299
1300 // tell me about it
31f18b77 1301 for (auto i = pending_inc.new_state.begin();
7c673cae
FG
1302 i != pending_inc.new_state.end();
1303 ++i) {
1304 int s = i->second ? i->second : CEPH_OSD_UP;
1305 if (s & CEPH_OSD_UP)
1306 dout(2) << " osd." << i->first << " DOWN" << dendl;
1307 if (s & CEPH_OSD_EXISTS)
1308 dout(2) << " osd." << i->first << " DNE" << dendl;
1309 }
1310 for (map<int32_t,entity_addr_t>::iterator i = pending_inc.new_up_client.begin();
1311 i != pending_inc.new_up_client.end();
1312 ++i) {
1313 //FIXME: insert cluster addresses too
1314 dout(2) << " osd." << i->first << " UP " << i->second << dendl;
1315 }
1316 for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
1317 i != pending_inc.new_weight.end();
1318 ++i) {
1319 if (i->second == CEPH_OSD_OUT) {
1320 dout(2) << " osd." << i->first << " OUT" << dendl;
1321 } else if (i->second == CEPH_OSD_IN) {
1322 dout(2) << " osd." << i->first << " IN" << dendl;
1323 } else {
1324 dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
1325 }
1326 }
1327
94b18763
FG
1328 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1329 osdmap.maybe_remove_pg_upmaps(cct, osdmap, &pending_inc);
1330
7c673cae
FG
1331 // features for osdmap and its incremental
1332 uint64_t features = mon->get_quorum_con_features();
1333
1334 // encode full map and determine its crc
1335 OSDMap tmp;
1336 {
1337 tmp.deepish_copy_from(osdmap);
1338 tmp.apply_incremental(pending_inc);
1339
1340 // determine appropriate features
31f18b77 1341 if (tmp.require_osd_release < CEPH_RELEASE_LUMINOUS) {
7c673cae
FG
1342 dout(10) << __func__ << " encoding without feature SERVER_LUMINOUS"
1343 << dendl;
1344 features &= ~CEPH_FEATURE_SERVER_LUMINOUS;
1345 }
31f18b77 1346 if (tmp.require_osd_release < CEPH_RELEASE_KRAKEN) {
7c673cae
FG
1347 dout(10) << __func__ << " encoding without feature SERVER_KRAKEN | "
1348 << "MSG_ADDR2" << dendl;
1349 features &= ~(CEPH_FEATURE_SERVER_KRAKEN |
1350 CEPH_FEATURE_MSG_ADDR2);
1351 }
31f18b77
FG
1352 if (tmp.require_osd_release < CEPH_RELEASE_JEWEL) {
1353 dout(10) << __func__ << " encoding without feature SERVER_JEWEL" << dendl;
1354 features &= ~CEPH_FEATURE_SERVER_JEWEL;
1355 }
7c673cae
FG
1356 dout(10) << __func__ << " encoding full map with " << features << dendl;
1357
1358 bufferlist fullbl;
1359 ::encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
1360 pending_inc.full_crc = tmp.get_crc();
1361
1362 // include full map in the txn. note that old monitors will
1363 // overwrite this. new ones will now skip the local full map
1364 // encode and reload from this.
1365 put_version_full(t, pending_inc.epoch, fullbl);
1366 }
1367
1368 // encode
1369 assert(get_last_committed() + 1 == pending_inc.epoch);
1370 ::encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);
1371
1372 dout(20) << " full_crc " << tmp.get_crc()
1373 << " inc_crc " << pending_inc.inc_crc << dendl;
1374
1375 /* put everything in the transaction */
1376 put_version(t, pending_inc.epoch, bl);
1377 put_last_committed(t, pending_inc.epoch);
1378
1379 // metadata, too!
1380 for (map<int,bufferlist>::iterator p = pending_metadata.begin();
1381 p != pending_metadata.end();
1382 ++p)
1383 t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
1384 for (set<int>::iterator p = pending_metadata_rm.begin();
1385 p != pending_metadata_rm.end();
1386 ++p)
1387 t->erase(OSD_METADATA_PREFIX, stringify(*p));
1388 pending_metadata.clear();
1389 pending_metadata_rm.clear();
1390
1391 // and pg creating, also!
1392 if (mon->monmap->get_required_features().contains_all(
1393 ceph::features::mon::FEATURE_LUMINOUS)) {
94b18763 1394 auto pending_creatings = update_pending_pgs(pending_inc, tmp);
31f18b77
FG
1395 if (osdmap.get_epoch() &&
1396 osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
7c673cae
FG
1397 dout(7) << __func__ << " in the middle of upgrading, "
1398 << " trimming pending creating_pgs using pgmap" << dendl;
31f18b77 1399 mon->pgservice->maybe_trim_creating_pgs(&pending_creatings);
7c673cae
FG
1400 }
1401 bufferlist creatings_bl;
1402 ::encode(pending_creatings, creatings_bl);
1403 t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
1404 }
224ce89b
WB
1405
1406 // health
1407 health_check_map_t next;
1408 tmp.check_health(&next);
1409 encode_health(next, t);
7c673cae
FG
1410}
1411
1412void OSDMonitor::trim_creating_pgs(creating_pgs_t* creating_pgs,
31f18b77 1413 const ceph::unordered_map<pg_t,pg_stat_t>& pg_stat)
7c673cae
FG
1414{
1415 auto p = creating_pgs->pgs.begin();
1416 while (p != creating_pgs->pgs.end()) {
31f18b77
FG
1417 auto q = pg_stat.find(p->first);
1418 if (q != pg_stat.end() &&
7c673cae
FG
1419 !(q->second.state & PG_STATE_CREATING)) {
1420 dout(20) << __func__ << " pgmap shows " << p->first << " is created"
1421 << dendl;
1422 p = creating_pgs->pgs.erase(p);
7c673cae
FG
1423 } else {
1424 ++p;
1425 }
1426 }
1427}
1428
1429int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
1430{
1431 bufferlist bl;
1432 int r = mon->store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
1433 if (r < 0)
1434 return r;
1435 try {
1436 bufferlist::iterator p = bl.begin();
1437 ::decode(m, p);
1438 }
1439 catch (buffer::error& e) {
1440 if (err)
1441 *err << "osd." << osd << " metadata is corrupt";
1442 return -EIO;
1443 }
1444 return 0;
1445}
1446
c07f9fc5 1447void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
31f18b77 1448{
31f18b77
FG
1449 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
1450 if (osdmap.is_up(osd)) {
1451 map<string,string> meta;
1452 load_metadata(osd, meta, nullptr);
1453 auto p = meta.find(field);
1454 if (p == meta.end()) {
c07f9fc5 1455 (*out)["unknown"]++;
31f18b77 1456 } else {
c07f9fc5 1457 (*out)[p->second]++;
31f18b77
FG
1458 }
1459 }
1460 }
c07f9fc5
FG
1461}
1462
1463void OSDMonitor::count_metadata(const string& field, Formatter *f)
1464{
1465 map<string,int> by_val;
1466 count_metadata(field, &by_val);
31f18b77
FG
1467 f->open_object_section(field.c_str());
1468 for (auto& p : by_val) {
1469 f->dump_int(p.first.c_str(), p.second);
1470 }
1471 f->close_section();
1472}
1473
7c673cae
FG
1474int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
1475{
1476 map<string, string> metadata;
1477 int r = load_metadata(osd, metadata, nullptr);
1478 if (r < 0)
1479 return r;
1480
1481 auto it = metadata.find("osd_objectstore");
1482 if (it == metadata.end())
1483 return -ENOENT;
1484 *type = it->second;
1485 return 0;
1486}
1487
1488bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
1489 const pg_pool_t &pool,
1490 ostream *err)
1491{
1492 // just check a few pgs for efficiency - this can't give a guarantee anyway,
1493 // since filestore osds could always join the pool later
1494 set<int> checked_osds;
1495 for (unsigned ps = 0; ps < MIN(8, pool.get_pg_num()); ++ps) {
1496 vector<int> up, acting;
1497 pg_t pgid(ps, pool_id, -1);
1498 osdmap.pg_to_up_acting_osds(pgid, up, acting);
1499 for (int osd : up) {
1500 if (checked_osds.find(osd) != checked_osds.end())
1501 continue;
1502 string objectstore_type;
1503 int r = get_osd_objectstore_type(osd, &objectstore_type);
1504 // allow with missing metadata, e.g. due to an osd never booting yet
1505 if (r < 0 || objectstore_type == "bluestore") {
1506 checked_osds.insert(osd);
1507 continue;
1508 }
1509 *err << "osd." << osd << " uses " << objectstore_type;
1510 return false;
1511 }
1512 }
1513 return true;
1514}
1515
1516int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
1517{
1518 map<string,string> m;
1519 if (int r = load_metadata(osd, m, err))
1520 return r;
1521 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
1522 f->dump_string(p->first.c_str(), p->second);
1523 return 0;
1524}
1525
1526void OSDMonitor::print_nodes(Formatter *f)
1527{
1528 // group OSDs by their hosts
1529 map<string, list<int> > osds; // hostname => osd
1530 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
1531 map<string, string> m;
1532 if (load_metadata(osd, m, NULL)) {
1533 continue;
1534 }
1535 map<string, string>::iterator hostname = m.find("hostname");
1536 if (hostname == m.end()) {
1537 // not likely though
1538 continue;
1539 }
1540 osds[hostname->second].push_back(osd);
1541 }
1542
1543 dump_services(f, osds, "osd");
1544}
1545
1546void OSDMonitor::share_map_with_random_osd()
1547{
1548 if (osdmap.get_num_up_osds() == 0) {
1549 dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
1550 return;
1551 }
1552
1553 MonSession *s = mon->session_map.get_random_osd_session(&osdmap);
1554 if (!s) {
1555 dout(10) << __func__ << " no up osd on our session map" << dendl;
1556 return;
1557 }
1558
1559 dout(10) << "committed, telling random " << s->inst << " all about it" << dendl;
1560 // whatev, they'll request more if they need it
1561 MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch());
1562 s->con->send_message(m);
1563 // NOTE: do *not* record osd has up to this epoch (as we do
1564 // elsewhere) as they may still need to request older values.
1565}
1566
/**
 * Compute the osdmap epoch we may safely trim old maps up to.
 *
 * Returns 0 ("do not trim") when there is no quorum, pgs are still being
 * created, or the resulting floor would not advance past what is already
 * trimmed.  Otherwise returns the trim floor, bounded by the min
 * last-epoch-clean across osds, mon_osd_force_trim_to (if set) and the
 * mon_min_osdmap_epochs retention window.
 */
version_t OSDMonitor::get_trim_to()
{
  if (mon->get_quorum().empty()) {
    dout(10) << __func__ << ": quorum not formed" << dendl;
    return 0;
  }

  epoch_t floor;
  if (mon->monmap->get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    {
      // TODO: Get this hidden in PGStatService
      std::lock_guard<std::mutex> l(creating_pgs_lock);
      // never trim while pgs are still being created; they may need the
      // epoch they were created in
      if (!creating_pgs.pgs.empty()) {
	return 0;
      }
    }
    floor = get_min_last_epoch_clean();
  } else {
    // pre-luminous: same checks, but via the pgmap-backed service
    if (!mon->pgservice->is_readable())
      return 0;
    if (mon->pgservice->have_creating_pgs()) {
      return 0;
    }
    floor = mon->pgservice->get_min_last_epoch_clean();
  }
  {
    dout(10) << " min_last_epoch_clean " << floor << dendl;
    // operator override of the trim point (debug/repair knob)
    if (g_conf->mon_osd_force_trim_to > 0 &&
	g_conf->mon_osd_force_trim_to < (int)get_last_committed()) {
      floor = g_conf->mon_osd_force_trim_to;
      dout(10) << " explicit mon_osd_force_trim_to = " << floor << dendl;
    }
    // always retain at least mon_min_osdmap_epochs maps
    unsigned min = g_conf->mon_min_osdmap_epochs;
    if (floor + min > get_last_committed()) {
      if (min < get_last_committed())
	floor = get_last_committed() - min;
      else
	floor = 0;
    }
    // only worth trimming if the floor moved past what's already trimmed
    if (floor > get_first_committed())
      return floor;
  }
  return 0;
}
1612
1613epoch_t OSDMonitor::get_min_last_epoch_clean() const
1614{
1615 auto floor = last_epoch_clean.get_lower_bound(osdmap);
1616 // also scan osd epochs
1617 // don't trim past the oldest reported osd epoch
1618 for (auto& osd_epoch : osd_epochs) {
1619 if (osd_epoch.second < floor) {
1620 floor = osd_epoch.second;
1621 }
1622 }
1623 return floor;
1624}
1625
1626void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
1627 version_t first)
1628{
1629 dout(10) << __func__ << " including full map for e " << first << dendl;
1630 bufferlist bl;
1631 get_version_full(first, bl);
1632 put_version_full(tx, first, bl);
1633}
1634
1635// -------------
1636
/**
 * First-stage message dispatch: handle read-only requests and filter out
 * updates that need no map change.
 *
 * @return true if the message was fully handled here; false if it must
 *         proceed to prepare_update() to modify the pending map.
 */
bool OSDMonitor::preprocess_query(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // READs
  case MSG_MON_COMMAND:
    return preprocess_command(op);
  case CEPH_MSG_MON_GET_OSDMAP:
    return preprocess_get_osdmap(op);

    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return preprocess_mark_me_down(op);
  case MSG_OSD_FULL:
    return preprocess_full(op);
  case MSG_OSD_FAILURE:
    return preprocess_failure(op);
  case MSG_OSD_BOOT:
    return preprocess_boot(op);
  case MSG_OSD_ALIVE:
    return preprocess_alive(op);
  case MSG_OSD_PG_CREATED:
    return preprocess_pg_created(op);
  case MSG_OSD_PGTEMP:
    return preprocess_pgtemp(op);
  case MSG_OSD_BEACON:
    return preprocess_beacon(op);

  case CEPH_MSG_POOLOP:
    return preprocess_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return preprocess_remove_snaps(op);

  default:
    // every message type routed to this service must be listed above
    ceph_abort();
    return true;
  }
}
1679
/**
 * Second-stage dispatch for messages that passed preprocess_query() and
 * need to mutate the pending osdmap.
 *
 * @return true if pending_inc was updated and a proposal should follow.
 */
bool OSDMonitor::prepare_update(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return prepare_mark_me_down(op);
  case MSG_OSD_FULL:
    return prepare_full(op);
  case MSG_OSD_FAILURE:
    return prepare_failure(op);
  case MSG_OSD_BOOT:
    return prepare_boot(op);
  case MSG_OSD_ALIVE:
    return prepare_alive(op);
  case MSG_OSD_PG_CREATED:
    return prepare_pg_created(op);
  case MSG_OSD_PGTEMP:
    return prepare_pgtemp(op);
  case MSG_OSD_BEACON:
    return prepare_beacon(op);

  case MSG_MON_COMMAND:
    return prepare_command(op);

  case CEPH_MSG_POOLOP:
    return prepare_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return prepare_remove_snaps(op);


  default:
    // preprocess_query() should have filtered out anything else
    ceph_abort();
  }

  return false;
}
1721
/**
 * Decide whether the pending osdmap change should be proposed now, and
 * with what @p delay.  Note: this may itself fold queued osd weight
 * adjustments into pending_inc as a side effect.
 */
bool OSDMonitor::should_propose(double& delay)
{
  dout(10) << "should_propose" << dendl;

  // if full map, propose immediately!  any subsequent changes will be clobbered.
  if (pending_inc.fullmap.length())
    return true;

  // adjust osd weights?  (only once a weight is queued for *every* osd)
  if (!osd_weight.empty() &&
      osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
    dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
    osdmap.adjust_osd_weights(osd_weight, pending_inc);
    delay = 0.0;
    osd_weight.clear();  // consumed; propose right away
    return true;
  }

  // propose as fast as possible if updating up_thru or pg_temp
  // want to merge OSDMap changes as much as possible
  if ((pending_inc.new_primary_temp.size() == 1
      || pending_inc.new_up_thru.size() == 1)
      && pending_inc.new_state.size() < 2) {
    dout(15) << " propose as fast as possible for up_thru/pg_temp" << dendl;

    utime_t now = ceph_clock_now();
    // rate-limit the fast path: at most once per paxos_propose_interval,
    // and not sooner than paxos_min_wait after the last commit
    if (now - last_attempted_minwait_time > g_conf->paxos_propose_interval
	&& now - paxos->get_last_commit_time() > g_conf->paxos_min_wait) {
      delay = g_conf->paxos_min_wait;
      last_attempted_minwait_time = now;
      return true;
    }
  }

  // otherwise fall back to the generic service policy
  return PaxosService::should_propose(delay);
}
1758
1759
1760
1761// ---------------------------
1762// READs
1763
1764bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
1765{
1766 op->mark_osdmon_event(__func__);
1767 MMonGetOSDMap *m = static_cast<MMonGetOSDMap*>(op->get_req());
1768 dout(10) << __func__ << " " << *m << dendl;
1769 MOSDMap *reply = new MOSDMap(mon->monmap->fsid);
1770 epoch_t first = get_first_committed();
1771 epoch_t last = osdmap.get_epoch();
1772 int max = g_conf->osd_map_message_max;
1773 for (epoch_t e = MAX(first, m->get_full_first());
1774 e <= MIN(last, m->get_full_last()) && max > 0;
1775 ++e, --max) {
1776 int r = get_version_full(e, reply->maps[e]);
1777 assert(r >= 0);
1778 }
1779 for (epoch_t e = MAX(first, m->get_inc_first());
1780 e <= MIN(last, m->get_inc_last()) && max > 0;
1781 ++e, --max) {
1782 int r = get_version(e, reply->incremental_maps[e]);
1783 assert(r >= 0);
1784 }
1785 reply->oldest_map = first;
1786 reply->newest_map = last;
1787 mon->send_reply(op, reply);
1788 return true;
1789}
1790
1791
1792// ---------------------------
1793// UPDATEs
1794
1795// failure --
1796
1797bool OSDMonitor::check_source(PaxosServiceMessage *m, uuid_d fsid) {
1798 // check permissions
1799 MonSession *session = m->get_session();
1800 if (!session)
1801 return true;
1802 if (!session->is_capable("osd", MON_CAP_X)) {
1803 dout(0) << "got MOSDFailure from entity with insufficient caps "
1804 << session->caps << dendl;
1805 return true;
1806 }
1807 if (fsid != mon->monmap->fsid) {
1808 dout(0) << "check_source: on fsid " << fsid
1809 << " != " << mon->monmap->fsid << dendl;
1810 return true;
1811 }
1812 return false;
1813}
1814
1815
/**
 * Filter an incoming MOSDFailure report.
 *
 * @return true when the report is consumed here (bad caps, stale/duplicate
 *         report, nodown, ...); false when it is a genuinely new failure
 *         report that must go on to prepare_failure().
 */
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
  // who is target_osd
  int badboy = m->get_target().name.num();

  // check permissions
  if (check_source(m, m->fsid))
    goto didit;

  // first, verify the reporting host is valid: it must exist, match the
  // address we have on file, and (for an active failure report) be up
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
	osdmap.get_addr(from) != m->get_orig_source_inst().addr ||
	(osdmap.is_down(from) && m->if_osd_failed())) {
      dout(5) << "preprocess_failure from dead osd." << from << ", ignoring" << dendl;
      // bring the reporter up to date so it stops reporting
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?
  if (osdmap.is_down(badboy)) {
    dout(5) << "preprocess_failure dne(/dup?): " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  // reporter's idea of the target's address must match our map
  if (osdmap.get_inst(badboy) != m->get_target()) {
    dout(5) << "preprocess_failure wrong osd: report " << m->get_target() << " != map's " << osdmap.get_inst(badboy)
	    << ", from " << m->get_orig_source_inst() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?  (target came back up after the reported epoch)
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    dout(5) << "preprocess_failure dup/old: " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // nodown flag / nodown osd state may forbid marking it down
  if (!can_mark_down(badboy)) {
    dout(5) << "preprocess_failure ignoring report of " << m->get_target() << " from " << m->get_orig_source_inst() << dendl;
    goto didit;
  }

  dout(10) << "preprocess_failure new: " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
  return false;

 didit:
  return true;
}
1875
// Completion context that acknowledges an MOSDMarkMeDown request once it
// has been handled: echoes the request back to the sender with
// request_ack=false so the osd knows it may shut down.
class C_AckMarkedDown : public C_MonOp {
  OSDMonitor *osdmon;
public:
  C_AckMarkedDown(
    OSDMonitor *osdmon,
    MonOpRequestRef op)
    : C_MonOp(op), osdmon(osdmon) {}

  void _finish(int) override {
    MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
    osdmon->mon->send_reply(
      op,
      new MOSDMarkMeDown(
	m->fsid,
	m->get_target(),
	m->get_epoch(),
	false));   // ACK itself does not request an ack
  }
  ~C_AckMarkedDown() override {
  }
};
1897
/**
 * Filter an osd's request to mark itself down.
 *
 * @return true when consumed here (invalid sender, stale map, nodown);
 *         in that case an ack is still sent if requested, so the osd can
 *         proceed with shutdown.  Returns false to continue to
 *         prepare_mark_me_down().
 */
bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
  int requesting_down = m->get_target().name.num();
  int from = m->get_orig_source().num();

  // check permissions
  if (check_source(m, m->fsid))
    goto reply;

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd())
    goto reply;

  // the requester must exist, be up, and match the address we have on file
  if (!osdmap.exists(from) ||
      osdmap.is_down(from) ||
      osdmap.get_addr(from) != m->get_target().addr) {
    dout(5) << "preprocess_mark_me_down from dead osd."
	    << from << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    goto reply;
  }

  // no down might be set
  if (!can_mark_down(requesting_down))
    goto reply;

  dout(10) << "MOSDMarkMeDown for: " << m->get_target() << dendl;
  return false;

 reply:
  // even when we refuse, ack if asked so the osd isn't left waiting
  if (m->request_ack) {
    Context *c(new C_AckMarkedDown(this, op));
    c->complete(0);
  }
  return true;
}
1936
// Queue the self-requested mark-down in the pending incremental.
// preprocess_mark_me_down() has already validated the request, so the
// asserts here only guard against races it should have caught.
bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
  int target_osd = m->get_target().name.num();

  assert(osdmap.is_up(target_osd));
  assert(osdmap.get_addr(target_osd) == m->get_target().addr);

  mon->clog->info() << "osd." << target_osd << " marked itself down";
  // new_state is XOR'ed into the osd state, so setting CEPH_OSD_UP here
  // flips the osd from up to down (cf. prepare_boot's oldstate ^= ...)
  pending_inc.new_state[target_osd] = CEPH_OSD_UP;
  if (m->request_ack)
    // ack only after the proposal commits, so the osd knows it took effect
    wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
  return true;
}
1952
// May osd.i be marked down?  Vetoed by the cluster-wide NODOWN flag, a
// per-osd nodown setting, or if doing so would push the up ratio below
// mon_osd_min_up_ratio (accounting for downs already pending in this
// proposal round).
bool OSDMonitor::can_mark_down(int i)
{
  if (osdmap.test_flag(CEPH_OSDMAP_NODOWN)) {
    dout(5) << __func__ << " NODOWN flag set, will not mark osd." << i
            << " down" << dendl;
    return false;
  }

  if (osdmap.is_nodown(i)) {
    dout(5) << __func__ << " osd." << i << " is marked as nodown, "
            << "will not mark it down" << dendl;
    return false;
  }

  int num_osds = osdmap.get_num_osds();
  if (num_osds == 0) {
    dout(5) << __func__ << " no osds" << dendl;
    return false;
  }
  // subtract downs already queued in pending_inc so a burst of failures
  // in one proposal round cannot overshoot the ratio limit
  int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
  float up_ratio = (float)up / (float)num_osds;
  if (up_ratio < g_conf->mon_osd_min_up_ratio) {
    dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
            << g_conf->mon_osd_min_up_ratio
            << ", will not mark osd." << i << " down" << dendl;
    return false;
  }
  return true;
}
1982
1983bool OSDMonitor::can_mark_up(int i)
1984{
1985 if (osdmap.test_flag(CEPH_OSDMAP_NOUP)) {
31f18b77
FG
1986 dout(5) << __func__ << " NOUP flag set, will not mark osd." << i
1987 << " up" << dendl;
1988 return false;
1989 }
1990
1991 if (osdmap.is_noup(i)) {
1992 dout(5) << __func__ << " osd." << i << " is marked as noup, "
1993 << "will not mark it up" << dendl;
7c673cae
FG
1994 return false;
1995 }
31f18b77 1996
7c673cae
FG
1997 return true;
1998}
1999
/**
 * May osd @p i (or osds in general, when @p i is negative) be marked out?
 *
 * Vetoed by the cluster-wide NOOUT flag, the per-osd noout state, or if
 * doing so would push the in ratio below mon_osd_min_in_ratio (taking
 * outs already queued in pending_inc into account).
 *
 * @note despite the original comment claiming @p i only exists for log
 * output, it is also passed to osdmap.is_noout(i) below; callers may pass
 * a negative value meaning "no specific osd" (see the in_ratio message
 * branches) — TODO confirm is_noout() tolerates negative ids.
 */
bool OSDMonitor::can_mark_out(int i)
{
  if (osdmap.test_flag(CEPH_OSDMAP_NOOUT)) {
    dout(5) << __func__ << " NOOUT flag set, will not mark osds out" << dendl;
    return false;
  }

  if (osdmap.is_noout(i)) {
    dout(5) << __func__ << " osd." << i << " is marked as noout, "
            << "will not mark it out" << dendl;
    return false;
  }

  int num_osds = osdmap.get_num_osds();
  if (num_osds == 0) {
    dout(5) << __func__ << " no osds" << dendl;
    return false;
  }
  // subtract outs already queued this round so we don't overshoot
  int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
  float in_ratio = (float)in / (float)num_osds;
  if (in_ratio < g_conf->mon_osd_min_in_ratio) {
    if (i >= 0)
      dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
	      << g_conf->mon_osd_min_in_ratio
	      << ", will not mark osd." << i << " out" << dendl;
    else
      dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
	      << g_conf->mon_osd_min_in_ratio
	      << ", will not mark osds out" << dendl;
    return false;
  }

  return true;
}
2038
2039bool OSDMonitor::can_mark_in(int i)
2040{
2041 if (osdmap.test_flag(CEPH_OSDMAP_NOIN)) {
31f18b77
FG
2042 dout(5) << __func__ << " NOIN flag set, will not mark osd." << i
2043 << " in" << dendl;
2044 return false;
2045 }
2046
2047 if (osdmap.is_noin(i)) {
2048 dout(5) << __func__ << " osd." << i << " is marked as noin, "
2049 << "will not mark it in" << dendl;
7c673cae
FG
2050 return false;
2051 }
31f18b77 2052
7c673cae
FG
2053 return true;
2054}
2055
2056bool OSDMonitor::check_failures(utime_t now)
2057{
2058 bool found_failure = false;
2059 for (map<int,failure_info_t>::iterator p = failure_info.begin();
2060 p != failure_info.end();
2061 ++p) {
2062 if (can_mark_down(p->first)) {
2063 found_failure |= check_failure(now, p->first, p->second);
2064 }
2065 }
2066 return found_failure;
2067}
2068
// Decide whether osd.target_osd has accumulated enough failure evidence
// to be marked down.  The base heartbeat grace is optionally stretched by
// the target's and the reporters' historical "lagginess" (decayed over
// time), and reporters are deduplicated by CRUSH subtree so a single
// failure domain cannot supply all the votes.  Returns true if a
// mark-down was queued (or already pending).
bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
{
  // already pending failure?
  if (pending_inc.new_state.count(target_osd) &&
      pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
    dout(10) << " already pending failure" << dendl;
    return true;
  }

  set<string> reporters_by_subtree;
  string reporter_subtree_level = g_conf->mon_osd_reporter_subtree_level;
  utime_t orig_grace(g_conf->osd_heartbeat_grace, 0);
  utime_t max_failed_since = fi.get_failed_since();
  utime_t failed_for = now - max_failed_since;

  utime_t grace = orig_grace;
  double my_grace = 0, peer_grace = 0;
  double decay_k = 0;  // negative exponential rate derived from the halflife
  if (g_conf->mon_osd_adjust_heartbeat_grace) {
    double halflife = (double)g_conf->mon_osd_laggy_halflife;
    decay_k = ::log(.5) / halflife;

    // scale grace period based on historical probability of 'lagginess'
    // (false positive failures due to slowness).
    const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
    double decay = exp((double)failed_for * decay_k);
    dout(20) << " halflife " << halflife << " decay_k " << decay_k
	     << " failed_for " << failed_for << " decay " << decay << dendl;
    my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
    grace += my_grace;
  }

  // consider the peers reporting a failure a proxy for a potential
  // 'subcluster' over the overall cluster that is similarly
  // laggy. this is clearly not true in all cases, but will sometimes
  // help us localize the grace correction to a subset of the system
  // (say, a rack with a bad switch) that is unhappy.
  assert(fi.reporters.size());
  for (map<int,failure_reporter_t>::iterator p = fi.reporters.begin();
       p != fi.reporters.end();
       ++p) {
    // get the parent bucket whose type matches with "reporter_subtree_level".
    // fall back to OSD if the level doesn't exist.
    map<string, string> reporter_loc = osdmap.crush->get_full_location(p->first);
    map<string, string>::iterator iter = reporter_loc.find(reporter_subtree_level);
    if (iter == reporter_loc.end()) {
      reporters_by_subtree.insert("osd." + to_string(p->first));
    } else {
      reporters_by_subtree.insert(iter->second);
    }
    if (g_conf->mon_osd_adjust_heartbeat_grace) {
      // accumulate each reporter's decayed laggy contribution; averaged below
      const osd_xinfo_t& xi = osdmap.get_xinfo(p->first);
      utime_t elapsed = now - xi.down_stamp;
      double decay = exp((double)elapsed * decay_k);
      peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
    }
  }

  if (g_conf->mon_osd_adjust_heartbeat_grace) {
    peer_grace /= (double)fi.reporters.size();
    grace += peer_grace;
  }

  dout(10) << " osd." << target_osd << " has "
	   << fi.reporters.size() << " reporters, "
	   << grace << " grace (" << orig_grace << " + " << my_grace
	   << " + " << peer_grace << "), max_failed_since " << max_failed_since
	   << dendl;

  // mark down only if the failure has outlasted the (adjusted) grace AND
  // enough distinct failure-domain subtrees have reported it
  if (failed_for >= grace &&
      (int)reporters_by_subtree.size() >= g_conf->mon_osd_min_down_reporters) {
    dout(1) << " we have enough reporters to mark osd." << target_osd
	    << " down" << dendl;
    // XOR semantics: CEPH_OSD_UP in new_state flips the osd down
    pending_inc.new_state[target_osd] = CEPH_OSD_UP;

    mon->clog->info() << "osd." << target_osd << " failed ("
		      << osdmap.crush->get_full_location_ordered_string(
			target_osd)
		      << ") ("
		      << (int)reporters_by_subtree.size()
		      << " reporters from different "
		      << reporter_subtree_level << " after "
		      << failed_for << " >= grace " << grace << ")";
    return true;
  }
  return false;
}
2156
224ce89b 2157void OSDMonitor::force_failure(int target_osd, int by)
7c673cae
FG
2158{
2159 // already pending failure?
2160 if (pending_inc.new_state.count(target_osd) &&
2161 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
2162 dout(10) << " already pending failure" << dendl;
2163 return;
2164 }
2165
2166 dout(1) << " we're forcing failure of osd." << target_osd << dendl;
2167 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
2168
31f18b77
FG
2169 mon->clog->info() << "osd." << target_osd << " failed ("
2170 << osdmap.crush->get_full_location_ordered_string(target_osd)
2171 << ") (connection refused reported by osd." << by << ")";
7c673cae
FG
2172 return;
2173}
2174
// Record or cancel a failure report for the target OSD.
// Returns true when a proposal is needed (immediate failure forced, or
// enough evidence accumulated via check_failure); false when only the
// in-memory failure_info bookkeeping changed.
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
  dout(1) << "prepare_failure " << m->get_target()
	  << " from " << m->get_orig_source_inst()
          << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target().name.num();
  int reporter = m->get_orig_source().num();
  // preprocess_failure validated these; a violation here is a logic bug
  assert(osdmap.is_up(target_osd));
  assert(osdmap.get_addr(target_osd) == m->get_target().addr);

  if (m->if_osd_failed()) {
    // calculate failure time: the reporter tells us how long the osd had
    // already been unresponsive when it sent the message
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      // hard failure (e.g. connection refused): skip the grace machinery
      mon->clog->debug() << m->get_target() << " reported immediately failed by "
            << m->get_orig_source_inst();
      force_failure(target_osd, reporter);
      mon->no_reply(op);
      return true;
    }
    mon->clog->debug() << m->get_target() << " reported failed by "
		      << m->get_orig_source_inst();

    failure_info_t& fi = failure_info[target_osd];
    // a newer report from the same reporter supersedes the old op
    MonOpRequestRef old_op = fi.add_report(reporter, failed_since, op);
    if (old_op) {
      mon->no_reply(old_op);
    }

    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon->clog->debug() << m->get_target() << " failure report canceled by "
		       << m->get_orig_source_inst();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      MonOpRequestRef report_op = fi.cancel_report(reporter);
      if (report_op) {
        mon->no_reply(report_op);
      }
      if (fi.reporters.empty()) {
	dout(10) << " removing last failure_info for osd." << target_osd
		 << dendl;
	failure_info.erase(target_osd);
      } else {
	dout(10) << " failure_info for osd." << target_osd << " now "
		 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
    mon->no_reply(op);
  }

  return false;
}
2238
// Flush failure bookkeeping for OSDs that are now actually down in the
// committed osdmap: drop their failure_info entries and answer every
// queued reporter with the latest maps so they learn of the mark-down.
void OSDMonitor::process_failures()
{
  map<int,failure_info_t>::iterator p = failure_info.begin();
  while (p != failure_info.end()) {
    if (osdmap.is_up(p->first)) {
      // still up: keep the accumulated reports
      ++p;
    } else {
      dout(10) << "process_failures osd." << p->first << dendl;
      list<MonOpRequestRef> ls;
      p->second.take_report_messages(ls);
      // post-increment keeps the iterator valid across the erase
      failure_info.erase(p++);

      while (!ls.empty()) {
	MonOpRequestRef o = ls.front();
	if (o) {
	  o->mark_event(__func__);
	  MOSDFailure *m = o->get_req<MOSDFailure>();
	  send_latest(o, m->get_epoch());
	}
	ls.pop_front();
      }
    }
  }
}
2263
2264void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
2265{
2266 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
2267
2268 for (map<int,failure_info_t>::iterator p = failure_info.begin();
2269 p != failure_info.end();
2270 ++p) {
2271 p->second.take_report_messages(ls);
2272 }
2273 failure_info.clear();
2274}
2275
2276
2277// boot --
2278
// Read-only gauntlet for an OSD boot request: capability, fsid, address,
// and a long series of feature-compatibility checks.  Returns true if the
// boot is rejected/ignored or already satisfied (duplicate); returns
// false to forward to prepare_boot(), which mutates the pending map.
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
	    << session->caps << dendl;
    goto ignore;
  }

  // the booting osd must belong to this cluster
  if (m->sb.cluster_fsid != mon->monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
	    << " != " << mon->monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  assert(m->get_orig_source_inst().name.is_osd());

  // check if osd has required features to boot
  if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
       CEPH_FEATURE_OSD_ERASURE_CODES) &&
      !(m->get_connection()->get_features() & CEPH_FEATURE_OSD_ERASURE_CODES)) {
    dout(0) << __func__ << " osdmap requires erasure code but osd at "
            << m->get_orig_source_inst()
            << " doesn't announce support -- ignore" << dendl;
    goto ignore;
  }

  if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
       CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2) &&
      !(m->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2)) {
    dout(0) << __func__ << " osdmap requires erasure code plugins v2 but osd at "
            << m->get_orig_source_inst()
            << " doesn't announce support -- ignore" << dendl;
    goto ignore;
  }

  if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
       CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3) &&
      !(m->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3)) {
    dout(0) << __func__ << " osdmap requires erasure code plugins v3 but osd at "
            << m->get_orig_source_inst()
            << " doesn't announce support -- ignore" << dendl;
    goto ignore;
  }

  // enforce the map's minimum-release requirement against the osd's
  // announced feature bits
  if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
      !HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because the osdmap requires"
		      << " CEPH_FEATURE_SERVER_LUMINOUS"
		      << " but the osd lacks CEPH_FEATURE_SERVER_LUMINOUS";
    goto ignore;
  }

  if (osdmap.require_osd_release >= CEPH_RELEASE_JEWEL &&
      !(m->osd_features & CEPH_FEATURE_SERVER_JEWEL)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because the osdmap requires"
		      << " CEPH_FEATURE_SERVER_JEWEL"
		      << " but the osd lacks CEPH_FEATURE_SERVER_JEWEL";
    goto ignore;
  }

  if (osdmap.require_osd_release >= CEPH_RELEASE_KRAKEN &&
      !HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because the osdmap requires"
		      << " CEPH_FEATURE_SERVER_KRAKEN"
		      << " but the osd lacks CEPH_FEATURE_SERVER_KRAKEN";
    goto ignore;
  }

  if (osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE) &&
      !(m->osd_features & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because 'sortbitwise' osdmap flag is set and OSD lacks the OSD_BITWISE_HOBJ_SORT feature";
    goto ignore;
  }

  if (osdmap.test_flag(CEPH_OSDMAP_RECOVERY_DELETES) &&
      !(m->osd_features & CEPH_FEATURE_OSD_RECOVERY_DELETES)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because 'recovery_deletes' osdmap flag is set and OSD lacks the OSD_RECOVERY_DELETES feature";
    goto ignore;
  }

  // any pool using GMT hitsets requires the osd to understand them
  if (any_of(osdmap.get_pools().begin(),
	     osdmap.get_pools().end(),
	     [](const std::pair<int64_t,pg_pool_t>& pool)
	     { return pool.second.use_gmt_hitset; })) {
    assert(osdmap.get_num_up_osds() == 0 ||
	   osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT);
    if (!(m->osd_features & CEPH_FEATURE_OSD_HITSET_GMT)) {
      dout(0) << __func__ << " one or more pools uses GMT hitsets but osd at "
	      << m->get_orig_source_inst()
	      << " doesn't announce support -- ignore" << dendl;
      goto ignore;
    }
  }

  // make sure upgrades stop at luminous
  if (HAVE_FEATURE(m->osd_features, SERVER_M) &&
      osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
    mon->clog->info() << "disallowing boot of post-luminous OSD "
		      << m->get_orig_source_inst()
		      << " because require_osd_release < luminous";
    goto ignore;
  }

  // make sure upgrades stop at jewel
  if (HAVE_FEATURE(m->osd_features, SERVER_KRAKEN) &&
      osdmap.require_osd_release < CEPH_RELEASE_JEWEL) {
    mon->clog->info() << "disallowing boot of post-jewel OSD "
		      << m->get_orig_source_inst()
		      << " because require_osd_release < jewel";
    goto ignore;
  }

  // make sure upgrades stop at hammer
  //  * HAMMER_0_94_4 is the required hammer feature
  //  * MON_METADATA is the first post-hammer feature
  if (osdmap.get_num_up_osds() > 0) {
    if ((m->osd_features & CEPH_FEATURE_MON_METADATA) &&
	!(osdmap.get_up_osd_features() & CEPH_FEATURE_HAMMER_0_94_4)) {
      mon->clog->info() << "disallowing boot of post-hammer OSD "
			<< m->get_orig_source_inst()
			<< " because one or more up OSDs is pre-hammer v0.94.4";
      goto ignore;
    }
    if (!(m->osd_features & CEPH_FEATURE_HAMMER_0_94_4) &&
	(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_METADATA)) {
      mon->clog->info() << "disallowing boot of pre-hammer v0.94.4 OSD "
			<< m->get_orig_source_inst()
			<< " because all up OSDs are post-hammer";
      goto ignore;
    }
  }

  // already booted?
  if (osdmap.is_up(from) &&
      osdmap.get_inst(from) == m->get_orig_source_inst() &&
      osdmap.get_cluster_addr(from) == m->cluster_addr) {
    // yup.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source_inst()
	    << " == " << osdmap.get_inst(from) << dendl;
    _booted(op, false);
    return true;
  }

  // an osd id may only be reused by the osd with the matching fsid
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
            << " clashes with existing osd: different fsid"
            << " (ours: " << osdmap.get_uuid(from)
            << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // stale boot message from before the osd's last up_from?  just feed it maps
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_inst(from) == m->get_orig_source_inst()) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
2476
// Mutating half of the boot path: mark a previously-up instance down
// first, or queue the osd up in pending_inc along with its addresses,
// uuid, metadata, clean-interval bounds, laggy statistics and (possibly)
// an automatic mark-in.  Always returns true (a proposal is needed);
// the op is answered via C_Booted/C_RetryMessage once it commits.
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
  dout(7) << __func__ << " from " << m->get_orig_source_inst() << " sb " << m->sb
	  << " cluster_addr " << m->cluster_addr
	  << " hb_back_addr " << m->hb_back_addr
	  << " hb_front_addr " << m->hb_front_addr
	  << dendl;

  assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
	    << osdmap.get_max_osd() << dendl;
    return false;
  }

  // effective state once the pending XOR is applied
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up? mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down "
	    << osdmap.get_inst(from) << dendl;
    // preprocess should have caught these; if not, assert.
    assert(osdmap.get_inst(from) != m->get_orig_source_inst() ||
           osdmap.get_cluster_addr(from) != m->cluster_addr);
    assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
	(pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    // retry the boot after the mark-down commits
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
	    << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addr();
    if (!m->cluster_addr.is_blank_ip())
      pending_inc.new_up_cluster[from] = m->cluster_addr;
    pending_inc.new_hb_back_up[from] = m->hb_back_addr;
    if (!m->hb_front_addr.is_blank_ip())
      pending_inc.new_hb_front_up[from] = m->hb_front_addr;

    down_pending_out.erase(from);  // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
	     << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this;  if not, assert.
      assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd?
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
	dout(10) << " fresh osd; marking lost_at too" << dendl;
	pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata
    bufferlist osd_metadata;
    ::encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
	(m->sb.mounted == info.last_clean_begin &&
	 m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
	       << "[" << info.last_clean_begin << "," << info.last_clean_end
	       << ") -> [" << begin << "-" << end << ")"
	       << dendl;
      pending_inc.new_last_clean_interval[from] =
	pair<epoch_t,epoch_t>(begin, end);
    }

    // update the laggy statistics used by check_failure's grace scaling:
    // decay them on a clean (re)start, bump them if the osd is rebooting
    // after having been marked down
    osd_xinfo_t xi = osdmap.get_xinfo(from);
    if (m->boot_epoch == 0) {
      xi.laggy_probability *= (1.0 - g_conf->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      if (xi.down_stamp.sec()) {
        int interval = ceph_clock_now().sec() -
	  xi.down_stamp.sec();
        if (g_conf->mon_osd_laggy_max_interval &&
	    (interval > g_conf->mon_osd_laggy_max_interval)) {
          interval =  g_conf->mon_osd_laggy_max_interval;
        }
        xi.laggy_interval =
	  interval * g_conf->mon_osd_laggy_weight +
	  xi.laggy_interval * (1.0 - g_conf->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
	g_conf->mon_osd_laggy_weight +
	xi.laggy_probability * (1.0 - g_conf->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?
    if ((g_conf->mon_osd_auto_mark_auto_out_in &&
	 (oldstate & CEPH_OSD_AUTOOUT)) ||
	(g_conf->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
	(g_conf->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
	if (osdmap.osd_xinfo[from].old_weight > 0) {
	  // restore the weight the osd had before it was auto-marked out
	  pending_inc.new_weight[from] = osdmap.osd_xinfo[from].old_weight;
	  xi.old_weight = 0;
	} else {
	  pending_inc.new_weight[from] = CEPH_OSD_IN;
	}
      } else {
	dout(7) << __func__ << " NOIN set, will not mark in "
		<< m->get_orig_source_addr() << dendl;
      }
    }

    pending_inc.new_xinfo[from] = xi;

    // wait
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
2630
2631void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
2632{
2633 op->mark_osdmon_event(__func__);
2634 MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
2635 dout(7) << "_booted " << m->get_orig_source_inst()
2636 << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
2637
2638 if (logit) {
2639 mon->clog->info() << m->get_orig_source_inst() << " boot";
2640 }
2641
2642 send_latest(op, m->sb.current_epoch+1);
2643}
2644
2645
2646// -------------
2647// full
2648
// Read-only vetting of an OSD's full/backfillfull/nearfull state report.
// Returns true if the message is ignored or already satisfied; false to
// forward to prepare_full(), which records the state change.
bool OSDMonitor::preprocess_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
  int from = m->get_orig_source().num();
  set<string> state;
  // the only state bits this message is allowed to change
  unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;

  // check permissions, ignore if failed
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "MOSDFull from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  // ignore a full message from the osd instance that already went down
  if (!osdmap.exists(from)) {
    dout(7) << __func__ << " ignoring full message from nonexistent "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }
  if ((!osdmap.is_up(from) &&
       osdmap.get_most_recent_inst(from) == m->get_orig_source_inst()) ||
      (osdmap.is_up(from) &&
       osdmap.get_inst(from) != m->get_orig_source_inst())) {
    dout(7) << __func__ << " ignoring full message from down "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  OSDMap::calc_state_set(osdmap.get_state(from), state);

  // already in the requested state: just reply with the map
  if ((osdmap.get_state(from) & mask) == m->state) {
    dout(7) << __func__ << " state already " << state << " for osd." << from
	    << " " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    goto ignore;
  }

  dout(10) << __func__ << " want state " << state << " for osd." << from
	   << " " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
2698
// Queue the requested full/backfillfull/nearfull state bits for the OSD
// in pending_inc.  new_state holds XOR deltas, so we clear any previously
// pending bits in the mask and then store (committed ^ wanted).
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
  const int from = m->get_orig_source().num();

  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask;  // safety first

  // effective state once any already-pending XOR delta is applied
  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    if (p != pending_inc.new_state.end()) {
      // drop any pending fullness bits; keep unrelated pending bits intact
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " -> " << want_state_set << dendl;
  } else {
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  // reply with the map once the (possibly empty) change commits
  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
2736
2737// -------------
2738// alive
2739
// Read-only vetting of an MOSDAlive (up_thru advance request).
// Returns true if ignored or already satisfied; false to forward to
// prepare_alive(), which queues the up_thru update.
bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
  int from = m->get_orig_source().num();

  // check permissions, ignore if failed
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  if (!osdmap.is_up(from) ||
      osdmap.get_inst(from) != m->get_orig_source_inst()) {
    dout(7) << "preprocess_alive ignoring alive message from down " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  // up_thru already at or past the requested epoch: just reply with the map
  if (osdmap.get_up_thru(from) >= m->want) {
    // yup.
    dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    return true;
  }

  dout(10) << "preprocess_alive want up_thru " << m->want
	   << " from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
2776
// Queue an up_thru advance for the OSD and reply with the latest map once
// the proposal commits.
bool OSDMonitor::prepare_alive(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
  int from = m->get_orig_source().num();

  if (0) {  // we probably don't care much about these
    mon->clog->debug() << m->get_orig_source_inst() << " alive";
  }

  dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
	  << " from " << m->get_orig_source_inst() << dendl;

  update_up_thru(from, m->version);  // set to the latest map the OSD has
  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
2794
// Answer an op by sending every map from epoch e onward to its sender.
void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
{
  op->mark_osdmon_event(__func__);
  dout(7) << "_reply_map " << e
	  << " from " << op->get_req()->get_orig_source_inst()
	  << dendl;
  send_latest(op, e);
}
2803
2804// pg_created
// pg_created
// Read-only vetting of a "pg created" notification.  No reply is ever
// sent (no_reply is declared up front); returns true to drop the message
// on missing session/caps, false to forward it to the leader's
// prepare_pg_created().
bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = static_cast<MOSDPGCreated*>(op->get_req());
  dout(10) << __func__ << " " << *m << dendl;
  auto session = m->get_session();
  mon->no_reply(op);
  if (!session) {
    dout(10) << __func__ << ": no monitor session!" << dendl;
    return true;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
         << "with insufficient privileges " << session->caps << dendl;
    return true;
  }
  // always forward the "created!" to the leader
  return false;
}
2824
// Leader-side handling of a "pg created" notification: if the sender is
// still a live, current osd, queue the pgid so the next proposal can mark
// it created.  Returns true only when a proposal should carry the update.
bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = static_cast<MOSDPGCreated*>(op->get_req());
  dout(10) << __func__ << " " << *m << dendl;
  auto src = m->get_orig_source();
  auto from = src.num();
  // drop notifications from osds that are down or whose address no longer
  // matches the map: the interval has changed and the report is stale
  if (!src.is_osd() ||
      !mon->osdmon()->osdmap.is_up(from) ||
      m->get_orig_source_inst() != mon->osdmon()->osdmap.get_inst(from)) {
    dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
    return false;
  }
  pending_created_pgs.push_back(m->pgid);
  return true;
}
2841
2842// -------------
2843// pg_temp changes
2844
// Read-only vetting of an MOSDPGTemp message.  Returns true when the
// message needs no map change (bad caps, stale sender, dup/no-op request)
// and false when prepare_pgtemp must apply it.
bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
{
  MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
  dout(10) << "preprocess_pgtemp " << *m << dendl;
  mempool::osdmap::vector<int> empty;
  int from = m->get_orig_source().num();
  size_t ignore_cnt = 0;

  // check caps
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  // stale sender: the osd must be up and at the address the map records
  if (!osdmap.is_up(from) ||
      osdmap.get_inst(from) != m->get_orig_source_inst()) {
    dout(7) << "ignoring pgtemp message from down " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  // forced requests bypass the dedup checks below and always go to prepare
  if (m->forced) {
    return false;
  }

  for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    dout(20) << " " << p->first
	     << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
	     << " -> " << p->second << dendl;

    // does the pool exist?
    if (!osdmap.have_pg_pool(p->first.pool())) {
      /*
       * 1. If the osdmap does not have the pool, it means the pool has been
       *    removed in-between the osd sending this message and us handling it.
       * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
       *    not exist in the pending either, as the osds would not send a
       *    message about a pool they know nothing about (yet).
       * 3. However, if the pool does exist in the pending, then it must be a
       *    new pool, and not relevant to this message (see 1).
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool has been removed" << dendl;
      ignore_cnt++;
      continue;
    }

    int acting_primary = -1;
    osdmap.pg_to_up_acting_osds(
      p->first, nullptr, nullptr, nullptr, &acting_primary);
    if (acting_primary != from) {
      /* If the source isn't the primary based on the current osdmap, we know
       * that the interval changed and that we can discard this message.
       * Indeed, we must do so to avoid 16127 since we can't otherwise determine
       * which of two pg temp mappings on the same pg is more recent.
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": primary has changed" << dendl;
      ignore_cnt++;
      continue;
    }

    // removal?
    if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
			      osdmap.primary_temp->count(p->first)))
      return false;
    // change?
    // NOTE: we assume that this will clear pg_primary, so consider
    // an existing pg_primary field to imply a change
    if (p->second.size() &&
	(osdmap.pg_temp->count(p->first) == 0 ||
	 !vectors_equal(osdmap.pg_temp->get(p->first), p->second) ||
	 osdmap.primary_temp->count(p->first)))
      return false;
  }

  // should we ignore all the pgs?
  if (ignore_cnt == m->pg_temp.size())
    goto ignore;

  // everything requested is already in effect: ack with the current map
  dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
  _reply_map(op, m->map_epoch);
  return true;

 ignore:
  return true;
}
2935
2936void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
2937{
2938 epoch_t old_up_thru = osdmap.get_up_thru(from);
2939 auto ut = pending_inc.new_up_thru.find(from);
2940 if (ut != pending_inc.new_up_thru.end()) {
2941 old_up_thru = ut->second;
2942 }
2943 if (up_thru > old_up_thru) {
2944 // set up_thru too, so the osd doesn't have to ask again
2945 pending_inc.new_up_thru[from] = up_thru;
2946 }
2947}
2948
// Leader-side application of an MOSDPGTemp message: record the requested
// pg_temp mappings (skipping pools that are gone or pending removal) in
// the pending incremental, bump the sender's up_thru, and reply once the
// proposal commits.  Always returns true (a proposal is needed).
bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
  int from = m->get_orig_source().num();
  dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
  for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    uint64_t pool = p->first.pool();
    if (pending_inc.old_pools.count(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool pending removal" << dendl;
      continue;
    }
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool has been removed" << dendl;
      continue;
    }
    pending_inc.new_pg_temp[p->first] =
      mempool::osdmap::vector<int>(p->second.begin(), p->second.end());

    // unconditionally clear pg_primary (until this message can encode
    // a change for that, too.. at which point we need to also fix
    // preprocess_pg_temp)
    if (osdmap.primary_temp->count(p->first) ||
	pending_inc.new_primary_temp.count(p->first))
      pending_inc.new_primary_temp[p->first] = -1;
  }

  // set up_thru too, so the osd doesn't have to ask again
  update_up_thru(from, m->map_epoch);

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
  return true;
}
2984
2985
2986// ---
2987
// Read-only vetting of an MRemoveSnaps message.  Returns true (handled,
// no map change) when the sender lacks caps or every listed snap is
// already removed; returns false as soon as one snap actually needs
// removing, so prepare_remove_snaps can apply it.
bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
  dout(7) << "preprocess_remove_snaps " << *m << dendl;

  // check privilege, ignore if failed
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->caps.is_capable(
	g_ceph_context,
	CEPH_ENTITY_TYPE_MON,
	session->entity_name,
        "osd", "osd pool rmsnap", {}, true, true, false)) {
    dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
       q != m->snaps.end();
       ++q) {
    // snaps on pools that no longer exist are irrelevant
    if (!osdmap.have_pg_pool(q->first)) {
      dout(10) << " ignoring removed_snaps " << q->second << " on non-existent pool " << q->first << dendl;
      continue;
    }
    const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
    for (vector<snapid_t>::iterator p = q->second.begin();
	 p != q->second.end();
	 ++p) {
      // a snap beyond snap_seq or not yet in removed_snaps still needs work
      if (*p > pi->get_snap_seq() ||
	  !pi->removed_snaps.contains(*p))
	return false;
    }
  }

 ignore:
  return true;
}
3028
// Leader-side application of an MRemoveSnaps message: for every snap not
// already removed (in the committed map or a pending pool update), add it
// to the pool's removed_snaps in the pending incremental, advancing
// snap_seq and snap_epoch as needed.  Always returns true.
bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
  dout(7) << "prepare_remove_snaps " << *m << dendl;

  for (map<int, vector<snapid_t> >::iterator p = m->snaps.begin();
       p != m->snaps.end();
       ++p) {

    if (!osdmap.have_pg_pool(p->first)) {
      dout(10) << " ignoring removed_snaps " << p->second << " on non-existent pool " << p->first << dendl;
      continue;
    }

    pg_pool_t& pi = osdmap.pools[p->first];
    for (vector<snapid_t>::iterator q = p->second.begin();
	 q != p->second.end();
	 ++q) {
      // skip snaps already marked removed either in the committed pool or
      // in a pending copy of it
      if (!pi.removed_snaps.contains(*q) &&
	  (!pending_inc.new_pools.count(p->first) ||
	   !pending_inc.new_pools[p->first].removed_snaps.contains(*q))) {
	pg_pool_t *newpi = pending_inc.get_new_pool(p->first, &pi);
	newpi->removed_snaps.insert(*q);
	dout(10) << " pool " << p->first << " removed_snaps added " << *q
		 << " (now " << newpi->removed_snaps << ")" << dendl;
	// keep snap_seq at or beyond every removed snap id
	if (*q > newpi->get_snap_seq()) {
	  dout(10) << " pool " << p->first << " snap_seq " << newpi->get_snap_seq() << " -> " << *q << dendl;
	  newpi->set_snap_seq(*q);
	}
	newpi->set_snap_epoch(pending_inc.epoch);
      }
    }
  }
  return true;
}
3065
3066// osd beacon
// osd beacon
// Read-only vetting of an osd beacon: drop it here (returning true) when
// there is no session or insufficient caps; otherwise forward to the
// leader, which tracks beacon times to mark silent osds down.
bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto beacon = static_cast<MOSDBeacon*>(op->get_req());
  // check caps
  auto session = beacon->get_session();
  // beacons are never replied to
  mon->no_reply(op);
  if (!session) {
    dout(10) << __func__ << " no monitor session!" << dendl;
    return true;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
         << "with insufficient privileges " << session->caps << dendl;
    return true;
  }
  // Always forward the beacon to the leader, even if they are the same as
  // the old one. The leader will mark as down osds that haven't sent
  // beacon for a few minutes.
  return false;
}
3088
// Leader-side beacon handling: record the beacon time and the osd's map
// epoch, and fold its per-pg last-epoch-clean reports into our tracker.
// Returns false in all cases — beacons never require a map proposal by
// themselves (handle_osd_timeouts consumes last_osd_report later).
bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const auto beacon = static_cast<MOSDBeacon*>(op->get_req());
  const auto src = beacon->get_orig_source();
  dout(10) << __func__ << " " << *beacon
	   << " from " << src << dendl;
  int from = src.num();

  // ignore beacons from osds that are down or whose address is stale
  if (!src.is_osd() ||
      !osdmap.is_up(from) ||
      beacon->get_orig_source_inst() != osdmap.get_inst(from)) {
    dout(1) << " ignoring beacon from non-active osd." << dendl;
    return false;
  }

  last_osd_report[from] = ceph_clock_now();
  osd_epochs[from] = beacon->version;

  for (const auto& pg : beacon->pgs) {
    last_epoch_clean.report(pg, beacon->min_last_epoch_clean);
  }
  return false;
}
3113
3114// ---------------
3115// map helpers
3116
3117void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
3118{
3119 op->mark_osdmon_event(__func__);
3120 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
3121 << " start " << start << dendl;
3122 if (start == 0)
3123 send_full(op);
3124 else
3125 send_incremental(op, start);
3126}
3127
3128
3129MOSDMap *OSDMonitor::build_latest_full()
3130{
3131 MOSDMap *r = new MOSDMap(mon->monmap->fsid);
3132 get_version_full(osdmap.get_epoch(), r->maps[osdmap.get_epoch()]);
3133 r->oldest_map = get_first_committed();
3134 r->newest_map = osdmap.get_epoch();
3135 return r;
3136}
3137
// Build (but do not send) an MOSDMap carrying the incremental maps for
// epochs [from..to].  If an incremental is missing from the store, the
// full map for that epoch is substituted; aborts if neither exists, since
// every epoch in the committed range must be recoverable.
MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to)
{
  dout(10) << "build_incremental [" << from << ".." << to << "]" << dendl;
  MOSDMap *m = new MOSDMap(mon->monmap->fsid);
  m->oldest_map = get_first_committed();
  m->newest_map = osdmap.get_epoch();

  // walk from newest to oldest; guard e > 0 since epoch_t is unsigned
  for (epoch_t e = to; e >= from && e > 0; e--) {
    bufferlist bl;
    int err = get_version(e, bl);
    if (err == 0) {
      assert(bl.length());
      // if (get_version(e, bl) > 0) {
      dout(20) << "build_incremental    inc " << e << " "
	       << bl.length() << " bytes" << dendl;
      m->incremental_maps[e] = bl;
    } else {
      assert(err == -ENOENT);
      assert(!bl.length());
      get_version_full(e, bl);
      if (bl.length() > 0) {
      //else if (get_version("full", e, bl) > 0) {
	dout(20) << "build_incremental   full " << e << " "
		 << bl.length() << " bytes" << dendl;
	m->maps[e] = bl;
      } else {
	ceph_abort();  // we should have all maps.
      }
    }
  }
  return m;
}
3170
// Reply to the op with the latest full osdmap.
void OSDMonitor::send_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
  mon->send_reply(op, build_latest_full());
}
3177
// Send incremental maps starting at 'first' in reply to an op.  If the op
// arrived via a proxying monitor that understands routed osdmap replies,
// delegate the actual map send to that monitor (it is closer to the
// client); otherwise send the maps ourselves.
void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
{
  op->mark_osdmon_event(__func__);

  MonSession *s = op->get_session();
  assert(s);

  if (s->proxy_con &&
      s->proxy_con->has_feature(CEPH_FEATURE_MON_ROUTE_OSDMAP)) {
    // oh, we can tell the other mon to do it
    dout(10) << __func__ << " asking proxying mon to send_incremental from "
	     << first << dendl;
    MRoute *r = new MRoute(s->proxy_tid, NULL);
    r->send_osdmap_first = first;
    s->proxy_con->send_message(r);
    op->mark_event("reply: send routed send_osdmap_first reply");
  } else {
    // do it ourselves
    send_incremental(first, s, false, op);
  }
}
3199
// Send incremental maps [first..current] to a session, tracking progress
// in session->osd_epoch so we never resend maps the peer already has.
// If 'first' predates our oldest committed epoch, a full map is sent as a
// base first.  When 'req' is set we send exactly one reply message and
// stop (the requester will ask again); 'onetime' likewise limits us to a
// single batch.
void OSDMonitor::send_incremental(epoch_t first,
				  MonSession *session,
				  bool onetime,
				  MonOpRequestRef req)
{
  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
	  << " to " << session->inst << dendl;

  // skip epochs the session is already known to have
  if (first <= session->osd_epoch) {
    dout(10) << __func__ << " " << session->inst << " should already have epoch "
	     << session->osd_epoch << dendl;
    first = session->osd_epoch + 1;
  }

  if (first < get_first_committed()) {
    // we no longer have the requested incrementals; start from a full map
    first = get_first_committed();
    bufferlist bl;
    int err = get_version_full(first, bl);
    assert(err == 0);
    assert(bl.length());

    dout(20) << "send_incremental starting with base full "
	     << first << " " << bl.length() << " bytes" << dendl;

    MOSDMap *m = new MOSDMap(osdmap.get_fsid());
    m->oldest_map = get_first_committed();
    m->newest_map = osdmap.get_epoch();
    m->maps[first] = bl;

    if (req) {
      // one reply per request; the peer will come back for the rest
      mon->send_reply(req, m);
      session->osd_epoch = first;
      return;
    } else {
      session->con->send_message(m);
      session->osd_epoch = first;
    }
    first++;
  }

  while (first <= osdmap.get_epoch()) {
    // batch size is capped by osd_map_message_max epochs per message
    epoch_t last = MIN(first + g_conf->osd_map_message_max - 1,
		       osdmap.get_epoch());
    MOSDMap *m = build_incremental(first, last);

    if (req) {
      // send some maps.  it may not be all of them, but it will get them
      // started.
      mon->send_reply(req, m);
    } else {
      session->con->send_message(m);
      first = last + 1;
    }
    session->osd_epoch = last;
    if (onetime || req)
      break;
  }
}
3258
3259int OSDMonitor::get_version(version_t ver, bufferlist& bl)
3260{
3261 if (inc_osd_cache.lookup(ver, &bl)) {
3262 return 0;
3263 }
3264 int ret = PaxosService::get_version(ver, bl);
3265 if (!ret) {
3266 inc_osd_cache.add(ver, bl);
3267 }
3268 return ret;
3269}
3270
3271int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
3272{
3273 if (full_osd_cache.lookup(ver, &bl)) {
3274 return 0;
3275 }
3276 int ret = PaxosService::get_version_full(ver, bl);
3277 if (!ret) {
3278 full_osd_cache.add(ver, bl);
3279 }
3280 return ret;
3281}
3282
// Queue a blacklist entry for an address until the given time.  The entry
// takes effect when the pending incremental commits; that epoch is
// returned so callers can wait for it.
epoch_t OSDMonitor::blacklist(const entity_addr_t& a, utime_t until)
{
  dout(10) << "blacklist " << a << " until " << until << dendl;
  pending_inc.new_blacklist[a] = until;
  return pending_inc.epoch;
}
3289
3290
3291void OSDMonitor::check_osdmap_subs()
3292{
3293 dout(10) << __func__ << dendl;
3294 if (!osdmap.get_epoch()) {
3295 return;
3296 }
3297 auto osdmap_subs = mon->session_map.subs.find("osdmap");
3298 if (osdmap_subs == mon->session_map.subs.end()) {
3299 return;
3300 }
3301 auto p = osdmap_subs->second->begin();
3302 while (!p.end()) {
3303 auto sub = *p;
3304 ++p;
3305 check_osdmap_sub(sub);
3306 }
3307}
3308
// Service a single "osdmap" subscription: if the subscriber wants an
// epoch we already have, send it the missing incrementals (or the full
// map when next == 0), then either drop a onetime sub or advance its
// cursor past the current epoch.
void OSDMonitor::check_osdmap_sub(Subscription *sub)
{
  dout(10) << __func__ << " " << sub << " next " << sub->next
	   << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
  if (sub->next <= osdmap.get_epoch()) {
    if (sub->next >= 1)
      send_incremental(sub->next, sub->session, sub->incremental_onetime);
    else
      // next == 0 means "send me the latest full map"
      sub->session->con->send_message(build_latest_full());
    if (sub->onetime)
      mon->session_map.remove_sub(sub);
    else
      sub->next = osdmap.get_epoch() + 1;
  }
}
3324
// Service all "osd_pg_creates" subscriptions, sending pg-create messages
// to subscribed osds.  Only relevant once the cluster requires luminous
// monitors; before that, PGMonitor handles pg creation.
void OSDMonitor::check_pg_creates_subs()
{
  if (!mon->monmap->get_required_features().contains_all(
        ceph::features::mon::FEATURE_LUMINOUS)) {
    // PGMonitor takes care of this in pre-luminous era.
    return;
  }
  if (!osdmap.get_num_up_osds()) {
    return;
  }
  // all up osds must support stateful subs for this path to be valid
  assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
  mon->with_session_map([this](const MonSessionMap& session_map) {
      auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
      if (pg_creates_subs == session_map.subs.end()) {
	return;
      }
      for (auto sub : *pg_creates_subs->second) {
	check_pg_creates_sub(sub);
      }
    });
}
3346
// Service one "osd_pg_creates" subscription: if the subscriber is an up
// osd, send it any pending pg-create messages from sub->next onward and
// advance the sub's cursor to the returned epoch.
void OSDMonitor::check_pg_creates_sub(Subscription *sub)
{
  dout(20) << __func__ << " .. " << sub->session->inst << dendl;
  assert(sub->type == "osd_pg_creates");
  // only send these if the OSD is up.  we will check_subs() when they do
  // come up so they will get the creates then.
  if (sub->session->inst.name.is_osd() &&
      mon->osdmon()->osdmap.is_up(sub->session->inst.name.num())) {
    sub->next = send_pg_creates(sub->session->inst.name.num(),
				sub->session->con.get(),
				sub->next);
  }
}
3360
c07f9fc5
FG
// Enable an application tag on a pool by adding app_name (with no
// key/value metadata) to the pool's application_metadata in the pending
// incremental.  Caller must hold paxos plugged and the map writeable, and
// the cluster must already require (or be about to require) luminous.
void OSDMonitor::do_application_enable(int64_t pool_id,
                                       const std::string &app_name)
{
  assert(paxos->is_plugged() && is_writeable());

  dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
           << dendl;

  assert(osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS ||
	 pending_inc.new_require_osd_release >= CEPH_RELEASE_LUMINOUS);

  auto pp = osdmap.get_pg_pool(pool_id);
  assert(pp != nullptr);

  // start from the pending copy of the pool if one exists, so we don't
  // clobber other queued pool changes
  pg_pool_t p = *pp;
  if (pending_inc.new_pools.count(pool_id)) {
    p = pending_inc.new_pools[pool_id];
  }

  p.application_metadata.insert({app_name, {}});
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool_id] = p;
}
3384
// Scan the given pools for ones created or changed since the last scan
// and queue their pg creation in creating_pgs.  Pools with an invalid
// crush rule or that are being removed are skipped.  Returns the number
// of pools newly queued.
unsigned OSDMonitor::scan_for_creating_pgs(
  const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
  const mempool::osdmap::set<int64_t>& removed_pools,
  utime_t modified,
  creating_pgs_t* creating_pgs) const
{
  unsigned queued = 0;
  for (auto& p : pools) {
    int64_t poolid = p.first;
    const pg_pool_t& pool = p.second;
    int ruleno = osdmap.crush->find_rule(pool.get_crush_rule(),
					 pool.get_type(), pool.get_size());
    if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
      continue;

    // pools untouched since the last scan were already queued (or done)
    const auto last_scan_epoch = creating_pgs->last_scan_epoch;
    const auto created = pool.get_last_change();
    if (last_scan_epoch && created <= last_scan_epoch) {
      dout(10) << __func__ << " no change in pool " << poolid
	       << " " << pool << dendl;
      continue;
    }
    if (removed_pools.count(poolid)) {
      dout(10) << __func__ << " pool is being removed: " << poolid
	       << " " << pool << dendl;
      continue;
    }
    dout(10) << __func__ << " queueing pool create for " << poolid
	     << " " << pool << dendl;
    if (creating_pgs->create_pool(poolid, pool.get_pg_num(),
				  created, modified)) {
      queued++;
    }
  }
  return queued;
}
3421
// Rebuild creating_pgs_by_osd_epoch (acting-primary -> map-epoch -> pgs)
// from creating_pgs, using the current pg-to-osd mapping.  For each
// creating pg we keep the epoch at which its create message was last
// addressed, bumping it to the current mapping epoch whenever the target
// primary changed, so osds can dedup/refresh create messages correctly.
void OSDMonitor::update_creating_pgs()
{
  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
	   << creating_pgs.queue.size() << " pools in queue" << dendl;
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (const auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    if (!osdmap.pg_exists(pgid)) {
      dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
	       << dendl;
      continue;
    }
    // default to the pg's creation epoch unless we learn better below
    auto mapped = pg.second.first;
    dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
    mapping.get(pgid, nullptr, nullptr, nullptr, &acting_primary);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
	if (pgs.second.count(pgid)) {
	  if (last_acting_primary == acting_primary) {
	    // same target as before: keep the epoch we last used
	    mapped = pgs.first;
	  } else {
	    dout(20) << __func__ << " " << pgid << " "
		     << " acting_primary:" << last_acting_primary
		     << " -> " << acting_primary << dendl;
	    // note epoch if the target of the create message changed.
	    mapped = mapping.get_epoch();
	  }
	  break;
	} else {
	  // newly creating
	  mapped = mapping.get_epoch();
	}
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
	     << " to create " << pgid << "@" << mapped << dendl;
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(pgid);
  }
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
3468
// Send an MOSDPGCreate to osd carrying every pending pg create addressed
// at epoch >= next.  Returns the epoch the subscription is now current
// through (last sent epoch + 1), or 'next' unchanged if nothing was sent
// or the cached mapping is stale.
epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
{
  dout(30) << __func__ << " osd." << osd << " next=" << next
	   << " " << creating_pgs_by_osd_epoch << dendl;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
    dout(20) << __func__
	     << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
    // the subscribers will be updated when the mapping is completed anyway
    return next;
  }
  auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
  if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
    return next;
  assert(!creating_pgs_by_epoch->second.empty());

  MOSDPGCreate *m = nullptr;
  epoch_t last = 0;
  // only epochs at or beyond the sub's cursor need (re)sending
  for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
       epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
    auto epoch = epoch_pgs->first;
    auto& pgs = epoch_pgs->second;
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
    last = epoch;
    for (auto& pg : pgs) {
      if (!m)
	m = new MOSDPGCreate(creating_pgs_epoch);
      // Need the create time from the monitor using its clock to set
      // last_scrub_stamp upon pg creation.
      auto create = creating_pgs.pgs.find(pg);
      assert(create != creating_pgs.pgs.end());
      m->mkpg.emplace(pg, pg_create_t{create->second.first, pg, 0});
      m->ctimes.emplace(pg, create->second.second);
      dout(20) << __func__ << " will create " << pg
	       << " at " << create->second.first << dendl;
    }
  }
  if (!m) {
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " has nothing to send" << dendl;
    return next;
  }
  con->send_message(m);
  // sub is current through last + 1
  return last + 1;
}
3516
3517// TICK
3518
3519
// Periodic leader-only maintenance: mark silent osds down, auto-out osds
// that have been down past their grace period, expire blacklist entries,
// maintain legacy full/nearfull flags, and propose a new map if any of
// the above (or a pending pg_temp change) requires it.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // everything below mutates pending_inc; only the leader may do that
  if (!mon->is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  // beacon-based down detection is only valid on all-luminous clusters
  if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
      mon->monmap->get_required_features().contains_all(
        ceph::features::mon::FEATURE_LUMINOUS)) {
    if (handle_osd_timeouts(now, last_osd_report)) {
      do_propose = true;
    }
  }
  // one-shot upgrade flag: set once the mgr confirms every snapset is
  // converted
  if (!osdmap.test_flag(CEPH_OSDMAP_PURGED_SNAPDIRS) &&
      osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
      mon->mgrstatmon()->is_readable() &&
      mon->mgrstatmon()->definitely_converted_snapsets()) {
    dout(1) << __func__ << " all snapsets converted, setting purged_snapdirs"
	    << dendl;
    add_flag(CEPH_OSDMAP_PURGED_SNAPDIRS);
    do_propose = true;
  }

  // mark osds down?
  if (check_failures(now))
    do_propose = true;

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      utime_t down = now;
      down -= i->second;
      // advance before possibly erasing o below
      ++i;

      if (osdmap.is_down(o) &&
	  osdmap.is_in(o) &&
	  can_mark_out(o)) {
	utime_t orig_grace(g_conf->mon_osd_down_out_interval, 0);
	utime_t grace = orig_grace;
	double my_grace = 0.0;

	if (g_conf->mon_osd_adjust_down_out_interval) {
	  // scale grace period the same way we do the heartbeat grace.
	  const osd_xinfo_t& xi = osdmap.get_xinfo(o);
	  double halflife = (double)g_conf->mon_osd_laggy_halflife;
	  double decay_k = ::log(.5) / halflife;
	  double decay = exp((double)down * decay_k);
	  dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
		   << " down for " << down << " decay " << decay << dendl;
	  my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
	  grace += my_grace;
	}

	// is this an entire large subtree down?
	if (g_conf->mon_osd_down_out_subtree_limit.length()) {
	  int type = osdmap.crush->get_type_id(g_conf->mon_osd_down_out_subtree_limit);
	  if (type > 0) {
	    if (osdmap.containing_subtree_is_down(g_ceph_context, o, type, &down_cache)) {
	      dout(10) << "tick entire containing " << g_conf->mon_osd_down_out_subtree_limit
		       << " subtree for osd." << o << " is down; resetting timer" << dendl;
	      // reset timer, too.
	      down_pending_out[o] = now;
	      continue;
	    }
	  }
	}

	// destroyed osds get their own (usually shorter) out interval
	bool down_out = !osdmap.is_destroyed(o) &&
	  g_conf->mon_osd_down_out_interval > 0 && down.sec() >= grace;
	bool destroyed_out = osdmap.is_destroyed(o) &&
	  g_conf->mon_osd_destroyed_out_interval > 0 &&
	  // this is not precise enough as we did not make a note when this osd
	  // was marked as destroyed, but let's not bother with that
	  // complexity for now.
	  down.sec() >= g_conf->mon_osd_destroyed_out_interval;
	if (down_out || destroyed_out) {
	  dout(10) << "tick marking osd." << o << " OUT after " << down
		   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
	  pending_inc.new_weight[o] = CEPH_OSD_OUT;

	  // set the AUTOOUT bit.
	  if (pending_inc.new_state.count(o) == 0)
	    pending_inc.new_state[o] = 0;
	  pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

	  // remember previous weight
	  if (pending_inc.new_xinfo.count(o) == 0)
	    pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

	  do_propose = true;

	  mon->clog->info() << "Marking osd." << o << " out (has been down for "
                            << int(down.sec()) << " seconds)";
	} else
	  continue;
      }

      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blacklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
       p != osdmap.blacklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blacklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blacklist.push_back(p->first);
      do_propose = true;
    }
  }

  // if map full setting has changed, get that info out there!
  if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS &&
      mon->pgservice->is_readable()) {
    // for pre-luminous compat only!
    if (mon->pgservice->have_full_osds()) {
      dout(5) << "There are full osds, setting full flag" << dendl;
      add_flag(CEPH_OSDMAP_FULL);
    } else if (osdmap.test_flag(CEPH_OSDMAP_FULL)){
      dout(10) << "No full osds, removing full flag" << dendl;
      remove_flag(CEPH_OSDMAP_FULL);
    }

    if (mon->pgservice->have_nearfull_osds()) {
      dout(5) << "There are near full osds, setting nearfull flag" << dendl;
      add_flag(CEPH_OSDMAP_NEARFULL);
    } else if (osdmap.test_flag(CEPH_OSDMAP_NEARFULL)){
      dout(10) << "No near full osds, removing nearfull flag" << dendl;
      remove_flag(CEPH_OSDMAP_NEARFULL);
    }
    if (pending_inc.new_flags != -1 &&
	(pending_inc.new_flags ^ osdmap.flags) & (CEPH_OSDMAP_FULL | CEPH_OSDMAP_NEARFULL)) {
      dout(1) << "New setting for" <<
	(pending_inc.new_flags & CEPH_OSDMAP_FULL ? " CEPH_OSDMAP_FULL" : "") <<
	(pending_inc.new_flags & CEPH_OSDMAP_NEARFULL ? " CEPH_OSDMAP_NEARFULL" : "")
	<< " -- doing propose" << dendl;
      do_propose = true;
    }
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty())  // also propose if we adjusted pg_temp
    propose_pending();
}
3686
// Scan all up osds for ones that have not sent a beacon within
// mon_osd_report_timeout and queue them to be marked down.  Skipped until
// we have been leader at least one timeout period, so osds have a fair
// chance to report after an election.  Returns true if any osd was
// queued down (caller should propose).
bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
				     std::map<int,utime_t> &last_osd_report)
{
  utime_t timeo(g_conf->mon_osd_report_timeout, 0);
  if (now - mon->get_leader_since() < timeo) {
    // We haven't been the leader for long enough to consider OSD timeouts
    return false;
  }

  int max_osd = osdmap.get_max_osd();
  bool new_down = false;

  for (int i=0; i < max_osd; ++i) {
    dout(30) << __func__ << ": checking up on osd " << i << dendl;
    if (!osdmap.exists(i)) {
      last_osd_report.erase(i); // if any
      continue;
    }
    if (!osdmap.is_up(i))
      continue;
    const std::map<int,utime_t>::const_iterator t = last_osd_report.find(i);
    if (t == last_osd_report.end()) {
      // it wasn't in the map; start the timer.
      last_osd_report[i] = now;
    } else if (can_mark_down(i)) {
      utime_t diff = now - t->second;
      if (diff > timeo) {
	mon->clog->info() << "osd." << i << " marked down after no beacon for "
			  << diff << " seconds";
	derr << "no beacon from osd." << i << " since " << t->second
	     << ", " << diff << " seconds ago.  marking down" << dendl;
	// NOTE(review): new_state appears to be applied as an XOR mask when
	// the incremental commits, so CEPH_OSD_UP here flips the (set) up
	// bit and marks the osd down — confirm against OSDMap::Incremental.
	pending_inc.new_state[i] = CEPH_OSD_UP;
	new_down = true;
      }
    }
  }
  return new_down;
}
3725
// Populate 'summary' (and optionally 'detail') with health warnings/errors
// derived from the current osdmap: down OSDs and down CRUSH subtrees,
// stray CRUSH entries, warning flags, legacy tunables, cache-pool
// misconfiguration, missing 'sortbitwise', a zero down-out interval,
// missing require_osd_release upgrade flags, and full pools.
// NOTE(review): 'cct' is unused in this body — presumably kept for
// interface symmetry with other *Monitor::get_health() implementations;
// confirm against the base class.
void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
			    list<pair<health_status_t,string> > *detail,
			    CephContext *cct) const
{
  int num_osds = osdmap.get_num_osds();

  if (num_osds == 0) {
    // an osdmap with no OSDs at all is an error, not a warning
    summary.push_back(make_pair(HEALTH_ERR, "no osds"));
  } else {
    int num_in_osds = 0;
    int num_down_in_osds = 0;
    set<int> osds;          // ids present in crush but absent from the osdmap
    set<int> down_in_osds;  // OSDs that are "in" but not up
    set<int> up_in_osds;    // OSDs known up (filled by subtree_type_is_down)
    set<int> subtree_up;    // crush buckets already proven to contain an up OSD
    // per crush type id -> set of bucket ids of that type considered down
    unordered_map<int, set<int> > subtree_type_down;
    // bucket id -> number of OSDs underneath it (for down buckets)
    unordered_map<int, int> num_osds_subtree;
    int max_type = osdmap.crush->get_max_type_id();

    for (int i = 0; i < osdmap.get_max_osd(); i++) {
      if (!osdmap.exists(i)) {
        if (osdmap.crush->item_exists(i)) {
          // stray crush entry with no matching osdmap entry
          osds.insert(i);
        }
	continue;
      }
      if (osdmap.is_out(i))
	continue;
      ++num_in_osds;
      // already classified while walking a previous OSD's ancestry
      if (down_in_osds.count(i) || up_in_osds.count(i))
	continue;
      if (!osdmap.is_up(i)) {
	down_in_osds.insert(i);
	int parent_id = 0;
	int current = i;
	// climb the crush hierarchy, marking each ancestor bucket down
	// for as long as the whole subtree is down
	for (int type = 0; type <= max_type; type++) {
	  if (!osdmap.crush->get_type_name(type))
	    continue;
	  int r = osdmap.crush->get_immediate_parent_id(current, &parent_id);
	  if (r == -ENOENT)
	    break;
	  // break early if this parent is already marked as up
	  if (subtree_up.count(parent_id))
	    break;
	  type = osdmap.crush->get_bucket_type(parent_id);
	  if (!osdmap.subtree_type_is_down(
		g_ceph_context, parent_id, type,
		&down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
	    break;
	  current = parent_id;
	}
      }
    }

    // calculate the number of down osds in each down subtree and
    // store it in num_osds_subtree
    for (int type = 1; type <= max_type; type++) {
      if (!osdmap.crush->get_type_name(type))
	continue;
      for (auto j = subtree_type_down[type].begin();
	   j != subtree_type_down[type].end();
	   ++j) {
	if (type == 1) {
	  // type 1 buckets (hosts) hold OSDs directly; just count children
          list<int> children;
          int num = osdmap.crush->get_children(*j, &children);
          num_osds_subtree[*j] = num;
        } else {
	  // higher-level buckets: sum the counts already computed for
	  // their (lower type, processed earlier) children
          list<int> children;
          int num = 0;
          int num_children = osdmap.crush->get_children(*j, &children);
          if (num_children == 0)
	    continue;
          for (auto l = children.begin(); l != children.end(); ++l) {
            if (num_osds_subtree[*l] > 0) {
              num = num + num_osds_subtree[*l];
            }
          }
          num_osds_subtree[*j] = num;
        }
      }
    }
    num_down_in_osds = down_in_osds.size();
    assert(num_down_in_osds <= num_in_osds);
    if (num_down_in_osds > 0) {
      // summary of down subtree types and osds
      for (int type = max_type; type > 0; type--) {
	if (!osdmap.crush->get_type_name(type))
	  continue;
	if (subtree_type_down[type].size() > 0) {
	  ostringstream ss;
	  ss << subtree_type_down[type].size() << " "
	     << osdmap.crush->get_type_name(type);
	  if (subtree_type_down[type].size() > 1) {
	    // naive pluralization of the crush type name
	    ss << "s";
	  }
	  int sum_down_osds = 0;
	  for (auto j = subtree_type_down[type].begin();
	       j != subtree_type_down[type].end();
	       ++j) {
	    sum_down_osds = sum_down_osds + num_osds_subtree[*j];
	  }
	  ss << " (" << sum_down_osds << " osds) down";
	  summary.push_back(make_pair(HEALTH_WARN, ss.str()));
	}
      }
      ostringstream ss;
      ss << down_in_osds.size() << " osds down";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));

      if (detail) {
	// details of down subtree types
	for (int type = max_type; type > 0; type--) {
	  if (!osdmap.crush->get_type_name(type))
	    continue;
	  for (auto j = subtree_type_down[type].rbegin();
	       j != subtree_type_down[type].rend();
	       ++j) {
	    ostringstream ss;
	    ss << osdmap.crush->get_type_name(type);
	    ss << " ";
	    ss << osdmap.crush->get_item_name(*j);
	    // at the top level, do not print location
	    if (type != max_type) {
	      ss << " (";
	      ss << osdmap.crush->get_full_location_ordered_string(*j);
	      ss << ")";
	    }
	    int num = num_osds_subtree[*j];
	    ss << " (" << num << " osds)";
	    ss << " is down";
	    detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	  }
	}
	// details of down osds
	for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
	  ostringstream ss;
	  ss << "osd." << *it << " (";
	  ss << osdmap.crush->get_full_location_ordered_string(*it);
	  ss << ") is down";
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	}
      }
    }

    // stray crush entries collected above
    if (!osds.empty()) {
      ostringstream ss;
      ss << osds.size() << " osds exist in the crush map but not in the osdmap";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      if (detail) {
	// detail message extends the summary text with the actual ids
	ss << " (osds: " << osds << ")";
	detail->push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }

    // note: we leave it to ceph-mgr to generate details health warnings
    // with actual osd utilizations

    // warn about flags
    uint64_t warn_flags =
      CEPH_OSDMAP_FULL |
      CEPH_OSDMAP_PAUSERD |
      CEPH_OSDMAP_PAUSEWR |
      CEPH_OSDMAP_PAUSEREC |
      CEPH_OSDMAP_NOUP |
      CEPH_OSDMAP_NODOWN |
      CEPH_OSDMAP_NOIN |
      CEPH_OSDMAP_NOOUT |
      CEPH_OSDMAP_NOBACKFILL |
      CEPH_OSDMAP_NORECOVER |
      CEPH_OSDMAP_NOSCRUB |
      CEPH_OSDMAP_NODEEP_SCRUB |
      CEPH_OSDMAP_NOTIERAGENT |
      CEPH_OSDMAP_NOREBALANCE;
    if (osdmap.test_flag(warn_flags)) {
      ostringstream ss;
      ss << osdmap.get_flag_string(osdmap.get_flags() & warn_flags)
	 << " flag(s) set";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      if (detail)
	detail->push_back(make_pair(HEALTH_WARN, ss.str()));
    }

    // old crush tunables?
    if (g_conf->mon_warn_on_legacy_crush_tunables) {
      string min = osdmap.crush->get_min_required_version();
      // NOTE(review): lexical string comparison of release names —
      // relies on release names sorting correctly; confirm upstream.
      if (min < g_conf->mon_crush_min_required_version) {
	ostringstream ss;
	ss << "crush map has legacy tunables (require " << min
	   << ", min is " << g_conf->mon_crush_min_required_version << ")";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
	if (detail) {
	  ss << "; see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables";
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	}
      }
    }
    if (g_conf->mon_warn_on_crush_straw_calc_version_zero) {
      if (osdmap.crush->get_straw_calc_version() == 0) {
	ostringstream ss;
	ss << "crush map has straw_calc_version=0";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
	if (detail) {
	  ss << "; see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables";
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	}
      }
    }

    // hit_set-less cache_mode?
    if (g_conf->mon_warn_on_cache_pools_without_hit_sets) {
      int problem_cache_pools = 0;
      for (map<int64_t, pg_pool_t>::const_iterator p = osdmap.pools.begin();
	   p != osdmap.pools.end();
	   ++p) {
	const pg_pool_t& info = p->second;
	if (info.cache_mode_requires_hit_set() &&
	    info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
	  ++problem_cache_pools;
	  if (detail) {
	    ostringstream ss;
	    ss << "pool '" << osdmap.get_pool_name(p->first)
	       << "' with cache_mode " << info.get_cache_mode_name()
	       << " needs hit_set_type to be set but it is not";
	    detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	  }
	}
      }
      if (problem_cache_pools) {
	ostringstream ss;
	ss << problem_cache_pools << " cache pools are missing hit_sets";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }

    // Not using 'sortbitwise' and should be?
    if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE) &&
	(osdmap.get_up_osd_features() &
	 CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)) {
      ostringstream ss;
      ss << "no legacy OSD present but 'sortbitwise' flag is not set";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
    }

    // Warn if 'mon_osd_down_out_interval' is set to zero.
    // Having this option set to zero on the leader acts much like the
    // 'noout' flag.  It's hard to figure out what's going wrong with clusters
    // without the 'noout' flag set but acting like that just the same, so
    // we report a HEALTH_WARN in case this option is set to zero.
    // This is an ugly hack to get the warning out, but until we find a way
    // to spread global options throughout the mon cluster and have all mons
    // using a base set of the same options, we need to work around this sort
    // of things.
    // There's also the obvious drawback that if this is set on a single
    // monitor on a 3-monitor cluster, this warning will only be shown every
    // third monitor connection.
    if (g_conf->mon_warn_on_osd_down_out_interval_zero &&
	g_conf->mon_osd_down_out_interval == 0) {
      ostringstream ss;
      ss << "mon." << mon->name << " has mon_osd_down_out_interval set to 0";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      if (detail) {
	ss << "; this has the same effect as the 'noout' flag";
	detail->push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }

    // warn about upgrade flags that can be set but are not.
    if (g_conf->mon_debug_no_require_luminous) {
      // ignore these checks
    } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_LUMINOUS) &&
	       osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
      string msg = "all OSDs are running luminous or later but"
	" require_osd_release < luminous";
      summary.push_back(make_pair(HEALTH_WARN, msg));
      if (detail) {
	detail->push_back(make_pair(HEALTH_WARN, msg));
      }
    } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_KRAKEN) &&
	       osdmap.require_osd_release < CEPH_RELEASE_KRAKEN) {
      string msg = "all OSDs are running kraken or later but"
	" require_osd_release < kraken";
      summary.push_back(make_pair(HEALTH_WARN, msg));
      if (detail) {
	detail->push_back(make_pair(HEALTH_WARN, msg));
      }
    } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_JEWEL) &&
	       osdmap.require_osd_release < CEPH_RELEASE_JEWEL) {
      string msg = "all OSDs are running jewel or later but"
	" require_osd_release < jewel";
      summary.push_back(make_pair(HEALTH_WARN, msg));
      if (detail) {
	detail->push_back(make_pair(HEALTH_WARN, msg));
      }
    }

    // per-pool FULL flag warnings
    for (auto it : osdmap.get_pools()) {
      const pg_pool_t &pool = it.second;
      if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
	const string& pool_name = osdmap.get_pool_name(it.first);
	stringstream ss;
	ss << "pool '" << pool_name << "' is full";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
	if (detail)
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }
  }
}
4034
4035void OSDMonitor::dump_info(Formatter *f)
4036{
4037 f->open_object_section("osdmap");
4038 osdmap.dump(f);
4039 f->close_section();
4040
4041 f->open_array_section("osd_metadata");
4042 for (int i=0; i<osdmap.get_max_osd(); ++i) {
4043 if (osdmap.exists(i)) {
4044 f->open_object_section("osd");
4045 f->dump_unsigned("id", i);
4046 dump_osd_metadata(i, f, NULL);
4047 f->close_section();
4048 }
4049 }
4050 f->close_section();
4051
4052 f->dump_unsigned("osdmap_first_committed", get_first_committed());
4053 f->dump_unsigned("osdmap_last_committed", get_last_committed());
4054
4055 f->open_object_section("crushmap");
4056 osdmap.crush->dump(f);
4057 f->close_section();
4058}
4059
namespace {
  // Every property that "osd pool get" can report.  The enumerator set and
  // order are relied upon by preprocess_command's ALL_CHOICES table below.
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE, CRASH_REPLAY_INTERVAL,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, AUID, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK };

  // Return the set difference first \ second, i.e. the choices from
  // 'first' that do not also appear in 'second'.
  std::set<osd_pool_get_choices>
  subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			     const std::set<osd_pool_get_choices>& second)
  {
    // copy 'first', then drop every element that 'second' contains
    std::set<osd_pool_get_choices> remaining(first);
    for (const auto& choice : second) {
      remaining.erase(choice);
    }
    return remaining;
  }
}
4091
4092
4093bool OSDMonitor::preprocess_command(MonOpRequestRef op)
4094{
4095 op->mark_osdmon_event(__func__);
4096 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
4097 int r = 0;
4098 bufferlist rdata;
4099 stringstream ss, ds;
4100
4101 map<string, cmd_vartype> cmdmap;
4102 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
4103 string rs = ss.str();
4104 mon->reply_command(op, -EINVAL, rs, get_last_committed());
4105 return true;
4106 }
4107
4108 MonSession *session = m->get_session();
4109 if (!session) {
4110 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
4111 return true;
4112 }
4113
4114 string prefix;
4115 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
4116
4117 string format;
4118 cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
4119 boost::scoped_ptr<Formatter> f(Formatter::create(format));
4120
4121 if (prefix == "osd stat") {
224ce89b 4122 osdmap.print_summary(f.get(), ds, "");
7c673cae
FG
4123 if (f)
4124 f->flush(rdata);
4125 else
4126 rdata.append(ds);
4127 }
4128 else if (prefix == "osd perf" ||
4129 prefix == "osd blocked-by") {
31f18b77
FG
4130 r = mon->pgservice->process_pg_command(prefix, cmdmap,
4131 osdmap, f.get(), &ss, &rdata);
7c673cae
FG
4132 }
4133 else if (prefix == "osd dump" ||
4134 prefix == "osd tree" ||
4135 prefix == "osd ls" ||
4136 prefix == "osd getmap" ||
31f18b77
FG
4137 prefix == "osd getcrushmap" ||
4138 prefix == "osd ls-tree") {
7c673cae
FG
4139 string val;
4140
4141 epoch_t epoch = 0;
4142 int64_t epochnum;
4143 cmd_getval(g_ceph_context, cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
4144 epoch = epochnum;
4145
4146 bufferlist osdmap_bl;
4147 int err = get_version_full(epoch, osdmap_bl);
4148 if (err == -ENOENT) {
4149 r = -ENOENT;
4150 ss << "there is no map for epoch " << epoch;
4151 goto reply;
4152 }
4153 assert(err == 0);
4154 assert(osdmap_bl.length());
4155
4156 OSDMap *p;
4157 if (epoch == osdmap.get_epoch()) {
4158 p = &osdmap;
4159 } else {
4160 p = new OSDMap;
4161 p->decode(osdmap_bl);
4162 }
4163
224ce89b
WB
4164 auto sg = make_scope_guard([&] {
4165 if (p != &osdmap) {
4166 delete p;
4167 }
4168 });
4169
7c673cae
FG
4170 if (prefix == "osd dump") {
4171 stringstream ds;
4172 if (f) {
4173 f->open_object_section("osdmap");
4174 p->dump(f.get());
4175 f->close_section();
4176 f->flush(ds);
4177 } else {
4178 p->print(ds);
4179 }
4180 rdata.append(ds);
4181 if (!f)
4182 ds << " ";
4183 } else if (prefix == "osd ls") {
4184 if (f) {
4185 f->open_array_section("osds");
4186 for (int i = 0; i < osdmap.get_max_osd(); i++) {
4187 if (osdmap.exists(i)) {
4188 f->dump_int("osd", i);
4189 }
4190 }
4191 f->close_section();
4192 f->flush(ds);
4193 } else {
4194 bool first = true;
4195 for (int i = 0; i < osdmap.get_max_osd(); i++) {
4196 if (osdmap.exists(i)) {
4197 if (!first)
4198 ds << "\n";
4199 first = false;
4200 ds << i;
4201 }
4202 }
4203 }
4204 rdata.append(ds);
4205 } else if (prefix == "osd tree") {
31f18b77
FG
4206 vector<string> states;
4207 cmd_getval(g_ceph_context, cmdmap, "states", states);
4208 unsigned filter = 0;
4209 for (auto& s : states) {
4210 if (s == "up") {
4211 filter |= OSDMap::DUMP_UP;
4212 } else if (s == "down") {
4213 filter |= OSDMap::DUMP_DOWN;
4214 } else if (s == "in") {
4215 filter |= OSDMap::DUMP_IN;
4216 } else if (s == "out") {
4217 filter |= OSDMap::DUMP_OUT;
c07f9fc5
FG
4218 } else if (s == "destroyed") {
4219 filter |= OSDMap::DUMP_DESTROYED;
31f18b77
FG
4220 } else {
4221 ss << "unrecognized state '" << s << "'";
4222 r = -EINVAL;
4223 goto reply;
4224 }
4225 }
4226 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
c07f9fc5
FG
4227 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
4228 ss << "cannot specify both 'in' and 'out'";
4229 r = -EINVAL;
4230 goto reply;
4231 }
4232 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
4233 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
4234 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
4235 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
4236 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
4237 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
4238 ss << "can specify only one of 'up', 'down' and 'destroyed'";
31f18b77
FG
4239 r = -EINVAL;
4240 goto reply;
4241 }
7c673cae
FG
4242 if (f) {
4243 f->open_object_section("tree");
31f18b77 4244 p->print_tree(f.get(), NULL, filter);
7c673cae
FG
4245 f->close_section();
4246 f->flush(ds);
4247 } else {
31f18b77 4248 p->print_tree(NULL, &ds, filter);
7c673cae
FG
4249 }
4250 rdata.append(ds);
4251 } else if (prefix == "osd getmap") {
4252 rdata.append(osdmap_bl);
4253 ss << "got osdmap epoch " << p->get_epoch();
4254 } else if (prefix == "osd getcrushmap") {
4255 p->crush->encode(rdata, mon->get_quorum_con_features());
31f18b77
FG
4256 ss << p->get_crush_version();
4257 } else if (prefix == "osd ls-tree") {
4258 string bucket_name;
4259 cmd_getval(g_ceph_context, cmdmap, "name", bucket_name);
4260 set<int> osds;
4261 r = p->get_osds_by_bucket_name(bucket_name, &osds);
4262 if (r == -ENOENT) {
4263 ss << "\"" << bucket_name << "\" does not exist";
4264 goto reply;
4265 } else if (r < 0) {
4266 ss << "can not parse bucket name:\"" << bucket_name << "\"";
4267 goto reply;
4268 }
4269
4270 if (f) {
4271 f->open_array_section("osds");
4272 for (auto &i : osds) {
4273 if (osdmap.exists(i)) {
4274 f->dump_int("osd", i);
4275 }
4276 }
4277 f->close_section();
4278 f->flush(ds);
4279 } else {
4280 bool first = true;
4281 for (auto &i : osds) {
4282 if (osdmap.exists(i)) {
4283 if (!first)
4284 ds << "\n";
4285 first = false;
4286 ds << i;
4287 }
4288 }
4289 }
4290
4291 rdata.append(ds);
7c673cae 4292 }
7c673cae
FG
4293 } else if (prefix == "osd df") {
4294 string method;
4295 cmd_getval(g_ceph_context, cmdmap, "output_method", method);
31f18b77
FG
4296 print_osd_utilization(osdmap, mon->pgservice, ds,
4297 f.get(), method == "tree");
7c673cae
FG
4298 rdata.append(ds);
4299 } else if (prefix == "osd getmaxosd") {
4300 if (f) {
4301 f->open_object_section("getmaxosd");
4302 f->dump_unsigned("epoch", osdmap.get_epoch());
4303 f->dump_int("max_osd", osdmap.get_max_osd());
4304 f->close_section();
4305 f->flush(rdata);
4306 } else {
4307 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
4308 rdata.append(ds);
4309 }
4310 } else if (prefix == "osd utilization") {
4311 string out;
4312 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
4313 if (f)
4314 f->flush(rdata);
4315 else
4316 rdata.append(out);
4317 r = 0;
4318 goto reply;
4319 } else if (prefix == "osd find") {
4320 int64_t osd;
4321 if (!cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
4322 ss << "unable to parse osd id value '"
4323 << cmd_vartype_stringify(cmdmap["id"]) << "'";
4324 r = -EINVAL;
4325 goto reply;
4326 }
4327 if (!osdmap.exists(osd)) {
4328 ss << "osd." << osd << " does not exist";
4329 r = -ENOENT;
4330 goto reply;
4331 }
4332 string format;
4333 cmd_getval(g_ceph_context, cmdmap, "format", format);
4334 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4335 f->open_object_section("osd_location");
4336 f->dump_int("osd", osd);
4337 f->dump_stream("ip") << osdmap.get_addr(osd);
4338 f->open_object_section("crush_location");
4339 map<string,string> loc = osdmap.crush->get_full_location(osd);
4340 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
4341 f->dump_string(p->first.c_str(), p->second);
4342 f->close_section();
4343 f->close_section();
4344 f->flush(rdata);
4345 } else if (prefix == "osd metadata") {
4346 int64_t osd = -1;
4347 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
4348 !cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
4349 ss << "unable to parse osd id value '"
4350 << cmd_vartype_stringify(cmdmap["id"]) << "'";
4351 r = -EINVAL;
4352 goto reply;
4353 }
4354 if (osd >= 0 && !osdmap.exists(osd)) {
4355 ss << "osd." << osd << " does not exist";
4356 r = -ENOENT;
4357 goto reply;
4358 }
4359 string format;
4360 cmd_getval(g_ceph_context, cmdmap, "format", format);
4361 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4362 if (osd >= 0) {
4363 f->open_object_section("osd_metadata");
4364 f->dump_unsigned("id", osd);
4365 r = dump_osd_metadata(osd, f.get(), &ss);
4366 if (r < 0)
4367 goto reply;
4368 f->close_section();
4369 } else {
4370 r = 0;
4371 f->open_array_section("osd_metadata");
4372 for (int i=0; i<osdmap.get_max_osd(); ++i) {
4373 if (osdmap.exists(i)) {
4374 f->open_object_section("osd");
4375 f->dump_unsigned("id", i);
4376 r = dump_osd_metadata(i, f.get(), NULL);
4377 if (r == -EINVAL || r == -ENOENT) {
4378 // Drop error, continue to get other daemons' metadata
4379 dout(4) << "No metadata for osd." << i << dendl;
4380 r = 0;
4381 } else if (r < 0) {
4382 // Unexpected error
4383 goto reply;
4384 }
4385 f->close_section();
4386 }
4387 }
4388 f->close_section();
4389 }
4390 f->flush(rdata);
31f18b77
FG
4391 } else if (prefix == "osd versions") {
4392 if (!f)
4393 f.reset(Formatter::create("json-pretty"));
4394 count_metadata("ceph_version", f.get());
4395 f->flush(rdata);
4396 r = 0;
4397 } else if (prefix == "osd count-metadata") {
4398 if (!f)
4399 f.reset(Formatter::create("json-pretty"));
4400 string field;
4401 cmd_getval(g_ceph_context, cmdmap, "property", field);
4402 count_metadata(field, f.get());
4403 f->flush(rdata);
4404 r = 0;
7c673cae
FG
4405 } else if (prefix == "osd map") {
4406 string poolstr, objstr, namespacestr;
4407 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
4408 cmd_getval(g_ceph_context, cmdmap, "object", objstr);
4409 cmd_getval(g_ceph_context, cmdmap, "nspace", namespacestr);
4410
4411 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
4412 if (pool < 0) {
4413 ss << "pool " << poolstr << " does not exist";
4414 r = -ENOENT;
4415 goto reply;
4416 }
4417 object_locator_t oloc(pool, namespacestr);
4418 object_t oid(objstr);
4419 pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
4420 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
4421 vector<int> up, acting;
4422 int up_p, acting_p;
4423 osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
4424
4425 string fullobjname;
4426 if (!namespacestr.empty())
4427 fullobjname = namespacestr + string("/") + oid.name;
4428 else
4429 fullobjname = oid.name;
4430 if (f) {
4431 f->open_object_section("osd_map");
4432 f->dump_unsigned("epoch", osdmap.get_epoch());
4433 f->dump_string("pool", poolstr);
4434 f->dump_int("pool_id", pool);
4435 f->dump_stream("objname") << fullobjname;
4436 f->dump_stream("raw_pgid") << pgid;
4437 f->dump_stream("pgid") << mpgid;
4438 f->open_array_section("up");
4439 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
4440 f->dump_int("osd", *p);
4441 f->close_section();
4442 f->dump_int("up_primary", up_p);
4443 f->open_array_section("acting");
4444 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
4445 f->dump_int("osd", *p);
4446 f->close_section();
4447 f->dump_int("acting_primary", acting_p);
4448 f->close_section(); // osd_map
4449 f->flush(rdata);
4450 } else {
4451 ds << "osdmap e" << osdmap.get_epoch()
4452 << " pool '" << poolstr << "' (" << pool << ")"
4453 << " object '" << fullobjname << "' ->"
4454 << " pg " << pgid << " (" << mpgid << ")"
4455 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
4456 << pg_vector_string(acting) << ", p" << acting_p << ")";
4457 rdata.append(ds);
4458 }
4459
4460 } else if (prefix == "pg map") {
4461 pg_t pgid;
4462 string pgidstr;
4463 cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr);
4464 if (!pgid.parse(pgidstr.c_str())) {
4465 ss << "invalid pgid '" << pgidstr << "'";
4466 r = -EINVAL;
4467 goto reply;
4468 }
4469 vector<int> up, acting;
4470 if (!osdmap.have_pg_pool(pgid.pool())) {
4471 ss << "pg '" << pgidstr << "' does not exist";
4472 r = -ENOENT;
4473 goto reply;
4474 }
4475 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
4476 osdmap.pg_to_up_acting_osds(pgid, up, acting);
4477 if (f) {
4478 f->open_object_section("pg_map");
4479 f->dump_unsigned("epoch", osdmap.get_epoch());
4480 f->dump_stream("raw_pgid") << pgid;
4481 f->dump_stream("pgid") << mpgid;
4482 f->open_array_section("up");
4483 for (auto osd : up) {
4484 f->dump_int("up_osd", osd);
4485 }
4486 f->close_section();
4487 f->open_array_section("acting");
4488 for (auto osd : acting) {
4489 f->dump_int("acting_osd", osd);
4490 }
4491 f->close_section();
4492 f->close_section();
4493 f->flush(rdata);
4494 } else {
4495 ds << "osdmap e" << osdmap.get_epoch()
4496 << " pg " << pgid << " (" << mpgid << ")"
4497 << " -> up " << up << " acting " << acting;
4498 rdata.append(ds);
4499 }
4500 goto reply;
4501
224ce89b
WB
4502 } else if (prefix == "osd scrub" ||
4503 prefix == "osd deep-scrub" ||
4504 prefix == "osd repair") {
7c673cae
FG
4505 string whostr;
4506 cmd_getval(g_ceph_context, cmdmap, "who", whostr);
4507 vector<string> pvec;
4508 get_str_vec(prefix, pvec);
4509
224ce89b 4510 if (whostr == "*" || whostr == "all" || whostr == "any") {
7c673cae
FG
4511 ss << "osds ";
4512 int c = 0;
4513 for (int i = 0; i < osdmap.get_max_osd(); i++)
4514 if (osdmap.is_up(i)) {
4515 ss << (c++ ? "," : "") << i;
4516 mon->try_send_message(new MOSDScrub(osdmap.get_fsid(),
4517 pvec.back() == "repair",
4518 pvec.back() == "deep-scrub"),
4519 osdmap.get_inst(i));
4520 }
4521 r = 0;
4522 ss << " instructed to " << pvec.back();
4523 } else {
4524 long osd = parse_osd_id(whostr.c_str(), &ss);
4525 if (osd < 0) {
4526 r = -EINVAL;
4527 } else if (osdmap.is_up(osd)) {
4528 mon->try_send_message(new MOSDScrub(osdmap.get_fsid(),
4529 pvec.back() == "repair",
4530 pvec.back() == "deep-scrub"),
4531 osdmap.get_inst(osd));
4532 ss << "osd." << osd << " instructed to " << pvec.back();
4533 } else {
4534 ss << "osd." << osd << " is not up";
4535 r = -EAGAIN;
4536 }
4537 }
4538 } else if (prefix == "osd lspools") {
4539 int64_t auid;
4540 cmd_getval(g_ceph_context, cmdmap, "auid", auid, int64_t(0));
4541 if (f)
4542 f->open_array_section("pools");
4543 for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
4544 p != osdmap.pools.end();
4545 ++p) {
4546 if (!auid || p->second.auid == (uint64_t)auid) {
4547 if (f) {
4548 f->open_object_section("pool");
4549 f->dump_int("poolnum", p->first);
4550 f->dump_string("poolname", osdmap.pool_name[p->first]);
4551 f->close_section();
4552 } else {
4553 ds << p->first << ' ' << osdmap.pool_name[p->first] << ',';
4554 }
4555 }
4556 }
4557 if (f) {
4558 f->close_section();
4559 f->flush(ds);
4560 }
4561 rdata.append(ds);
4562 } else if (prefix == "osd blacklist ls") {
4563 if (f)
4564 f->open_array_section("blacklist");
4565
4566 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
4567 p != osdmap.blacklist.end();
4568 ++p) {
4569 if (f) {
4570 f->open_object_section("entry");
4571 f->dump_stream("addr") << p->first;
4572 f->dump_stream("until") << p->second;
4573 f->close_section();
4574 } else {
4575 stringstream ss;
4576 string s;
4577 ss << p->first << " " << p->second;
4578 getline(ss, s);
4579 s += "\n";
4580 rdata.append(s);
4581 }
4582 }
4583 if (f) {
4584 f->close_section();
4585 f->flush(rdata);
4586 }
4587 ss << "listed " << osdmap.blacklist.size() << " entries";
4588
4589 } else if (prefix == "osd pool ls") {
4590 string detail;
4591 cmd_getval(g_ceph_context, cmdmap, "detail", detail);
4592 if (!f && detail == "detail") {
4593 ostringstream ss;
4594 osdmap.print_pools(ss);
4595 rdata.append(ss.str());
4596 } else {
4597 if (f)
4598 f->open_array_section("pools");
4599 for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
4600 it != osdmap.get_pools().end();
4601 ++it) {
4602 if (f) {
4603 if (detail == "detail") {
4604 f->open_object_section("pool");
4605 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
4606 it->second.dump(f.get());
4607 f->close_section();
4608 } else {
4609 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
4610 }
4611 } else {
4612 rdata.append(osdmap.get_pool_name(it->first) + "\n");
4613 }
4614 }
4615 if (f) {
4616 f->close_section();
4617 f->flush(rdata);
4618 }
4619 }
4620
4621 } else if (prefix == "osd crush get-tunable") {
4622 string tunable;
4623 cmd_getval(g_ceph_context, cmdmap, "tunable", tunable);
4624 ostringstream rss;
4625 if (f)
4626 f->open_object_section("tunable");
4627 if (tunable == "straw_calc_version") {
4628 if (f)
4629 f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
4630 else
4631 rss << osdmap.crush->get_straw_calc_version() << "\n";
4632 } else {
4633 r = -EINVAL;
4634 goto reply;
4635 }
4636 if (f) {
4637 f->close_section();
4638 f->flush(rdata);
4639 } else {
4640 rdata.append(rss.str());
4641 }
4642 r = 0;
4643
4644 } else if (prefix == "osd pool get") {
4645 string poolstr;
4646 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
4647 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
4648 if (pool < 0) {
4649 ss << "unrecognized pool '" << poolstr << "'";
4650 r = -ENOENT;
4651 goto reply;
4652 }
4653
4654 const pg_pool_t *p = osdmap.get_pg_pool(pool);
4655 string var;
4656 cmd_getval(g_ceph_context, cmdmap, "var", var);
4657
4658 typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
4659 const choices_map_t ALL_CHOICES = {
4660 {"size", SIZE},
4661 {"min_size", MIN_SIZE},
4662 {"crash_replay_interval", CRASH_REPLAY_INTERVAL},
4663 {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
4664 {"crush_rule", CRUSH_RULE},
7c673cae
FG
4665 {"hashpspool", HASHPSPOOL}, {"nodelete", NODELETE},
4666 {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
4667 {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
4668 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
4669 {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
4670 {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
4671 {"use_gmt_hitset", USE_GMT_HITSET},
4672 {"auid", AUID}, {"target_max_objects", TARGET_MAX_OBJECTS},
4673 {"target_max_bytes", TARGET_MAX_BYTES},
4674 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
4675 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
4676 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
4677 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
4678 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
4679 {"erasure_code_profile", ERASURE_CODE_PROFILE},
4680 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
4681 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
4682 {"fast_read", FAST_READ},
4683 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
4684 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
4685 {"scrub_min_interval", SCRUB_MIN_INTERVAL},
4686 {"scrub_max_interval", SCRUB_MAX_INTERVAL},
4687 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
4688 {"recovery_priority", RECOVERY_PRIORITY},
4689 {"recovery_op_priority", RECOVERY_OP_PRIORITY},
4690 {"scrub_priority", SCRUB_PRIORITY},
4691 {"compression_mode", COMPRESSION_MODE},
4692 {"compression_algorithm", COMPRESSION_ALGORITHM},
4693 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
4694 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
4695 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
4696 {"csum_type", CSUM_TYPE},
4697 {"csum_max_block", CSUM_MAX_BLOCK},
4698 {"csum_min_block", CSUM_MIN_BLOCK},
4699 };
4700
4701 typedef std::set<osd_pool_get_choices> choices_set_t;
4702
4703 const choices_set_t ONLY_TIER_CHOICES = {
4704 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
4705 TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
4706 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
4707 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
4708 MIN_READ_RECENCY_FOR_PROMOTE,
c07f9fc5 4709 MIN_WRITE_RECENCY_FOR_PROMOTE,
7c673cae
FG
4710 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
4711 };
4712 const choices_set_t ONLY_ERASURE_CHOICES = {
4713 ERASURE_CODE_PROFILE
4714 };
4715
4716 choices_set_t selected_choices;
4717 if (var == "all") {
4718 for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
4719 it != ALL_CHOICES.end(); ++it) {
4720 selected_choices.insert(it->second);
4721 }
4722
4723 if(!p->is_tier()) {
4724 selected_choices = subtract_second_from_first(selected_choices,
4725 ONLY_TIER_CHOICES);
4726 }
4727
4728 if(!p->is_erasure()) {
4729 selected_choices = subtract_second_from_first(selected_choices,
4730 ONLY_ERASURE_CHOICES);
4731 }
4732 } else /* var != "all" */ {
4733 choices_map_t::const_iterator found = ALL_CHOICES.find(var);
4734 osd_pool_get_choices selected = found->second;
4735
4736 if (!p->is_tier() &&
4737 ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
4738 ss << "pool '" << poolstr
4739 << "' is not a tier pool: variable not applicable";
4740 r = -EACCES;
4741 goto reply;
4742 }
4743
4744 if (!p->is_erasure() &&
4745 ONLY_ERASURE_CHOICES.find(selected)
4746 != ONLY_ERASURE_CHOICES.end()) {
4747 ss << "pool '" << poolstr
4748 << "' is not a erasure pool: variable not applicable";
4749 r = -EACCES;
4750 goto reply;
4751 }
4752
94b18763
FG
4753 if (pool_opts_t::is_opt_name(var) &&
4754 !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
4755 ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
4756 r = -ENOENT;
4757 goto reply;
4758 }
4759
7c673cae
FG
4760 selected_choices.insert(selected);
4761 }
4762
4763 if (f) {
94b18763
FG
4764 f->open_object_section("pool");
4765 f->dump_string("pool", poolstr);
4766 f->dump_int("pool_id", pool);
7c673cae
FG
4767 for(choices_set_t::const_iterator it = selected_choices.begin();
4768 it != selected_choices.end(); ++it) {
4769 choices_map_t::const_iterator i;
c07f9fc5
FG
4770 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
4771 if (i->second == *it) {
4772 break;
4773 }
4774 }
4775 assert(i != ALL_CHOICES.end());
7c673cae
FG
4776 switch(*it) {
4777 case PG_NUM:
4778 f->dump_int("pg_num", p->get_pg_num());
4779 break;
4780 case PGP_NUM:
4781 f->dump_int("pgp_num", p->get_pgp_num());
4782 break;
4783 case AUID:
4784 f->dump_int("auid", p->get_auid());
4785 break;
4786 case SIZE:
4787 f->dump_int("size", p->get_size());
4788 break;
4789 case MIN_SIZE:
4790 f->dump_int("min_size", p->get_min_size());
4791 break;
4792 case CRASH_REPLAY_INTERVAL:
4793 f->dump_int("crash_replay_interval",
4794 p->get_crash_replay_interval());
4795 break;
4796 case CRUSH_RULE:
31f18b77 4797 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
7c673cae 4798 f->dump_string("crush_rule", osdmap.crush->get_rule_name(
31f18b77 4799 p->get_crush_rule()));
7c673cae 4800 } else {
31f18b77 4801 f->dump_string("crush_rule", stringify(p->get_crush_rule()));
7c673cae
FG
4802 }
4803 break;
7c673cae
FG
4804 case HASHPSPOOL:
4805 case NODELETE:
4806 case NOPGCHANGE:
4807 case NOSIZECHANGE:
4808 case WRITE_FADVISE_DONTNEED:
4809 case NOSCRUB:
4810 case NODEEP_SCRUB:
94b18763
FG
4811 f->dump_bool(i->first.c_str(),
4812 p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
7c673cae
FG
4813 break;
4814 case HIT_SET_PERIOD:
4815 f->dump_int("hit_set_period", p->hit_set_period);
4816 break;
4817 case HIT_SET_COUNT:
4818 f->dump_int("hit_set_count", p->hit_set_count);
4819 break;
4820 case HIT_SET_TYPE:
4821 f->dump_string("hit_set_type",
4822 HitSet::get_type_name(p->hit_set_params.get_type()));
4823 break;
4824 case HIT_SET_FPP:
4825 {
4826 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
4827 BloomHitSet::Params *bloomp =
4828 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
4829 f->dump_float("hit_set_fpp", bloomp->get_fpp());
4830 } else if(var != "all") {
4831 f->close_section();
4832 ss << "hit set is not of type Bloom; " <<
4833 "invalid to get a false positive rate!";
4834 r = -EINVAL;
4835 goto reply;
4836 }
4837 }
4838 break;
4839 case USE_GMT_HITSET:
4840 f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
4841 break;
4842 case TARGET_MAX_OBJECTS:
4843 f->dump_unsigned("target_max_objects", p->target_max_objects);
4844 break;
4845 case TARGET_MAX_BYTES:
4846 f->dump_unsigned("target_max_bytes", p->target_max_bytes);
4847 break;
4848 case CACHE_TARGET_DIRTY_RATIO:
4849 f->dump_unsigned("cache_target_dirty_ratio_micro",
4850 p->cache_target_dirty_ratio_micro);
4851 f->dump_float("cache_target_dirty_ratio",
4852 ((float)p->cache_target_dirty_ratio_micro/1000000));
4853 break;
4854 case CACHE_TARGET_DIRTY_HIGH_RATIO:
4855 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
4856 p->cache_target_dirty_high_ratio_micro);
4857 f->dump_float("cache_target_dirty_high_ratio",
4858 ((float)p->cache_target_dirty_high_ratio_micro/1000000));
4859 break;
4860 case CACHE_TARGET_FULL_RATIO:
4861 f->dump_unsigned("cache_target_full_ratio_micro",
4862 p->cache_target_full_ratio_micro);
4863 f->dump_float("cache_target_full_ratio",
4864 ((float)p->cache_target_full_ratio_micro/1000000));
4865 break;
4866 case CACHE_MIN_FLUSH_AGE:
4867 f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
4868 break;
4869 case CACHE_MIN_EVICT_AGE:
4870 f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
4871 break;
4872 case ERASURE_CODE_PROFILE:
4873 f->dump_string("erasure_code_profile", p->erasure_code_profile);
4874 break;
4875 case MIN_READ_RECENCY_FOR_PROMOTE:
4876 f->dump_int("min_read_recency_for_promote",
4877 p->min_read_recency_for_promote);
4878 break;
4879 case MIN_WRITE_RECENCY_FOR_PROMOTE:
4880 f->dump_int("min_write_recency_for_promote",
4881 p->min_write_recency_for_promote);
4882 break;
4883 case FAST_READ:
4884 f->dump_int("fast_read", p->fast_read);
4885 break;
4886 case HIT_SET_GRADE_DECAY_RATE:
4887 f->dump_int("hit_set_grade_decay_rate",
4888 p->hit_set_grade_decay_rate);
4889 break;
4890 case HIT_SET_SEARCH_LAST_N:
4891 f->dump_int("hit_set_search_last_n",
4892 p->hit_set_search_last_n);
4893 break;
4894 case SCRUB_MIN_INTERVAL:
4895 case SCRUB_MAX_INTERVAL:
4896 case DEEP_SCRUB_INTERVAL:
4897 case RECOVERY_PRIORITY:
4898 case RECOVERY_OP_PRIORITY:
4899 case SCRUB_PRIORITY:
4900 case COMPRESSION_MODE:
4901 case COMPRESSION_ALGORITHM:
4902 case COMPRESSION_REQUIRED_RATIO:
4903 case COMPRESSION_MAX_BLOB_SIZE:
4904 case COMPRESSION_MIN_BLOB_SIZE:
4905 case CSUM_TYPE:
4906 case CSUM_MAX_BLOCK:
4907 case CSUM_MIN_BLOCK:
c07f9fc5
FG
4908 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
4909 if (p->opts.is_set(key)) {
c07f9fc5
FG
4910 if(*it == CSUM_TYPE) {
4911 int val;
4912 p->opts.get(pool_opts_t::CSUM_TYPE, &val);
4913 f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
4914 } else {
4915 p->opts.dump(i->first, f.get());
4916 }
94b18763 4917 }
7c673cae
FG
4918 break;
4919 }
7c673cae 4920 }
94b18763
FG
4921 f->close_section();
4922 f->flush(rdata);
7c673cae
FG
4923 } else /* !f */ {
4924 for(choices_set_t::const_iterator it = selected_choices.begin();
4925 it != selected_choices.end(); ++it) {
4926 choices_map_t::const_iterator i;
4927 switch(*it) {
4928 case PG_NUM:
4929 ss << "pg_num: " << p->get_pg_num() << "\n";
4930 break;
4931 case PGP_NUM:
4932 ss << "pgp_num: " << p->get_pgp_num() << "\n";
4933 break;
4934 case AUID:
4935 ss << "auid: " << p->get_auid() << "\n";
4936 break;
4937 case SIZE:
4938 ss << "size: " << p->get_size() << "\n";
4939 break;
4940 case MIN_SIZE:
4941 ss << "min_size: " << p->get_min_size() << "\n";
4942 break;
4943 case CRASH_REPLAY_INTERVAL:
4944 ss << "crash_replay_interval: " <<
4945 p->get_crash_replay_interval() << "\n";
4946 break;
4947 case CRUSH_RULE:
31f18b77 4948 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
7c673cae 4949 ss << "crush_rule: " << osdmap.crush->get_rule_name(
31f18b77 4950 p->get_crush_rule()) << "\n";
7c673cae 4951 } else {
31f18b77 4952 ss << "crush_rule: " << p->get_crush_rule() << "\n";
7c673cae
FG
4953 }
4954 break;
7c673cae
FG
4955 case HIT_SET_PERIOD:
4956 ss << "hit_set_period: " << p->hit_set_period << "\n";
4957 break;
4958 case HIT_SET_COUNT:
4959 ss << "hit_set_count: " << p->hit_set_count << "\n";
4960 break;
4961 case HIT_SET_TYPE:
4962 ss << "hit_set_type: " <<
4963 HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
4964 break;
4965 case HIT_SET_FPP:
4966 {
4967 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
4968 BloomHitSet::Params *bloomp =
4969 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
4970 ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
4971 } else if(var != "all") {
4972 ss << "hit set is not of type Bloom; " <<
4973 "invalid to get a false positive rate!";
4974 r = -EINVAL;
4975 goto reply;
4976 }
4977 }
4978 break;
4979 case USE_GMT_HITSET:
4980 ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
4981 break;
4982 case TARGET_MAX_OBJECTS:
4983 ss << "target_max_objects: " << p->target_max_objects << "\n";
4984 break;
4985 case TARGET_MAX_BYTES:
4986 ss << "target_max_bytes: " << p->target_max_bytes << "\n";
4987 break;
4988 case CACHE_TARGET_DIRTY_RATIO:
4989 ss << "cache_target_dirty_ratio: "
4990 << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
4991 break;
4992 case CACHE_TARGET_DIRTY_HIGH_RATIO:
4993 ss << "cache_target_dirty_high_ratio: "
4994 << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
4995 break;
4996 case CACHE_TARGET_FULL_RATIO:
4997 ss << "cache_target_full_ratio: "
4998 << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
4999 break;
5000 case CACHE_MIN_FLUSH_AGE:
5001 ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
5002 break;
5003 case CACHE_MIN_EVICT_AGE:
5004 ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
5005 break;
5006 case ERASURE_CODE_PROFILE:
5007 ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
5008 break;
5009 case MIN_READ_RECENCY_FOR_PROMOTE:
5010 ss << "min_read_recency_for_promote: " <<
5011 p->min_read_recency_for_promote << "\n";
5012 break;
5013 case HIT_SET_GRADE_DECAY_RATE:
5014 ss << "hit_set_grade_decay_rate: " <<
5015 p->hit_set_grade_decay_rate << "\n";
5016 break;
5017 case HIT_SET_SEARCH_LAST_N:
5018 ss << "hit_set_search_last_n: " <<
5019 p->hit_set_search_last_n << "\n";
5020 break;
5021 case HASHPSPOOL:
5022 case NODELETE:
5023 case NOPGCHANGE:
5024 case NOSIZECHANGE:
5025 case WRITE_FADVISE_DONTNEED:
5026 case NOSCRUB:
5027 case NODEEP_SCRUB:
5028 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
5029 if (i->second == *it)
5030 break;
5031 }
5032 assert(i != ALL_CHOICES.end());
5033 ss << i->first << ": " <<
5034 (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
5035 "true" : "false") << "\n";
5036 break;
5037 case MIN_WRITE_RECENCY_FOR_PROMOTE:
5038 ss << "min_write_recency_for_promote: " <<
5039 p->min_write_recency_for_promote << "\n";
5040 break;
5041 case FAST_READ:
5042 ss << "fast_read: " << p->fast_read << "\n";
5043 break;
5044 case SCRUB_MIN_INTERVAL:
5045 case SCRUB_MAX_INTERVAL:
5046 case DEEP_SCRUB_INTERVAL:
5047 case RECOVERY_PRIORITY:
5048 case RECOVERY_OP_PRIORITY:
5049 case SCRUB_PRIORITY:
5050 case COMPRESSION_MODE:
5051 case COMPRESSION_ALGORITHM:
5052 case COMPRESSION_REQUIRED_RATIO:
5053 case COMPRESSION_MAX_BLOB_SIZE:
5054 case COMPRESSION_MIN_BLOB_SIZE:
5055 case CSUM_TYPE:
5056 case CSUM_MAX_BLOCK:
5057 case CSUM_MIN_BLOCK:
5058 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
5059 if (i->second == *it)
5060 break;
5061 }
5062 assert(i != ALL_CHOICES.end());
5063 {
5064 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
5065 if (p->opts.is_set(key)) {
5066 if(key == pool_opts_t::CSUM_TYPE) {
5067 int val;
5068 p->opts.get(key, &val);
5069 ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
5070 } else {
5071 ss << i->first << ": " << p->opts.get(key) << "\n";
5072 }
5073 }
5074 }
5075 break;
5076 }
5077 rdata.append(ss.str());
5078 ss.str("");
5079 }
5080 }
5081 r = 0;
5082 } else if (prefix == "osd pool stats") {
31f18b77
FG
5083 r = mon->pgservice->process_pg_command(prefix, cmdmap,
5084 osdmap, f.get(), &ss, &rdata);
7c673cae
FG
5085 } else if (prefix == "osd pool get-quota") {
5086 string pool_name;
5087 cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
5088
5089 int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
5090 if (poolid < 0) {
5091 assert(poolid == -ENOENT);
5092 ss << "unrecognized pool '" << pool_name << "'";
5093 r = -ENOENT;
5094 goto reply;
5095 }
5096 const pg_pool_t *p = osdmap.get_pg_pool(poolid);
5097
5098 if (f) {
5099 f->open_object_section("pool_quotas");
5100 f->dump_string("pool_name", pool_name);
5101 f->dump_unsigned("pool_id", poolid);
5102 f->dump_unsigned("quota_max_objects", p->quota_max_objects);
5103 f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
5104 f->close_section();
5105 f->flush(rdata);
5106 } else {
5107 stringstream rs;
5108 rs << "quotas for pool '" << pool_name << "':\n"
5109 << " max objects: ";
5110 if (p->quota_max_objects == 0)
5111 rs << "N/A";
5112 else
5113 rs << si_t(p->quota_max_objects) << " objects";
5114 rs << "\n"
5115 << " max bytes : ";
5116 if (p->quota_max_bytes == 0)
5117 rs << "N/A";
5118 else
5119 rs << si_t(p->quota_max_bytes) << "B";
5120 rdata.append(rs.str());
5121 }
5122 rdata.append("\n");
5123 r = 0;
5124 } else if (prefix == "osd crush rule list" ||
5125 prefix == "osd crush rule ls") {
c07f9fc5
FG
5126 if (f) {
5127 f->open_array_section("rules");
5128 osdmap.crush->list_rules(f.get());
5129 f->close_section();
5130 f->flush(rdata);
5131 } else {
5132 ostringstream ss;
5133 osdmap.crush->list_rules(&ss);
5134 rdata.append(ss.str());
5135 }
b5b8bbf5
FG
5136 } else if (prefix == "osd crush rule ls-by-class") {
5137 string class_name;
5138 cmd_getval(g_ceph_context, cmdmap, "class", class_name);
5139 if (class_name.empty()) {
5140 ss << "no class specified";
5141 r = -EINVAL;
5142 goto reply;
5143 }
5144 set<int> rules;
5145 r = osdmap.crush->get_rules_by_class(class_name, &rules);
5146 if (r < 0) {
5147 ss << "failed to get rules by class '" << class_name << "'";
5148 goto reply;
5149 }
5150 if (f) {
5151 f->open_array_section("rules");
5152 for (auto &rule: rules) {
5153 f->dump_string("name", osdmap.crush->get_rule_name(rule));
5154 }
5155 f->close_section();
5156 f->flush(rdata);
5157 } else {
5158 ostringstream rs;
5159 for (auto &rule: rules) {
5160 rs << osdmap.crush->get_rule_name(rule) << "\n";
5161 }
5162 rdata.append(rs.str());
5163 }
7c673cae
FG
5164 } else if (prefix == "osd crush rule dump") {
5165 string name;
5166 cmd_getval(g_ceph_context, cmdmap, "name", name);
5167 string format;
5168 cmd_getval(g_ceph_context, cmdmap, "format", format);
5169 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5170 if (name == "") {
5171 f->open_array_section("rules");
5172 osdmap.crush->dump_rules(f.get());
5173 f->close_section();
5174 } else {
5175 int ruleno = osdmap.crush->get_rule_id(name);
5176 if (ruleno < 0) {
31f18b77 5177 ss << "unknown crush rule '" << name << "'";
7c673cae
FG
5178 r = ruleno;
5179 goto reply;
5180 }
5181 osdmap.crush->dump_rule(ruleno, f.get());
5182 }
5183 ostringstream rs;
5184 f->flush(rs);
5185 rs << "\n";
5186 rdata.append(rs.str());
5187 } else if (prefix == "osd crush dump") {
5188 string format;
5189 cmd_getval(g_ceph_context, cmdmap, "format", format);
5190 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5191 f->open_object_section("crush_map");
5192 osdmap.crush->dump(f.get());
5193 f->close_section();
5194 ostringstream rs;
5195 f->flush(rs);
5196 rs << "\n";
5197 rdata.append(rs.str());
5198 } else if (prefix == "osd crush show-tunables") {
5199 string format;
5200 cmd_getval(g_ceph_context, cmdmap, "format", format);
5201 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5202 f->open_object_section("crush_map_tunables");
5203 osdmap.crush->dump_tunables(f.get());
5204 f->close_section();
5205 ostringstream rs;
5206 f->flush(rs);
5207 rs << "\n";
5208 rdata.append(rs.str());
5209 } else if (prefix == "osd crush tree") {
c07f9fc5
FG
5210 string shadow;
5211 cmd_getval(g_ceph_context, cmdmap, "shadow", shadow);
5212 bool show_shadow = shadow == "--show-shadow";
5213 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5214 if (f) {
5215 osdmap.crush->dump_tree(nullptr,
5216 f.get(),
5217 osdmap.get_pool_names(),
5218 show_shadow);
5219 f->flush(rdata);
5220 } else {
5221 ostringstream ss;
5222 osdmap.crush->dump_tree(&ss,
5223 nullptr,
5224 osdmap.get_pool_names(),
5225 show_shadow);
5226 rdata.append(ss.str());
5227 }
d2e6a577
FG
5228 } else if (prefix == "osd crush ls") {
5229 string name;
5230 if (!cmd_getval(g_ceph_context, cmdmap, "node", name)) {
5231 ss << "no node specified";
5232 r = -EINVAL;
5233 goto reply;
5234 }
5235 if (!osdmap.crush->name_exists(name)) {
5236 ss << "node '" << name << "' does not exist";
5237 r = -ENOENT;
5238 goto reply;
5239 }
5240 int id = osdmap.crush->get_item_id(name);
5241 list<int> result;
5242 if (id >= 0) {
5243 result.push_back(id);
5244 } else {
5245 int num = osdmap.crush->get_bucket_size(id);
5246 for (int i = 0; i < num; ++i) {
5247 result.push_back(osdmap.crush->get_bucket_item(id, i));
5248 }
5249 }
5250 if (f) {
5251 f->open_array_section("items");
5252 for (auto i : result) {
5253 f->dump_string("item", osdmap.crush->get_item_name(i));
5254 }
5255 f->close_section();
5256 f->flush(rdata);
5257 } else {
5258 ostringstream ss;
5259 for (auto i : result) {
5260 ss << osdmap.crush->get_item_name(i) << "\n";
5261 }
5262 rdata.append(ss.str());
5263 }
5264 r = 0;
7c673cae
FG
5265 } else if (prefix == "osd crush class ls") {
5266 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5267 f->open_array_section("crush_classes");
5268 for (auto i : osdmap.crush->class_name)
5269 f->dump_string("class", i.second);
5270 f->close_section();
5271 f->flush(rdata);
224ce89b
WB
5272 } else if (prefix == "osd crush class ls-osd") {
5273 string name;
5274 cmd_getval(g_ceph_context, cmdmap, "class", name);
224ce89b
WB
5275 set<int> osds;
5276 osdmap.crush->get_devices_by_class(name, &osds);
b5b8bbf5
FG
5277 if (f) {
5278 f->open_array_section("osds");
5279 for (auto &osd: osds)
5280 f->dump_int("osd", osd);
5281 f->close_section();
5282 f->flush(rdata);
5283 } else {
5284 bool first = true;
5285 for (auto &osd : osds) {
5286 if (!first)
5287 ds << "\n";
5288 first = false;
5289 ds << osd;
5290 }
5291 rdata.append(ds);
5292 }
7c673cae
FG
5293 } else if (prefix == "osd erasure-code-profile ls") {
5294 const auto &profiles = osdmap.get_erasure_code_profiles();
5295 if (f)
5296 f->open_array_section("erasure-code-profiles");
5297 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
5298 if (f)
5299 f->dump_string("profile", i->first.c_str());
5300 else
5301 rdata.append(i->first + "\n");
5302 }
5303 if (f) {
5304 f->close_section();
5305 ostringstream rs;
5306 f->flush(rs);
5307 rs << "\n";
5308 rdata.append(rs.str());
5309 }
c07f9fc5
FG
5310 } else if (prefix == "osd crush weight-set ls") {
5311 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5312 if (f) {
5313 f->open_array_section("weight_sets");
5314 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
5315 f->dump_string("pool", "(compat)");
5316 }
5317 for (auto& i : osdmap.crush->choose_args) {
5318 if (i.first >= 0) {
5319 f->dump_string("pool", osdmap.get_pool_name(i.first));
5320 }
5321 }
5322 f->close_section();
5323 f->flush(rdata);
5324 } else {
5325 ostringstream rs;
5326 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
5327 rs << "(compat)\n";
5328 }
5329 for (auto& i : osdmap.crush->choose_args) {
5330 if (i.first >= 0) {
5331 rs << osdmap.get_pool_name(i.first) << "\n";
5332 }
5333 }
5334 rdata.append(rs.str());
5335 }
5336 } else if (prefix == "osd crush weight-set dump") {
5337 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
5338 "json-pretty"));
5339 osdmap.crush->dump_choose_args(f.get());
5340 f->flush(rdata);
7c673cae
FG
5341 } else if (prefix == "osd erasure-code-profile get") {
5342 string name;
5343 cmd_getval(g_ceph_context, cmdmap, "name", name);
5344 if (!osdmap.has_erasure_code_profile(name)) {
5345 ss << "unknown erasure code profile '" << name << "'";
5346 r = -ENOENT;
5347 goto reply;
5348 }
5349 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
5350 if (f)
5351 f->open_object_section("profile");
5352 for (map<string,string>::const_iterator i = profile.begin();
5353 i != profile.end();
5354 ++i) {
5355 if (f)
5356 f->dump_string(i->first.c_str(), i->second.c_str());
5357 else
5358 rdata.append(i->first + "=" + i->second + "\n");
5359 }
5360 if (f) {
5361 f->close_section();
5362 ostringstream rs;
5363 f->flush(rs);
5364 rs << "\n";
5365 rdata.append(rs.str());
5366 }
181888fb
FG
5367 } else if (prefix == "osd pool application get") {
5368 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
5369 "json-pretty"));
5370 string pool_name;
5371 cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
5372 string app;
5373 cmd_getval(g_ceph_context, cmdmap, "app", app);
5374 string key;
5375 cmd_getval(g_ceph_context, cmdmap, "key", key);
5376
5377 if (pool_name.empty()) {
5378 // all
5379 f->open_object_section("pools");
5380 for (const auto &pool : osdmap.pools) {
5381 std::string name("<unknown>");
5382 const auto &pni = osdmap.pool_name.find(pool.first);
5383 if (pni != osdmap.pool_name.end())
5384 name = pni->second;
5385 f->open_object_section(name.c_str());
5386 for (auto &app_pair : pool.second.application_metadata) {
5387 f->open_object_section(app_pair.first.c_str());
5388 for (auto &kv_pair : app_pair.second) {
5389 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
5390 }
5391 f->close_section();
5392 }
5393 f->close_section(); // name
5394 }
5395 f->close_section(); // pools
5396 f->flush(rdata);
5397 } else {
5398 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
5399 if (pool < 0) {
5400 ss << "unrecognized pool '" << pool_name << "'";
5401 r = -ENOENT;
5402 goto reply;
5403 }
5404 auto p = osdmap.get_pg_pool(pool);
5405 // filter by pool
5406 if (app.empty()) {
5407 f->open_object_section(pool_name.c_str());
5408 for (auto &app_pair : p->application_metadata) {
5409 f->open_object_section(app_pair.first.c_str());
5410 for (auto &kv_pair : app_pair.second) {
5411 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
5412 }
5413 f->close_section(); // application
5414 }
5415 f->close_section(); // pool_name
5416 f->flush(rdata);
5417 goto reply;
5418 }
5419
5420 auto app_it = p->application_metadata.find(app);
5421 if (app_it == p->application_metadata.end()) {
5422 ss << "pool '" << pool_name << "' has no application '" << app << "'";
5423 r = -ENOENT;
5424 goto reply;
5425 }
5426 // filter by pool + app
5427 if (key.empty()) {
5428 f->open_object_section(app_it->first.c_str());
5429 for (auto &kv_pair : app_it->second) {
5430 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
5431 }
5432 f->close_section(); // application
5433 f->flush(rdata);
5434 goto reply;
5435 }
5436 // filter by pool + app + key
5437 auto key_it = app_it->second.find(key);
5438 if (key_it == app_it->second.end()) {
5439 ss << "application '" << app << "' on pool '" << pool_name
5440 << "' does not have key '" << key << "'";
5441 r = -ENOENT;
5442 goto reply;
5443 }
5444 ss << key_it->second << "\n";
5445 rdata.append(ss.str());
5446 ss.str("");
5447 }
7c673cae
FG
5448 } else {
5449 // try prepare update
5450 return false;
5451 }
5452
5453 reply:
5454 string rs;
5455 getline(ss, rs);
5456 mon->reply_command(op, r, rs, rdata, get_last_committed());
5457 return true;
5458}
5459
3efd9988
FG
5460void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
5461{
5462 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
5463 osdmap.get_pg_pool(pool_id));
5464 assert(pool);
5465 pool->set_flag(flags);
5466}
5467
5468void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
7c673cae 5469{
3efd9988
FG
5470 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
5471 osdmap.get_pg_pool(pool_id));
5472 assert(pool);
5473 pool->unset_flag(flags);
7c673cae
FG
5474}
5475
// Reconcile each pool's quota-full flags with the latest pool statistics.
// Sets FLAG_FULL_NO_QUOTA|FLAG_FULL on pools that have reached their byte
// or object quota, and clears those flags once a pool drops back under
// quota.  Returns true if any pool flags were changed (i.e. pending_inc
// was modified and a new map epoch should be proposed).
bool OSDMonitor::update_pools_status()
{
  // pool stats come from the pg service; bail out until they are readable
  if (!mon->pgservice->is_readable())
    return false;

  bool ret = false;  // set to true whenever we touch any pool's flags

  auto& pools = osdmap.get_pools();
  for (auto it = pools.begin(); it != pools.end(); ++it) {
    const pool_stat_t *pstat = mon->pgservice->get_pool_stat(it->first);
    if (!pstat)
      continue;  // no stats yet for this pool; skip it this round
    const object_stat_sum_t& sum = pstat->stats.sum;
    const pg_pool_t &pool = it->second;
    const string& pool_name = osdmap.get_pool_name(it->first);

    // a pool is quota-full when either configured quota is reached;
    // a quota of 0 means "no quota configured"
    bool pool_is_full =
      (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
      (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);

    if (pool.has_flag(pg_pool_t::FLAG_FULL_NO_QUOTA)) {
      // pool was previously marked full because of its quota
      if (pool_is_full)
	continue;  // still over quota; flags already correct

      mon->clog->info() << "pool '" << pool_name
                        << "' no longer out of quota; removing NO_QUOTA flag";
      // below we cancel FLAG_FULL too, we'll set it again in
      // OSDMonitor::encode_pending if it still fails the osd-full checking.
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_FULL_NO_QUOTA | pg_pool_t::FLAG_FULL);
      ret = true;
    } else {
      if (!pool_is_full)
	continue;  // under quota and not flagged; nothing to do

      // log which quota threshold was reached
      if (pool.quota_max_bytes > 0 &&
	  (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
	mon->clog->warn() << "pool '" << pool_name << "' is full"
			  << " (reached quota's max_bytes: "
			  << si_t(pool.quota_max_bytes) << ")";
      }
      if (pool.quota_max_objects > 0 &&
	  (uint64_t)sum.num_objects >= pool.quota_max_objects) {
	mon->clog->warn() << "pool '" << pool_name << "' is full"
			  << " (reached quota's max_objects: "
			  << pool.quota_max_objects << ")";
      }
      // set both FLAG_FULL_NO_QUOTA and FLAG_FULL
      // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
      // since FLAG_FULL should always take precedence
      set_pool_flags(it->first,
		     pg_pool_t::FLAG_FULL_NO_QUOTA | pg_pool_t::FLAG_FULL);
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_NEARFULL |
		       pg_pool_t::FLAG_BACKFILLFULL);
      ret = true;
    }
  }
  return ret;
}
5536
7c673cae
FG
5537int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
5538{
5539 op->mark_osdmon_event(__func__);
5540 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
5541 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
5542 MonSession *session = m->get_session();
5543 if (!session)
5544 return -EPERM;
5545 string erasure_code_profile;
5546 stringstream ss;
31f18b77 5547 string rule_name;
94b18763 5548 int ret = 0;
7c673cae 5549 if (m->auid)
94b18763 5550 ret = prepare_new_pool(m->name, m->auid, m->crush_rule, rule_name,
7c673cae
FG
5551 0, 0,
5552 erasure_code_profile,
5553 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, &ss);
5554 else
94b18763 5555 ret = prepare_new_pool(m->name, session->auid, m->crush_rule, rule_name,
7c673cae
FG
5556 0, 0,
5557 erasure_code_profile,
5558 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, &ss);
94b18763
FG
5559
5560 if (ret < 0) {
5561 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
5562 }
5563 return ret;
7c673cae
FG
5564}
5565
5566int OSDMonitor::crush_rename_bucket(const string& srcname,
5567 const string& dstname,
5568 ostream *ss)
5569{
5570 int ret;
5571 //
5572 // Avoid creating a pending crush if it does not already exists and
5573 // the rename would fail.
5574 //
5575 if (!_have_pending_crush()) {
5576 ret = _get_stable_crush().can_rename_bucket(srcname,
5577 dstname,
5578 ss);
5579 if (ret)
5580 return ret;
5581 }
5582
5583 CrushWrapper newcrush;
5584 _get_pending_crush(newcrush);
5585
5586 ret = newcrush.rename_bucket(srcname,
5587 dstname,
5588 ss);
5589 if (ret)
5590 return ret;
5591
5592 pending_inc.crush.clear();
5593 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
5594 *ss << "renamed bucket " << srcname << " into " << dstname;
5595 return 0;
5596}
5597
5598void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
5599{
5600 string replacement = "";
5601
5602 if (plugin == "jerasure_generic" ||
5603 plugin == "jerasure_sse3" ||
5604 plugin == "jerasure_sse4" ||
5605 plugin == "jerasure_neon") {
5606 replacement = "jerasure";
5607 } else if (plugin == "shec_generic" ||
5608 plugin == "shec_sse3" ||
5609 plugin == "shec_sse4" ||
5610 plugin == "shec_neon") {
5611 replacement = "shec";
5612 }
5613
5614 if (replacement != "") {
5615 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
5616 << plugin << " that has been deprecated. Please use "
5617 << replacement << " instead." << dendl;
5618 }
5619}
5620
5621int OSDMonitor::normalize_profile(const string& profilename,
5622 ErasureCodeProfile &profile,
5623 bool force,
5624 ostream *ss)
5625{
5626 ErasureCodeInterfaceRef erasure_code;
5627 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
5628 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
5629 check_legacy_ec_plugin(plugin->second, profilename);
5630 int err = instance.factory(plugin->second,
5631 g_conf->get_val<std::string>("erasure_code_dir"),
5632 profile, &erasure_code, ss);
5633 if (err) {
5634 return err;
5635 }
5636
5637 err = erasure_code->init(profile, ss);
5638 if (err) {
5639 return err;
5640 }
5641
5642 auto it = profile.find("stripe_unit");
5643 if (it != profile.end()) {
5644 string err_str;
5645 uint32_t stripe_unit = strict_si_cast<uint32_t>(it->second.c_str(), &err_str);
5646 if (!err_str.empty()) {
5647 *ss << "could not parse stripe_unit '" << it->second
5648 << "': " << err_str << std::endl;
5649 return -EINVAL;
5650 }
5651 uint32_t data_chunks = erasure_code->get_data_chunk_count();
5652 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
5653 if (chunk_size != stripe_unit) {
5654 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
5655 << "alignment. Would be padded to " << chunk_size
5656 << std::endl;
5657 return -EINVAL;
5658 }
5659 if ((stripe_unit % 4096) != 0 && !force) {
5660 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
5661 << "use --force to override this check" << std::endl;
5662 return -EINVAL;
5663 }
5664 }
5665 return 0;
5666}
5667
31f18b77 5668int OSDMonitor::crush_rule_create_erasure(const string &name,
7c673cae 5669 const string &profile,
31f18b77 5670 int *rule,
7c673cae
FG
5671 ostream *ss)
5672{
5673 int ruleid = osdmap.crush->get_rule_id(name);
5674 if (ruleid != -ENOENT) {
31f18b77 5675 *rule = osdmap.crush->get_rule_mask_ruleset(ruleid);
7c673cae
FG
5676 return -EEXIST;
5677 }
5678
5679 CrushWrapper newcrush;
5680 _get_pending_crush(newcrush);
5681
5682 ruleid = newcrush.get_rule_id(name);
5683 if (ruleid != -ENOENT) {
31f18b77 5684 *rule = newcrush.get_rule_mask_ruleset(ruleid);
7c673cae
FG
5685 return -EALREADY;
5686 } else {
5687 ErasureCodeInterfaceRef erasure_code;
5688 int err = get_erasure_code(profile, &erasure_code, ss);
5689 if (err) {
5690 *ss << "failed to load plugin using profile " << profile << std::endl;
5691 return err;
5692 }
5693
224ce89b 5694 err = erasure_code->create_rule(name, newcrush, ss);
7c673cae
FG
5695 erasure_code.reset();
5696 if (err < 0)
5697 return err;
31f18b77 5698 *rule = err;
7c673cae
FG
5699 pending_inc.crush.clear();
5700 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
5701 return 0;
5702 }
5703}
5704
5705int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
5706 ErasureCodeInterfaceRef *erasure_code,
5707 ostream *ss) const
5708{
5709 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
5710 return -EAGAIN;
5711 ErasureCodeProfile profile =
5712 osdmap.get_erasure_code_profile(erasure_code_profile);
5713 ErasureCodeProfile::const_iterator plugin =
5714 profile.find("plugin");
5715 if (plugin == profile.end()) {
5716 *ss << "cannot determine the erasure code plugin"
5717 << " because there is no 'plugin' entry in the erasure_code_profile "
5718 << profile << std::endl;
5719 return -EINVAL;
5720 }
5721 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
5722 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
5723 return instance.factory(plugin->second,
5724 g_conf->get_val<std::string>("erasure_code_dir"),
5725 profile, erasure_code, ss);
5726}
5727
5728int OSDMonitor::check_cluster_features(uint64_t features,
5729 stringstream &ss)
5730{
5731 stringstream unsupported_ss;
5732 int unsupported_count = 0;
5733 if ((mon->get_quorum_con_features() & features) != features) {
5734 unsupported_ss << "the monitor cluster";
5735 ++unsupported_count;
5736 }
5737
5738 set<int32_t> up_osds;
5739 osdmap.get_up_osds(up_osds);
5740 for (set<int32_t>::iterator it = up_osds.begin();
5741 it != up_osds.end(); ++it) {
5742 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
5743 if ((xi.features & features) != features) {
5744 if (unsupported_count > 0)
5745 unsupported_ss << ", ";
5746 unsupported_ss << "osd." << *it;
5747 unsupported_count ++;
5748 }
5749 }
5750
5751 if (unsupported_count > 0) {
5752 ss << "features " << features << " unsupported by: "
5753 << unsupported_ss.str();
5754 return -ENOTSUP;
5755 }
5756
5757 // check pending osd state, too!
5758 for (map<int32_t,osd_xinfo_t>::const_iterator p =
5759 pending_inc.new_xinfo.begin();
5760 p != pending_inc.new_xinfo.end(); ++p) {
5761 const osd_xinfo_t &xi = p->second;
5762 if ((xi.features & features) != features) {
5763 dout(10) << __func__ << " pending osd." << p->first
5764 << " features are insufficient; retry" << dendl;
5765 return -EAGAIN;
5766 }
5767 }
5768
5769 return 0;
5770}
5771
5772bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
5773 stringstream& ss)
5774{
5775 OSDMap::Incremental new_pending = pending_inc;
5776 ::encode(*newcrush, new_pending.crush, mon->get_quorum_con_features());
5777 OSDMap newmap;
5778 newmap.deepish_copy_from(osdmap);
5779 newmap.apply_incremental(new_pending);
5780
5781 // client compat
31f18b77 5782 if (newmap.require_min_compat_client > 0) {
7c673cae 5783 auto mv = newmap.get_min_compat_client();
31f18b77
FG
5784 if (mv > newmap.require_min_compat_client) {
5785 ss << "new crush map requires client version " << ceph_release_name(mv)
7c673cae 5786 << " but require_min_compat_client is "
31f18b77 5787 << ceph_release_name(newmap.require_min_compat_client);
7c673cae
FG
5788 return false;
5789 }
5790 }
5791
5792 // osd compat
5793 uint64_t features =
5794 newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
5795 newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
5796 stringstream features_ss;
5797 int r = check_cluster_features(features, features_ss);
5798 if (r) {
5799 ss << "Could not change CRUSH: " << features_ss.str();
5800 return false;
5801 }
5802
5803 return true;
5804}
5805
5806bool OSDMonitor::erasure_code_profile_in_use(
5807 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
5808 const string &profile,
5809 ostream *ss)
5810{
5811 bool found = false;
5812 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
5813 p != pools.end();
5814 ++p) {
5815 if (p->second.erasure_code_profile == profile) {
5816 *ss << osdmap.pool_name[p->first] << " ";
5817 found = true;
5818 }
5819 }
5820 if (found) {
5821 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
5822 }
5823 return found;
5824}
5825
// Build the final key/value map for a new erasure-code profile.
//
// Starts from the cluster-wide default profile (a JSON map in
// osd_pool_default_erasure_code_profile) and overlays the user-supplied
// "key[=value]" tokens.  If the user names a different plugin than the
// default, the defaults are discarded entirely and only the user's
// key/value pairs are kept.
//
// @param erasure_code_profile      user-supplied "key[=value]" strings
// @param erasure_code_profile_map  out: merged profile map
// @param ss                        human readable error message, if any
// @return 0 on success, negative errno on failure
int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
					   map<string,string> *erasure_code_profile_map,
					   ostream *ss)
{
  // Seed the result with the configured default profile (JSON -> str map).
  int r = get_json_str_map(g_conf->osd_pool_default_erasure_code_profile,
		           *ss,
		           erasure_code_profile_map);
  if (r)
    return r;
  // The default profile is expected to always name a plugin.
  assert((*erasure_code_profile_map).count("plugin"));
  string default_plugin = (*erasure_code_profile_map)["plugin"];
  // user_map tracks only the user-provided pairs so we can detect a
  // plugin override below.
  map<string,string> user_map;
  for (vector<string>::const_iterator i = erasure_code_profile.begin();
       i != erasure_code_profile.end();
       ++i) {
    size_t equal = i->find('=');
    if (equal == string::npos) {
      // Bare key with no '=': store an empty value.
      user_map[*i] = string();
      (*erasure_code_profile_map)[*i] = string();
    } else {
      string key = i->substr(0, equal);
      equal++;
      const string value = i->substr(equal);
      // Post-luminous, legacy "ruleset-*" properties were renamed to
      // "crush-*".  Either rewrite the key (with a cluster-log warning)
      // or reject it, depending on configuration.
      if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
	  key.find("ruleset-") == 0) {
	if (g_conf->get_val<bool>("mon_fixup_legacy_erasure_code_profiles")) {
	  mon->clog->warn() << "erasure code profile property '" << key
			    << "' is no longer supported; try "
			    << "'crush-" << key.substr(8) << "' instead";
	  key = string("crush-") + key.substr(8);
	} else {
	  *ss << "property '" << key << "' is no longer supported; try "
	      << "'crush-" << key.substr(8) << "' instead";
	  return -EINVAL;
	}
      }
      user_map[key] = value;
      (*erasure_code_profile_map)[key] = value;
    }
  }

  // A different plugin invalidates the defaults: keep only the user's pairs.
  if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
    (*erasure_code_profile_map) = user_map;

  return 0;
}
5872
5873int OSDMonitor::prepare_pool_size(const unsigned pool_type,
5874 const string &erasure_code_profile,
5875 unsigned *size, unsigned *min_size,
5876 ostream *ss)
5877{
5878 int err = 0;
5879 switch (pool_type) {
5880 case pg_pool_t::TYPE_REPLICATED:
5881 *size = g_conf->osd_pool_default_size;
5882 *min_size = g_conf->get_osd_pool_default_min_size();
5883 break;
5884 case pg_pool_t::TYPE_ERASURE:
5885 {
5886 ErasureCodeInterfaceRef erasure_code;
5887 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
5888 if (err == 0) {
5889 *size = erasure_code->get_chunk_count();
5890 *min_size = MIN(erasure_code->get_data_chunk_count() + 1, *size);
5891 }
5892 }
5893 break;
5894 default:
5895 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
5896 err = -EINVAL;
5897 break;
5898 }
5899 return err;
5900}
5901
5902int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
5903 const string &erasure_code_profile,
5904 uint32_t *stripe_width,
5905 ostream *ss)
5906{
5907 int err = 0;
5908 switch (pool_type) {
5909 case pg_pool_t::TYPE_REPLICATED:
5910 // ignored
5911 break;
5912 case pg_pool_t::TYPE_ERASURE:
5913 {
5914 ErasureCodeProfile profile =
5915 osdmap.get_erasure_code_profile(erasure_code_profile);
5916 ErasureCodeInterfaceRef erasure_code;
5917 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
5918 if (err)
5919 break;
5920 uint32_t data_chunks = erasure_code->get_data_chunk_count();
5921 uint32_t stripe_unit = g_conf->osd_pool_erasure_code_stripe_unit;
5922 auto it = profile.find("stripe_unit");
5923 if (it != profile.end()) {
5924 string err_str;
5925 stripe_unit = strict_si_cast<uint32_t>(it->second.c_str(), &err_str);
5926 assert(err_str.empty());
5927 }
5928 *stripe_width = data_chunks *
5929 erasure_code->get_chunk_size(stripe_unit * data_chunks);
5930 }
5931 break;
5932 default:
5933 *ss << "prepare_pool_stripe_width: "
5934 << pool_type << " is not a known pool type";
5935 err = -EINVAL;
5936 break;
5937 }
5938 return err;
5939}
5940
// Resolve (or create) the CRUSH rule for a new pool.
//
// If *crush_rule < 0 the rule is chosen/created according to the pool type;
// otherwise the given rule id is merely validated against the osdmap.
//
// @param pool_type            TYPE_REPLICATED or TYPE_ERASURE
// @param erasure_code_profile profile used to create an erasure rule
// @param rule_name            rule to look up; "" selects the default
// @param crush_rule           in/out: rule id (<0 means "pick for me")
// @param ss                   human readable error message, if any
// @return 0 on success, -EAGAIN if the caller must wait for a proposal,
//         or another negative errno on failure.
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  // Use default rule
	  *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(g_ceph_context);
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
		<< "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	// Create the rule (or find it pending/committed) and remap the
	// result: both "just created" (0) and "pending" (-EALREADY) mean
	// the caller must wait for the proposal (-EAGAIN), while an
	// already-committed rule (-EEXIST) is success.
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	switch (err) {
	case -EALREADY:
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
      break;
    }
  } else {
    // Caller supplied an explicit rule id; just verify it exists.
    if (!osdmap.crush->ruleset_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
6002
31f18b77 6003int OSDMonitor::get_crush_rule(const string &rule_name,
224ce89b
WB
6004 int *crush_rule,
6005 ostream *ss)
7c673cae
FG
6006{
6007 int ret;
31f18b77 6008 ret = osdmap.crush->get_rule_id(rule_name);
7c673cae
FG
6009 if (ret != -ENOENT) {
6010 // found it, use it
31f18b77 6011 *crush_rule = ret;
7c673cae
FG
6012 } else {
6013 CrushWrapper newcrush;
6014 _get_pending_crush(newcrush);
6015
31f18b77 6016 ret = newcrush.get_rule_id(rule_name);
7c673cae
FG
6017 if (ret != -ENOENT) {
6018 // found it, wait for it to be proposed
31f18b77 6019 dout(20) << __func__ << ": rule " << rule_name
7c673cae
FG
6020 << " try again" << dendl;
6021 return -EAGAIN;
6022 } else {
224ce89b 6023 // Cannot find it , return error
31f18b77 6024 *ss << "specified rule " << rule_name << " doesn't exist";
7c673cae
FG
6025 return ret;
6026 }
6027 }
6028 return 0;
6029}
6030
3efd9988
FG
6031int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, ostream *ss)
6032{
6033 auto max_pgs_per_osd = g_conf->get_val<uint64_t>("mon_max_pg_per_osd");
6034 auto num_osds = std::max(osdmap.get_num_in_osds(), 3u); // assume min cluster size 3
6035 auto max_pgs = max_pgs_per_osd * num_osds;
6036 uint64_t projected = 0;
6037 if (pool < 0) {
6038 projected += pg_num * size;
6039 }
6040 for (const auto& i : osdmap.get_pools()) {
6041 if (i.first == pool) {
6042 projected += pg_num * size;
6043 } else {
6044 projected += i.second.get_pg_num() * i.second.get_size();
6045 }
6046 }
6047 if (projected > max_pgs) {
6048 if (pool >= 0) {
6049 *ss << "pool id " << pool;
6050 }
6051 *ss << " pg_num " << pg_num << " size " << size
6052 << " would mean " << projected
6053 << " total pgs, which exceeds max " << max_pgs
6054 << " (mon_max_pg_per_osd " << max_pgs_per_osd
6055 << " * num_in_osds " << num_osds << ")";
6056 return -ERANGE;
6057 }
6058 return 0;
6059}
6060
7c673cae
FG
/**
 * Validate all parameters for a new pool and stage it in pending_inc.
 *
 * The validation steps run in a fixed order: argument sanity, crush rule
 * resolution (may return -EAGAIN while a rule proposal is in flight),
 * optional crush smoke test, size/pg budget/stripe-width derivation, and
 * finally allocation of a new pool id in the pending incremental.
 *
 * @param name The name of the new pool
 * @param auid The auid of the pool owner. Can be -1
 * @param crush_rule The crush rule to use. If <0, will use the system default
 * @param crush_rule_name The crush rule to use, if crush_rulset <0
 * @param pg_num The pg_num to use. If set to 0, will use the system default
 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
 * @param pool_type TYPE_ERASURE, or TYPE_REP
 * @param expected_num_objects expected number of objects on the pool
 * @param fast_read fast read type.
 * @param ss human readable error message, if any.
 *
 * @return 0 on success, negative errno on failure.
 */
int OSDMonitor::prepare_new_pool(string& name, uint64_t auid,
				 int crush_rule,
				 const string &crush_rule_name,
				 unsigned pg_num, unsigned pgp_num,
				 const string &erasure_code_profile,
                                 const unsigned pool_type,
                                 const uint64_t expected_num_objects,
                                 FastReadType fast_read,
				 ostream *ss)
{
  if (name.length() == 0)
    return -EINVAL;
  // Fill unspecified pg counts from config defaults.
  if (pg_num == 0)
    pg_num = g_conf->osd_pool_default_pg_num;
  if (pgp_num == 0)
    pgp_num = g_conf->osd_pool_default_pgp_num;
  if (pg_num > (unsigned)g_conf->mon_max_pool_pg_num) {
    *ss << "'pg_num' must be greater than 0 and less than or equal to "
        << g_conf->mon_max_pool_pg_num
        << " (you may adjust 'mon max pool pg num' for higher values)";
    return -ERANGE;
  }
  if (pgp_num > pg_num) {
    *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
        << ", which in this case is " << pg_num;
    return -ERANGE;
  }
  // fast_read is only meaningful for erasure-coded pools.
  if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
    *ss << "'fast_read' can only apply to erasure coding pool";
    return -EINVAL;
  }
  int r;
  // May return -EAGAIN while a crush rule proposal is pending.
  r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
			      crush_rule_name, &crush_rule, ss);
  if (r) {
    dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
    return r;
  }
  // Optionally dry-run the rule through the crush tester (in a forked
  // child, bounded by the mon lease) before accepting it.
  if (g_conf->mon_osd_crush_smoke_test) {
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    ostringstream err;
    CrushTester tester(newcrush, err);
    tester.set_min_x(0);
    tester.set_max_x(50);
    tester.set_rule(crush_rule);
    auto start = ceph::coarse_mono_clock::now();
    r = tester.test_with_fork(g_conf->mon_lease);
    auto duration = ceph::coarse_mono_clock::now() - start;
    if (r < 0) {
      dout(10) << "tester.test_with_fork returns " << r
	       << ": " << err.str() << dendl;
      *ss << "crush test failed with " << r << ": " << err.str();
      return r;
    }
    dout(10) << __func__ << " crush smoke test duration: "
             << duration << dendl;
  }
  unsigned size, min_size;
  r = prepare_pool_size(pool_type, erasure_code_profile, &size, &min_size, ss);
  if (r) {
    dout(10) << "prepare_pool_size returns " << r << dendl;
    return r;
  }
  // pool id -1: budget-check this as a brand-new pool.
  r = check_pg_num(-1, pg_num, size, ss);
  if (r) {
    dout(10) << "check_pg_num returns " << r << dendl;
    return r;
  }

  if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) {
    return -EINVAL;
  }

  uint32_t stripe_width = 0;
  r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
  if (r) {
    dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
    return r;
  }

  // Resolve the effective fast_read flag for erasure pools.
  bool fread = false;
  if (pool_type == pg_pool_t::TYPE_ERASURE) {
    switch (fast_read) {
      case FAST_READ_OFF:
        fread = false;
        break;
      case FAST_READ_ON:
        fread = true;
        break;
      case FAST_READ_DEFAULT:
        fread = g_conf->mon_osd_pool_ec_fast_read;
        break;
      default:
        *ss << "invalid fast_read setting: " << fast_read;
        return -EINVAL;
    }
  }

  // If a pool with this name is already staged in this epoch, treat the
  // request as an idempotent success.
  for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
       p != pending_inc.new_pool_names.end();
       ++p) {
    if (p->second == name)
      return 0;
  }

  // Allocate the next pool id and populate the pending pg_pool_t from
  // config defaults plus the values derived above.
  if (-1 == pending_inc.new_pool_max)
    pending_inc.new_pool_max = osdmap.pool_max;
  int64_t pool = ++pending_inc.new_pool_max;
  pg_pool_t empty;
  pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
  pi->type = pool_type;
  pi->fast_read = fread;
  pi->flags = g_conf->osd_pool_default_flags;
  if (g_conf->osd_pool_default_flag_hashpspool)
    pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
  if (g_conf->osd_pool_default_flag_nodelete)
    pi->set_flag(pg_pool_t::FLAG_NODELETE);
  if (g_conf->osd_pool_default_flag_nopgchange)
    pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
  if (g_conf->osd_pool_default_flag_nosizechange)
    pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
  // GMT hitsets require every up OSD to support the feature.
  if (g_conf->osd_pool_use_gmt_hitset &&
      (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT))
    pi->use_gmt_hitset = true;
  else
    pi->use_gmt_hitset = false;

  pi->size = size;
  pi->min_size = min_size;
  pi->crush_rule = crush_rule;
  pi->expected_num_objects = expected_num_objects;
  pi->object_hash = CEPH_STR_HASH_RJENKINS;
  pi->set_pg_num(pg_num);
  pi->set_pgp_num(pgp_num);
  pi->last_change = pending_inc.epoch;
  pi->auid = auid;
  pi->erasure_code_profile = erasure_code_profile;
  pi->stripe_width = stripe_width;
  // Cache-tiering ratios are stored in micro units (millionths).
  pi->cache_target_dirty_ratio_micro =
    g_conf->osd_pool_default_cache_target_dirty_ratio * 1000000;
  pi->cache_target_dirty_high_ratio_micro =
    g_conf->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
  pi->cache_target_full_ratio_micro =
    g_conf->osd_pool_default_cache_target_full_ratio * 1000000;
  pi->cache_min_flush_age = g_conf->osd_pool_default_cache_min_flush_age;
  pi->cache_min_evict_age = g_conf->osd_pool_default_cache_min_evict_age;
  pending_inc.new_pool_names[pool] = name;
  return 0;
}
6226
6227bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
6228{
6229 op->mark_osdmon_event(__func__);
6230 ostringstream ss;
6231 if (pending_inc.new_flags < 0)
6232 pending_inc.new_flags = osdmap.get_flags();
6233 pending_inc.new_flags |= flag;
6234 ss << OSDMap::get_flag_string(flag) << " is set";
6235 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
6236 get_last_committed() + 1));
6237 return true;
6238}
6239
6240bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
6241{
6242 op->mark_osdmon_event(__func__);
6243 ostringstream ss;
6244 if (pending_inc.new_flags < 0)
6245 pending_inc.new_flags = osdmap.get_flags();
6246 pending_inc.new_flags &= ~flag;
6247 ss << OSDMap::get_flag_string(flag) << " is unset";
6248 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
6249 get_last_committed() + 1));
6250 return true;
6251}
6252
7c673cae
FG
6253int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
6254 stringstream& ss)
6255{
6256 string poolstr;
6257 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
6258 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
6259 if (pool < 0) {
6260 ss << "unrecognized pool '" << poolstr << "'";
6261 return -ENOENT;
6262 }
6263 string var;
6264 cmd_getval(g_ceph_context, cmdmap, "var", var);
6265
6266 pg_pool_t p = *osdmap.get_pg_pool(pool);
6267 if (pending_inc.new_pools.count(pool))
6268 p = pending_inc.new_pools[pool];
6269
6270 // accept val as a json string in the normal case (current
6271 // generation monitor). parse out int or float values from the
6272 // string as needed. however, if it is not a string, try to pull
6273 // out an int, in case an older monitor with an older json schema is
6274 // forwarding a request.
6275 string val;
6276 string interr, floaterr;
6277 int64_t n = 0;
6278 double f = 0;
6279 int64_t uf = 0; // micro-f
6280 if (!cmd_getval(g_ceph_context, cmdmap, "val", val)) {
6281 // wasn't a string; maybe an older mon forwarded json with an int?
6282 if (!cmd_getval(g_ceph_context, cmdmap, "val", n))
6283 return -EINVAL; // no value!
6284 } else {
6285 // we got a string. see if it contains an int.
6286 n = strict_strtoll(val.c_str(), 10, &interr);
6287 // or a float
6288 f = strict_strtod(val.c_str(), &floaterr);
6289 uf = llrintl(f * (double)1000000.0);
6290 }
6291
6292 if (!p.is_tier() &&
6293 (var == "hit_set_type" || var == "hit_set_period" ||
6294 var == "hit_set_count" || var == "hit_set_fpp" ||
6295 var == "target_max_objects" || var == "target_max_bytes" ||
6296 var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
6297 var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
6298 var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
6299 var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
6300 var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
6301 return -EACCES;
6302 }
6303
6304 if (var == "size") {
6305 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
6306 ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
6307 return -EPERM;
6308 }
6309 if (p.type == pg_pool_t::TYPE_ERASURE) {
6310 ss << "can not change the size of an erasure-coded pool";
6311 return -ENOTSUP;
6312 }
6313 if (interr.length()) {
6314 ss << "error parsing integer value '" << val << "': " << interr;
6315 return -EINVAL;
6316 }
6317 if (n <= 0 || n > 10) {
6318 ss << "pool size must be between 1 and 10";
6319 return -EINVAL;
6320 }
3efd9988
FG
6321 int r = check_pg_num(pool, p.get_pg_num(), n, &ss);
6322 if (r < 0) {
6323 return r;
6324 }
7c673cae
FG
6325 p.size = n;
6326 if (n < p.min_size)
6327 p.min_size = n;
6328 } else if (var == "min_size") {
6329 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
6330 ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
6331 return -EPERM;
6332 }
6333 if (interr.length()) {
6334 ss << "error parsing integer value '" << val << "': " << interr;
6335 return -EINVAL;
6336 }
6337
6338 if (p.type != pg_pool_t::TYPE_ERASURE) {
6339 if (n < 1 || n > p.size) {
6340 ss << "pool min_size must be between 1 and " << (int)p.size;
6341 return -EINVAL;
6342 }
6343 } else {
6344 ErasureCodeInterfaceRef erasure_code;
6345 int k;
6346 stringstream tmp;
6347 int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
6348 if (err == 0) {
6349 k = erasure_code->get_data_chunk_count();
6350 } else {
b32b8144 6351 ss << __func__ << " get_erasure_code failed: " << tmp.str();
7c673cae
FG
6352 return err;
6353 }
6354
6355 if (n < k || n > p.size) {
6356 ss << "pool min_size must be between " << k << " and " << (int)p.size;
6357 return -EINVAL;
6358 }
6359 }
6360 p.min_size = n;
6361 } else if (var == "auid") {
6362 if (interr.length()) {
6363 ss << "error parsing integer value '" << val << "': " << interr;
6364 return -EINVAL;
6365 }
6366 p.auid = n;
6367 } else if (var == "crash_replay_interval") {
6368 if (interr.length()) {
6369 ss << "error parsing integer value '" << val << "': " << interr;
6370 return -EINVAL;
6371 }
6372 p.crash_replay_interval = n;
6373 } else if (var == "pg_num") {
6374 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
6375 ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
6376 return -EPERM;
6377 }
6378 if (interr.length()) {
6379 ss << "error parsing integer value '" << val << "': " << interr;
6380 return -EINVAL;
6381 }
6382 if (n <= (int)p.get_pg_num()) {
6383 ss << "specified pg_num " << n << " <= current " << p.get_pg_num();
6384 if (n < (int)p.get_pg_num())
6385 return -EEXIST;
6386 return 0;
6387 }
c07f9fc5
FG
6388 if (n > (unsigned)g_conf->mon_max_pool_pg_num) {
6389 ss << "'pg_num' must be greater than 0 and less than or equal to "
6390 << g_conf->mon_max_pool_pg_num
6391 << " (you may adjust 'mon max pool pg num' for higher values)";
6392 return -ERANGE;
6393 }
3efd9988
FG
6394 int r = check_pg_num(pool, n, p.get_size(), &ss);
6395 if (r) {
6396 return r;
6397 }
7c673cae
FG
6398 string force;
6399 cmd_getval(g_ceph_context,cmdmap, "force", force);
6400 if (p.cache_mode != pg_pool_t::CACHEMODE_NONE &&
6401 force != "--yes-i-really-mean-it") {
6402 ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
6403 return -EPERM;
6404 }
6405 int expected_osds = MIN(p.get_pg_num(), osdmap.get_num_osds());
6406 int64_t new_pgs = n - p.get_pg_num();
6407 if (new_pgs > g_conf->mon_osd_max_split_count * expected_osds) {
6408 ss << "specified pg_num " << n << " is too large (creating "
6409 << new_pgs << " new PGs on ~" << expected_osds
6410 << " OSDs exceeds per-OSD max of " << g_conf->mon_osd_max_split_count
6411 << ')';
6412 return -E2BIG;
6413 }
6414 p.set_pg_num(n);
6415 // force pre-luminous clients to resend their ops, since they
6416 // don't understand that split PGs now form a new interval.
6417 p.last_force_op_resend_preluminous = pending_inc.epoch;
6418 } else if (var == "pgp_num") {
6419 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
6420 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
6421 return -EPERM;
6422 }
6423 if (interr.length()) {
6424 ss << "error parsing integer value '" << val << "': " << interr;
6425 return -EINVAL;
6426 }
6427 if (n <= 0) {
6428 ss << "specified pgp_num must > 0, but you set to " << n;
6429 return -EINVAL;
6430 }
6431 if (n > (int)p.get_pg_num()) {
6432 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
6433 return -EINVAL;
6434 }
6435 p.set_pgp_num(n);
6436 } else if (var == "crush_rule") {
6437 int id = osdmap.crush->get_rule_id(val);
6438 if (id == -ENOENT) {
6439 ss << "crush rule " << val << " does not exist";
6440 return -ENOENT;
6441 }
6442 if (id < 0) {
6443 ss << cpp_strerror(id);
6444 return -ENOENT;
6445 }
6446 if (!osdmap.crush->check_crush_rule(id, p.get_type(), p.get_size(), ss)) {
6447 return -EINVAL;
6448 }
31f18b77 6449 p.crush_rule = id;
7c673cae
FG
6450 } else if (var == "nodelete" || var == "nopgchange" ||
6451 var == "nosizechange" || var == "write_fadvise_dontneed" ||
6452 var == "noscrub" || var == "nodeep-scrub") {
6453 uint64_t flag = pg_pool_t::get_flag_by_name(var);
6454 // make sure we only compare against 'n' if we didn't receive a string
6455 if (val == "true" || (interr.empty() && n == 1)) {
6456 p.set_flag(flag);
6457 } else if (val == "false" || (interr.empty() && n == 0)) {
6458 p.unset_flag(flag);
6459 } else {
6460 ss << "expecting value 'true', 'false', '0', or '1'";
6461 return -EINVAL;
6462 }
6463 } else if (var == "hashpspool") {
6464 uint64_t flag = pg_pool_t::get_flag_by_name(var);
6465 string force;
6466 cmd_getval(g_ceph_context, cmdmap, "force", force);
6467 if (force != "--yes-i-really-mean-it") {
6468 ss << "are you SURE? this will remap all placement groups in this pool,"
6469 " this triggers large data movement,"
6470 " pass --yes-i-really-mean-it if you really do.";
6471 return -EPERM;
6472 }
6473 // make sure we only compare against 'n' if we didn't receive a string
6474 if (val == "true" || (interr.empty() && n == 1)) {
6475 p.set_flag(flag);
6476 } else if (val == "false" || (interr.empty() && n == 0)) {
6477 p.unset_flag(flag);
6478 } else {
6479 ss << "expecting value 'true', 'false', '0', or '1'";
6480 return -EINVAL;
6481 }
6482 } else if (var == "hit_set_type") {
6483 if (val == "none")
6484 p.hit_set_params = HitSet::Params();
6485 else {
6486 int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
6487 if (err)
6488 return err;
6489 if (val == "bloom") {
6490 BloomHitSet::Params *bsp = new BloomHitSet::Params;
6491 bsp->set_fpp(g_conf->osd_pool_default_hit_set_bloom_fpp);
6492 p.hit_set_params = HitSet::Params(bsp);
6493 } else if (val == "explicit_hash")
6494 p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
6495 else if (val == "explicit_object")
6496 p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
6497 else {
6498 ss << "unrecognized hit_set type '" << val << "'";
6499 return -EINVAL;
6500 }
6501 }
6502 } else if (var == "hit_set_period") {
6503 if (interr.length()) {
6504 ss << "error parsing integer value '" << val << "': " << interr;
6505 return -EINVAL;
6506 }
6507 p.hit_set_period = n;
6508 } else if (var == "hit_set_count") {
6509 if (interr.length()) {
6510 ss << "error parsing integer value '" << val << "': " << interr;
6511 return -EINVAL;
6512 }
6513 p.hit_set_count = n;
6514 } else if (var == "hit_set_fpp") {
6515 if (floaterr.length()) {
6516 ss << "error parsing floating point value '" << val << "': " << floaterr;
6517 return -EINVAL;
6518 }
6519 if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
6520 ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
6521 return -EINVAL;
6522 }
6523 BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
6524 bloomp->set_fpp(f);
6525 } else if (var == "use_gmt_hitset") {
6526 if (val == "true" || (interr.empty() && n == 1)) {
3efd9988
FG
6527 string force;
6528 cmd_getval(g_ceph_context, cmdmap, "force", force);
6529 if (!osdmap.get_num_up_osds() && force != "--yes-i-really-mean-it") {
6530 ss << "Not advisable to continue since no OSDs are up. Pass "
6531 << "--yes-i-really-mean-it if you really wish to continue.";
6532 return -EPERM;
6533 }
6534 if (!(osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT)
6535 && force != "--yes-i-really-mean-it") {
7c673cae
FG
6536 ss << "not all OSDs support GMT hit set.";
6537 return -EINVAL;
6538 }
6539 p.use_gmt_hitset = true;
6540 } else {
6541 ss << "expecting value 'true' or '1'";
6542 return -EINVAL;
6543 }
6544 } else if (var == "allow_ec_overwrites") {
6545 if (!p.is_erasure()) {
6546 ss << "ec overwrites can only be enabled for an erasure coded pool";
6547 return -EINVAL;
6548 }
224ce89b
WB
6549 stringstream err;
6550 if (!g_conf->mon_debug_no_require_bluestore_for_ec_overwrites &&
6551 !is_pool_currently_all_bluestore(pool, p, &err)) {
6552 ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
6553 return -EINVAL;
6554 }
7c673cae
FG
6555 if (val == "true" || (interr.empty() && n == 1)) {
6556 p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
6557 } else if (val == "false" || (interr.empty() && n == 0)) {
6558 ss << "ec overwrites cannot be disabled once enabled";
6559 return -EINVAL;
6560 } else {
6561 ss << "expecting value 'true', 'false', '0', or '1'";
6562 return -EINVAL;
6563 }
7c673cae
FG
6564 } else if (var == "target_max_objects") {
6565 if (interr.length()) {
6566 ss << "error parsing int '" << val << "': " << interr;
6567 return -EINVAL;
6568 }
6569 p.target_max_objects = n;
6570 } else if (var == "target_max_bytes") {
6571 if (interr.length()) {
6572 ss << "error parsing int '" << val << "': " << interr;
6573 return -EINVAL;
6574 }
6575 p.target_max_bytes = n;
6576 } else if (var == "cache_target_dirty_ratio") {
6577 if (floaterr.length()) {
6578 ss << "error parsing float '" << val << "': " << floaterr;
6579 return -EINVAL;
6580 }
6581 if (f < 0 || f > 1.0) {
6582 ss << "value must be in the range 0..1";
6583 return -ERANGE;
6584 }
6585 p.cache_target_dirty_ratio_micro = uf;
6586 } else if (var == "cache_target_dirty_high_ratio") {
6587 if (floaterr.length()) {
6588 ss << "error parsing float '" << val << "': " << floaterr;
6589 return -EINVAL;
6590 }
6591 if (f < 0 || f > 1.0) {
6592 ss << "value must be in the range 0..1";
6593 return -ERANGE;
6594 }
6595 p.cache_target_dirty_high_ratio_micro = uf;
6596 } else if (var == "cache_target_full_ratio") {
6597 if (floaterr.length()) {
6598 ss << "error parsing float '" << val << "': " << floaterr;
6599 return -EINVAL;
6600 }
6601 if (f < 0 || f > 1.0) {
6602 ss << "value must be in the range 0..1";
6603 return -ERANGE;
6604 }
6605 p.cache_target_full_ratio_micro = uf;
6606 } else if (var == "cache_min_flush_age") {
6607 if (interr.length()) {
6608 ss << "error parsing int '" << val << "': " << interr;
6609 return -EINVAL;
6610 }
6611 p.cache_min_flush_age = n;
6612 } else if (var == "cache_min_evict_age") {
6613 if (interr.length()) {
6614 ss << "error parsing int '" << val << "': " << interr;
6615 return -EINVAL;
6616 }
6617 p.cache_min_evict_age = n;
6618 } else if (var == "min_read_recency_for_promote") {
6619 if (interr.length()) {
6620 ss << "error parsing integer value '" << val << "': " << interr;
6621 return -EINVAL;
6622 }
6623 p.min_read_recency_for_promote = n;
6624 } else if (var == "hit_set_grade_decay_rate") {
6625 if (interr.length()) {
6626 ss << "error parsing integer value '" << val << "': " << interr;
6627 return -EINVAL;
6628 }
6629 if (n > 100 || n < 0) {
6630 ss << "value out of range,valid range is 0 - 100";
6631 return -EINVAL;
6632 }
6633 p.hit_set_grade_decay_rate = n;
6634 } else if (var == "hit_set_search_last_n") {
6635 if (interr.length()) {
6636 ss << "error parsing integer value '" << val << "': " << interr;
6637 return -EINVAL;
6638 }
6639 if (n > p.hit_set_count || n < 0) {
6640 ss << "value out of range,valid range is 0 - hit_set_count";
6641 return -EINVAL;
6642 }
6643 p.hit_set_search_last_n = n;
6644 } else if (var == "min_write_recency_for_promote") {
6645 if (interr.length()) {
6646 ss << "error parsing integer value '" << val << "': " << interr;
6647 return -EINVAL;
6648 }
6649 p.min_write_recency_for_promote = n;
6650 } else if (var == "fast_read") {
6651 if (p.is_replicated()) {
6652 ss << "fast read is not supported in replication pool";
6653 return -EINVAL;
6654 }
6655 if (val == "true" || (interr.empty() && n == 1)) {
6656 p.fast_read = true;
6657 } else if (val == "false" || (interr.empty() && n == 0)) {
6658 p.fast_read = false;
6659 } else {
6660 ss << "expecting value 'true', 'false', '0', or '1'";
6661 return -EINVAL;
6662 }
6663 } else if (pool_opts_t::is_opt_name(var)) {
224ce89b 6664 bool unset = val == "unset";
7c673cae 6665 if (var == "compression_mode") {
224ce89b
WB
6666 if (!unset) {
6667 auto cmode = Compressor::get_comp_mode_type(val);
6668 if (!cmode) {
6669 ss << "unrecognized compression mode '" << val << "'";
6670 return -EINVAL;
6671 }
7c673cae
FG
6672 }
6673 } else if (var == "compression_algorithm") {
224ce89b
WB
6674 if (!unset) {
6675 auto alg = Compressor::get_comp_alg_type(val);
6676 if (!alg) {
6677 ss << "unrecognized compression_algorithm '" << val << "'";
6678 return -EINVAL;
6679 }
7c673cae
FG
6680 }
6681 } else if (var == "compression_required_ratio") {
6682 if (floaterr.length()) {
6683 ss << "error parsing float value '" << val << "': " << floaterr;
6684 return -EINVAL;
6685 }
224ce89b 6686 if (f < 0 || f > 1) {
7c673cae 6687 ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
224ce89b 6688 return -EINVAL;
7c673cae
FG
6689 }
6690 } else if (var == "csum_type") {
224ce89b 6691 auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
7c673cae
FG
6692 if (t < 0 ) {
6693 ss << "unrecognized csum_type '" << val << "'";
224ce89b 6694 return -EINVAL;
7c673cae
FG
6695 }
6696 //preserve csum_type numeric value
6697 n = t;
6698 interr.clear();
6699 } else if (var == "compression_max_blob_size" ||
6700 var == "compression_min_blob_size" ||
6701 var == "csum_max_block" ||
6702 var == "csum_min_block") {
6703 if (interr.length()) {
6704 ss << "error parsing int value '" << val << "': " << interr;
6705 return -EINVAL;
6706 }
6707 }
6708
6709 pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
6710 switch (desc.type) {
6711 case pool_opts_t::STR:
224ce89b 6712 if (unset) {
7c673cae
FG
6713 p.opts.unset(desc.key);
6714 } else {
6715 p.opts.set(desc.key, static_cast<std::string>(val));
6716 }
6717 break;
6718 case pool_opts_t::INT:
6719 if (interr.length()) {
6720 ss << "error parsing integer value '" << val << "': " << interr;
6721 return -EINVAL;
6722 }
6723 if (n == 0) {
6724 p.opts.unset(desc.key);
6725 } else {
6726 p.opts.set(desc.key, static_cast<int>(n));
6727 }
6728 break;
6729 case pool_opts_t::DOUBLE:
6730 if (floaterr.length()) {
6731 ss << "error parsing floating point value '" << val << "': " << floaterr;
6732 return -EINVAL;
6733 }
6734 if (f == 0) {
6735 p.opts.unset(desc.key);
6736 } else {
6737 p.opts.set(desc.key, static_cast<double>(f));
6738 }
6739 break;
6740 default:
6741 assert(!"unknown type");
6742 }
6743 } else {
6744 ss << "unrecognized variable '" << var << "'";
6745 return -EINVAL;
6746 }
224ce89b
WB
6747 if (val != "unset") {
6748 ss << "set pool " << pool << " " << var << " to " << val;
6749 } else {
6750 ss << "unset pool " << pool << " " << var;
6751 }
7c673cae
FG
6752 p.last_change = pending_inc.epoch;
6753 pending_inc.new_pools[pool] = p;
6754 return 0;
6755}
6756
c07f9fc5
FG
6757int OSDMonitor::prepare_command_pool_application(const string &prefix,
6758 map<string,cmd_vartype> &cmdmap,
6759 stringstream& ss)
6760{
6761 string pool_name;
6762 cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
6763 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6764 if (pool < 0) {
6765 ss << "unrecognized pool '" << pool_name << "'";
6766 return -ENOENT;
6767 }
6768
6769 pg_pool_t p = *osdmap.get_pg_pool(pool);
6770 if (pending_inc.new_pools.count(pool)) {
6771 p = pending_inc.new_pools[pool];
6772 }
6773
6774 string app;
6775 cmd_getval(g_ceph_context, cmdmap, "app", app);
6776 bool app_exists = (p.application_metadata.count(app) > 0);
6777
6778 if (boost::algorithm::ends_with(prefix, "enable")) {
6779 if (app.empty()) {
6780 ss << "application name must be provided";
6781 return -EINVAL;
6782 }
6783
6784 if (p.is_tier()) {
6785 ss << "application must be enabled on base tier";
6786 return -EINVAL;
6787 }
6788
6789 string force;
6790 cmd_getval(g_ceph_context, cmdmap, "force", force);
6791
6792 if (!app_exists && !p.application_metadata.empty() &&
6793 force != "--yes-i-really-mean-it") {
6794 ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
6795 << "application; pass --yes-i-really-mean-it to proceed anyway";
6796 return -EPERM;
6797 }
6798
6799 if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
6800 ss << "too many enabled applications on pool '" << pool_name << "'; "
6801 << "max " << MAX_POOL_APPLICATIONS;
6802 return -EINVAL;
6803 }
6804
6805 if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
6806 ss << "application name '" << app << "' too long; max length "
6807 << MAX_POOL_APPLICATION_LENGTH;
6808 return -EINVAL;
6809 }
6810
6811 if (!app_exists) {
6812 p.application_metadata[app] = {};
6813 }
6814 ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
6815
6816 } else if (boost::algorithm::ends_with(prefix, "disable")) {
6817 string force;
6818 cmd_getval(g_ceph_context, cmdmap, "force", force);
6819
6820 if (force != "--yes-i-really-mean-it") {
6821 ss << "Are you SURE? Disabling an application within a pool might result "
6822 << "in loss of application functionality; pass "
6823 << "--yes-i-really-mean-it to proceed anyway";
6824 return -EPERM;
6825 }
6826
6827 if (!app_exists) {
6828 ss << "application '" << app << "' is not enabled on pool '" << pool_name
6829 << "'";
6830 return 0; // idempotent
6831 }
6832
6833 p.application_metadata.erase(app);
6834 ss << "disable application '" << app << "' on pool '" << pool_name << "'";
6835
6836 } else if (boost::algorithm::ends_with(prefix, "set")) {
6837 if (p.is_tier()) {
6838 ss << "application metadata must be set on base tier";
6839 return -EINVAL;
6840 }
6841
6842 if (!app_exists) {
6843 ss << "application '" << app << "' is not enabled on pool '" << pool_name
6844 << "'";
6845 return -ENOENT;
6846 }
6847
6848 string key;
6849 cmd_getval(g_ceph_context, cmdmap, "key", key);
6850
6851 if (key.empty()) {
6852 ss << "key must be provided";
6853 return -EINVAL;
6854 }
6855
6856 auto &app_keys = p.application_metadata[app];
6857 if (app_keys.count(key) == 0 &&
6858 app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
6859 ss << "too many keys set for application '" << app << "' on pool '"
6860 << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
6861 return -EINVAL;
6862 }
6863
6864 if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
6865 ss << "key '" << app << "' too long; max length "
6866 << MAX_POOL_APPLICATION_LENGTH;
6867 return -EINVAL;
6868 }
6869
6870 string value;
6871 cmd_getval(g_ceph_context, cmdmap, "value", value);
6872 if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
6873 ss << "value '" << value << "' too long; max length "
6874 << MAX_POOL_APPLICATION_LENGTH;
6875 return -EINVAL;
6876 }
6877
6878 p.application_metadata[app][key] = value;
6879 ss << "set application '" << app << "' key '" << key << "' to '"
6880 << value << "' on pool '" << pool_name << "'";
6881 } else if (boost::algorithm::ends_with(prefix, "rm")) {
6882 if (!app_exists) {
6883 ss << "application '" << app << "' is not enabled on pool '" << pool_name
6884 << "'";
6885 return -ENOENT;
6886 }
6887
6888 string key;
6889 cmd_getval(g_ceph_context, cmdmap, "key", key);
6890 auto it = p.application_metadata[app].find(key);
6891 if (it == p.application_metadata[app].end()) {
6892 ss << "application '" << app << "' on pool '" << pool_name
6893 << "' does not have key '" << key << "'";
6894 return 0; // idempotent
6895 }
6896
6897 p.application_metadata[app].erase(it);
6898 ss << "removed application '" << app << "' key '" << key << "' on pool '"
6899 << pool_name << "'";
6900 } else {
6901 assert(false);
6902 }
6903
6904 p.last_change = pending_inc.epoch;
6905 pending_inc.new_pools[pool] = p;
6906 return 0;
6907}
6908
31f18b77
FG
6909int OSDMonitor::_prepare_command_osd_crush_remove(
6910 CrushWrapper &newcrush,
6911 int32_t id,
6912 int32_t ancestor,
6913 bool has_ancestor,
6914 bool unlink_only)
6915{
6916 int err = 0;
6917
6918 if (has_ancestor) {
6919 err = newcrush.remove_item_under(g_ceph_context, id, ancestor,
6920 unlink_only);
6921 } else {
6922 err = newcrush.remove_item(g_ceph_context, id, unlink_only);
6923 }
6924 return err;
6925}
6926
6927void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
6928{
6929 pending_inc.crush.clear();
6930 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
6931}
6932
6933int OSDMonitor::prepare_command_osd_crush_remove(
6934 CrushWrapper &newcrush,
6935 int32_t id,
6936 int32_t ancestor,
6937 bool has_ancestor,
6938 bool unlink_only)
6939{
6940 int err = _prepare_command_osd_crush_remove(
6941 newcrush, id, ancestor,
6942 has_ancestor, unlink_only);
6943
6944 if (err < 0)
6945 return err;
6946
6947 assert(err == 0);
6948 do_osd_crush_remove(newcrush);
6949
6950 return 0;
6951}
6952
6953int OSDMonitor::prepare_command_osd_remove(int32_t id)
6954{
6955 if (osdmap.is_up(id)) {
6956 return -EBUSY;
6957 }
6958
6959 pending_inc.new_state[id] = osdmap.get_state(id);
6960 pending_inc.new_uuid[id] = uuid_d();
6961 pending_metadata_rm.insert(id);
6962 pending_metadata.erase(id);
6963
6964 return 0;
6965}
6966
6967int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
6968{
6969 assert(existing_id);
6970 *existing_id = -1;
6971
6972 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
6973 if (!osdmap.exists(i) &&
6974 pending_inc.new_up_client.count(i) == 0 &&
6975 (pending_inc.new_state.count(i) == 0 ||
6976 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
6977 *existing_id = i;
6978 return -1;
6979 }
6980 }
6981
6982 if (pending_inc.new_max_osd < 0) {
6983 return osdmap.get_max_osd();
6984 }
6985 return pending_inc.new_max_osd;
6986}
6987
// Unconditionally create an osd in the pending incremental, either
// reusing the id already associated with `uuid`, honoring a caller
// supplied `id`, or allocating a fresh one.  The chosen id is returned
// via *new_id.  If `device_class` is non-empty, the crush device class
// is also staged (best effort).  Validation is the caller's job; this
// function asserts on inconsistent input.
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const string& device_class,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  assert(new_id);

  // We presume validation has been performed prior to calling this
  // function. We assert with prejudice.

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    // a known uuid wins: reuse its id (and it must match `id` if given).
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
           << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    // reusing a free slot below max_osd; mark it OUT until it boots.
    assert(existing_id < osdmap.get_max_osd());
    assert(allocated_id < 0);
    pending_inc.new_weight[existing_id] = CEPH_OSD_OUT;
    *new_id = existing_id;
  } else if (allocated_id >= 0) {
    assert(existing_id < 0);
    // raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    assert(*new_id == allocated_id);
  } else {
    assert(0 == "unexpected condition");
  }

out:
  if (device_class.size()) {
    // stage the requested crush device class for the new osd.
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (newcrush.get_max_devices() < *new_id + 1) {
      newcrush.set_max_devices(*new_id + 1);
    }
    string name = string("osd.") + stringify(*new_id);
    if (!newcrush.item_exists(*new_id)) {
      newcrush.set_item_name(*new_id, name);
    }
    ostringstream ss;
    int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
    if (r < 0) {
      derr << __func__ << " failed to set " << name << " device_class "
           << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
           << dendl;
      // non-fatal... this might be a replay and we want to be idempotent.
    } else {
      dout(20) << __func__ << " set " << name << " device_class " << device_class
               << dendl;
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
  } else {
    dout(20) << __func__ << " no device_class" << dendl;
  }

  dout(10) << __func__ << " using id " << *new_id << dendl;
  // ensure max_osd covers the chosen id, whichever path picked it.
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  pending_inc.new_state[*new_id] |= CEPH_OSD_EXISTS | CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
7076
7077int OSDMonitor::validate_osd_create(
7078 const int32_t id,
7079 const uuid_d& uuid,
7080 const bool check_osd_exists,
7081 int32_t* existing_id,
7082 stringstream& ss)
7083{
7084
7085 dout(10) << __func__ << " id " << id << " uuid " << uuid
7086 << " check_osd_exists " << check_osd_exists << dendl;
7087
7088 assert(existing_id);
7089
7090 if (id < 0 && uuid.is_zero()) {
7091 // we have nothing to validate
7092 *existing_id = -1;
7093 return 0;
7094 } else if (uuid.is_zero()) {
7095 // we have an id but we will ignore it - because that's what
7096 // `osd create` does.
7097 return 0;
7098 }
7099
7100 /*
7101 * This function will be used to validate whether we are able to
7102 * create a new osd when the `uuid` is specified.
7103 *
7104 * It will be used by both `osd create` and `osd new`, as the checks
7105 * are basically the same when it pertains to osd id and uuid validation.
7106 * However, `osd create` presumes an `uuid` is optional, for legacy
7107 * reasons, while `osd new` requires the `uuid` to be provided. This
7108 * means that `osd create` will not be idempotent if an `uuid` is not
7109 * provided, but we will always guarantee the idempotency of `osd new`.
7110 */
7111
7112 assert(!uuid.is_zero());
7113 if (pending_inc.identify_osd(uuid) >= 0) {
7114 // osd is about to exist
7115 return -EAGAIN;
7116 }
7117
7118 int32_t i = osdmap.identify_osd(uuid);
7119 if (i >= 0) {
7120 // osd already exists
7121 if (id >= 0 && i != id) {
7122 ss << "uuid " << uuid << " already in use for different id " << i;
7123 return -EEXIST;
7124 }
7125 // return a positive errno to distinguish between a blocking error
7126 // and an error we consider to not be a problem (i.e., this would be
7127 // an idempotent operation).
7128 *existing_id = i;
7129 return EEXIST;
7130 }
7131 // i < 0
7132 if (id >= 0) {
7133 if (pending_inc.new_state.count(id)) {
7134 // osd is about to exist
7135 return -EAGAIN;
7136 }
7137 // we may not care if an osd exists if we are recreating a previously
7138 // destroyed osd.
7139 if (check_osd_exists && osdmap.exists(id)) {
7140 ss << "id " << id << " already in use and does not match uuid "
7141 << uuid;
7142 return -EINVAL;
7143 }
7144 }
7145 return 0;
7146}
7147
7148int OSDMonitor::prepare_command_osd_create(
7149 const int32_t id,
7150 const uuid_d& uuid,
7151 int32_t* existing_id,
7152 stringstream& ss)
7153{
7154 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
7155 assert(existing_id);
b5b8bbf5
FG
7156 if (osdmap.is_destroyed(id)) {
7157 ss << "ceph osd create has been deprecated. Please use ceph osd new "
7158 "instead.";
7159 return -EINVAL;
7160 }
31f18b77
FG
7161
7162 if (uuid.is_zero()) {
7163 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
7164 }
7165
7166 return validate_osd_create(id, uuid, true, existing_id, ss);
7167}
7168
// Handle `osd new`: create a brand new osd, or recreate a previously
// destroyed one, optionally registering cephx/lockbox secrets and a
// dm-crypt key supplied via `params`.
//
// Returns 0 on success (updates staged; caller proposes), positive
// EEXIST when the request was fully idempotent (nothing staged), or a
// negative errno on failure.  On success the chosen osd id is reported
// through `f` (as "osdid") or `ss`.
int OSDMonitor::prepare_command_osd_new(
  MonOpRequestRef op,
  const map<string,cmd_vartype>& cmdmap,
  const map<string,string>& params,
  stringstream &ss,
  Formatter *f)
{
  uuid_d uuid;
  string uuidstr;
  int64_t id = -1;

  // we stage updates across multiple services (osdmon, authmon, config
  // keys); paxos must be plugged so they commit together.
  assert(paxos->is_plugged());

  dout(10) << __func__ << " " << op << dendl;

  /* validate command. abort now if something's wrong. */

  /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
   *
   * If `id` is not specified, we will identify any existing osd based
   * on `uuid`. Operation will be idempotent iff secrets match.
   *
   * If `id` is specified, we will identify any existing osd based on
   * `uuid` and match against `id`. If they match, operation will be
   * idempotent iff secrets match.
   *
   * `-i secrets.json` will be optional. If supplied, will be used
   * to check for idempotency when `id` and `uuid` match.
   *
   * If `id` is not specified, and `uuid` does not exist, an id will
   * be found or allocated for the osd.
   *
   * If `id` is specified, and the osd has been previously marked
   * as destroyed, then the `id` will be reused.
   */
  if (!cmd_getval(g_ceph_context, cmdmap, "uuid", uuidstr)) {
    ss << "requires the OSD's UUID to be specified.";
    return -EINVAL;
  } else if (!uuid.parse(uuidstr.c_str())) {
    ss << "invalid UUID value '" << uuidstr << "'.";
    return -EINVAL;
  }

  if (cmd_getval(g_ceph_context, cmdmap, "id", id) &&
      (id < 0)) {
    ss << "invalid OSD id; must be greater or equal than zero.";
    return -EINVAL;
  }

  // are we running an `osd create`-like command, or recreating
  // a previously destroyed osd?

  bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));

  // we will care about `id` to assess whether osd is `destroyed`, or
  // to create a new osd.
  // we will need an `id` by the time we reach auth.

  int32_t existing_id = -1;
  // when recreating a destroyed osd the id is expected to exist, so we
  // skip the exists check in that case.
  int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
                                &existing_id, ss);

  bool may_be_idempotent = false;
  if (err == EEXIST) {
    // this is idempotent from the osdmon's point-of-view
    may_be_idempotent = true;
    assert(existing_id >= 0);
    id = existing_id;
  } else if (err < 0) {
    return err;
  }

  if (!may_be_idempotent) {
    // idempotency is out of the window. We are either creating a new
    // osd or recreating a destroyed osd.
    //
    // We now need to figure out if we have an `id` (and if it's valid),
    // of find an `id` if we don't have one.

    // NOTE: we need to consider the case where the `id` is specified for
    // `osd create`, and we must honor it. So this means checking if
    // the `id` is destroyed, and if so assume the destroy; otherwise,
    // check if it `exists` - in which case we complain about not being
    // `destroyed`. In the end, if nothing fails, we must allow the
    // creation, so that we are compatible with `create`.
    if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
      ss << "OSD " << id << " has not yet been destroyed";
      return -EINVAL;
    } else if (id < 0) {
      // find an `id`
      id = _allocate_osd_id(&existing_id);
      if (id < 0) {
        assert(existing_id >= 0);
        id = existing_id;
      }
      dout(10) << __func__ << " found id " << id << " to use" << dendl;
    } else if (id >= 0 && osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " recreating osd." << id << dendl;
    } else {
      dout(10) << __func__ << " creating new osd." << id << dendl;
    }
  } else {
    assert(id >= 0);
    assert(osdmap.exists(id));
  }

  // we are now able to either create a brand new osd or reuse an existing
  // osd that has been previously destroyed.

  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;

  if (may_be_idempotent && params.empty()) {
    // nothing to do, really.
    dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
    assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }

  string device_class;
  auto p = params.find("crush_device_class");
  if (p != params.end()) {
    device_class = p->second;
    dout(20) << __func__ << " device_class will be " << device_class << dendl;
  }
  string cephx_secret, lockbox_secret, dmcrypt_key;
  bool has_lockbox = false;
  bool has_secrets = params.count("cephx_secret")
    || params.count("cephx_lockbox_secret")
    || params.count("dmcrypt_key");

  ConfigKeyService *svc = nullptr;
  AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;

  if (has_secrets) {
    // cephx_secret is mandatory whenever any secret is supplied; the
    // lockbox secret and dm-crypt key must come as a pair.
    if (params.count("cephx_secret") == 0) {
      ss << "requires a cephx secret.";
      return -EINVAL;
    }
    cephx_secret = params.at("cephx_secret");

    bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
    bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);

    dout(10) << __func__ << " has lockbox " << has_lockbox_secret
             << " dmcrypt " << has_dmcrypt_key << dendl;

    if (has_lockbox_secret && has_dmcrypt_key) {
      has_lockbox = true;
      lockbox_secret = params.at("cephx_lockbox_secret");
      dmcrypt_key = params.at("dmcrypt_key");
    } else if (!has_lockbox_secret != !has_dmcrypt_key) {
      ss << "requires both a cephx lockbox secret and a dm-crypt key.";
      return -EINVAL;
    }

    dout(10) << __func__ << " validate secrets using osd id " << id << dendl;

    err = mon->authmon()->validate_osd_new(id, uuid,
                                           cephx_secret,
                                           lockbox_secret,
                                           cephx_entity,
                                           lockbox_entity,
                                           ss);
    if (err < 0) {
      return err;
    } else if (may_be_idempotent && err != EEXIST) {
      // for this to be idempotent, `id` should already be >= 0; no need
      // to use validate_id.
      assert(id >= 0);
      ss << "osd." << id << " exists but secrets do not match";
      return -EEXIST;
    }

    if (has_lockbox) {
      svc = (ConfigKeyService*)mon->config_key_service;
      err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
      if (err < 0) {
        return err;
      } else if (may_be_idempotent && err != EEXIST) {
        assert(id >= 0);
        ss << "osd." << id << " exists but dm-crypt key does not match.";
        return -EEXIST;
      }
    }
  }
  assert(!has_secrets || !cephx_secret.empty());
  assert(!has_lockbox || !lockbox_secret.empty());

  if (may_be_idempotent) {
    // we have nothing to do for either the osdmon or the authmon,
    // and we have no lockbox - so the config key service will not be
    // touched. This is therefore an idempotent operation, and we can
    // just return right away.
    dout(10) << __func__ << " idempotent -- no op." << dendl;
    assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }
  assert(!may_be_idempotent);

  // perform updates.
  if (has_secrets) {
    assert(!cephx_secret.empty());
    assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
           (!lockbox_secret.empty() && !dmcrypt_key.empty()));

    err = mon->authmon()->do_osd_new(cephx_entity,
                                     lockbox_entity,
                                     has_lockbox);
    assert(0 == err);

    if (has_lockbox) {
      assert(nullptr != svc);
      svc->do_osd_new(uuid, dmcrypt_key);
    }
  }

  if (is_recreate_destroyed) {
    // reuse the destroyed slot in place; no crush/id allocation needed.
    assert(id >= 0);
    assert(osdmap.is_destroyed(id));
    pending_inc.new_weight[id] = CEPH_OSD_OUT;
    pending_inc.new_state[id] |= CEPH_OSD_DESTROYED | CEPH_OSD_NEW;
    if (osdmap.get_state(id) & CEPH_OSD_UP) {
      // due to http://tracker.ceph.com/issues/20751 some clusters may
      // have UP set for non-existent OSDs; make sure it is cleared
      // for a newly created osd.
      // (new_state toggles the bit, so setting UP here clears it.)
      pending_inc.new_state[id] |= CEPH_OSD_UP;
    }
    pending_inc.new_uuid[id] = uuid;
  } else {
    assert(id >= 0);
    int32_t new_id = -1;
    do_osd_create(id, uuid, device_class, &new_id);
    assert(new_id >= 0);
    assert(id == new_id);
  }

  if (f) {
    f->open_object_section("created_osd");
    f->dump_int("osdid", id);
    f->close_section();
  } else {
    ss << id;
  }

  return 0;
}
7430
7c673cae
FG
7431bool OSDMonitor::prepare_command(MonOpRequestRef op)
7432{
7433 op->mark_osdmon_event(__func__);
7434 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
7435 stringstream ss;
7436 map<string, cmd_vartype> cmdmap;
7437 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
7438 string rs = ss.str();
7439 mon->reply_command(op, -EINVAL, rs, get_last_committed());
7440 return true;
7441 }
7442
7443 MonSession *session = m->get_session();
7444 if (!session) {
7445 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
7446 return true;
7447 }
7448
7449 return prepare_command_impl(op, cmdmap);
7450}
7451
7452static int parse_reweights(CephContext *cct,
7453 const map<string,cmd_vartype> &cmdmap,
7454 const OSDMap& osdmap,
7455 map<int32_t, uint32_t>* weights)
7456{
7457 string weights_str;
7458 if (!cmd_getval(g_ceph_context, cmdmap, "weights", weights_str)) {
7459 return -EINVAL;
7460 }
7461 std::replace(begin(weights_str), end(weights_str), '\'', '"');
7462 json_spirit::mValue json_value;
7463 if (!json_spirit::read(weights_str, json_value)) {
7464 return -EINVAL;
7465 }
7466 if (json_value.type() != json_spirit::obj_type) {
7467 return -EINVAL;
7468 }
7469 const auto obj = json_value.get_obj();
7470 try {
7471 for (auto& osd_weight : obj) {
7472 auto osd_id = std::stoi(osd_weight.first);
7473 if (!osdmap.exists(osd_id)) {
7474 return -ENOENT;
7475 }
7476 if (osd_weight.second.type() != json_spirit::str_type) {
7477 return -EINVAL;
7478 }
7479 auto weight = std::stoul(osd_weight.second.get_str());
7480 weights->insert({osd_id, weight});
7481 }
7482 } catch (const std::logic_error& e) {
7483 return -EINVAL;
7484 }
7485 return 0;
7486}
7487
31f18b77
FG
7488int OSDMonitor::prepare_command_osd_destroy(
7489 int32_t id,
7490 stringstream& ss)
7491{
7492 assert(paxos->is_plugged());
7493
7494 // we check if the osd exists for the benefit of `osd purge`, which may
7495 // have previously removed the osd. If the osd does not exist, return
7496 // -ENOENT to convey this, and let the caller deal with it.
7497 //
7498 // we presume that all auth secrets and config keys were removed prior
7499 // to this command being called. if they exist by now, we also assume
7500 // they must have been created by some other command and do not pertain
7501 // to this non-existent osd.
7502 if (!osdmap.exists(id)) {
7503 dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
7504 return -ENOENT;
7505 }
7506
7507 uuid_d uuid = osdmap.get_uuid(id);
7508 dout(10) << __func__ << " destroying osd." << id
7509 << " uuid " << uuid << dendl;
7510
7511 // if it has been destroyed, we assume our work here is done.
7512 if (osdmap.is_destroyed(id)) {
7513 ss << "destroyed osd." << id;
7514 return 0;
7515 }
7516
7517 EntityName cephx_entity, lockbox_entity;
7518 bool idempotent_auth = false, idempotent_cks = false;
7519
7520 int err = mon->authmon()->validate_osd_destroy(id, uuid,
7521 cephx_entity,
7522 lockbox_entity,
7523 ss);
7524 if (err < 0) {
7525 if (err == -ENOENT) {
7526 idempotent_auth = true;
31f18b77
FG
7527 } else {
7528 return err;
7529 }
7530 }
7531
7532 ConfigKeyService *svc = (ConfigKeyService*)mon->config_key_service;
7533 err = svc->validate_osd_destroy(id, uuid);
7534 if (err < 0) {
7535 assert(err == -ENOENT);
7536 err = 0;
7537 idempotent_cks = true;
7538 }
7539
7540 if (!idempotent_auth) {
7541 err = mon->authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
7542 assert(0 == err);
7543 }
7544
7545 if (!idempotent_cks) {
7546 svc->do_osd_destroy(id, uuid);
7547 }
7548
7549 pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
7550 pending_inc.new_uuid[id] = uuid_d();
7551
7552 // we can only propose_pending() once per service, otherwise we'll be
7553 // defying PaxosService and all laws of nature. Therefore, as we may
7554 // be used during 'osd purge', let's keep the caller responsible for
7555 // proposing.
7556 assert(err == 0);
7557 return 0;
7558}
7559
// Fully remove osd.<id> from the cluster: crush, auth/config-key data
// (via destroy), and the osdmap itself.  The osd must not be up.
// Returns -ENOENT when everything was already gone (idempotent case),
// 0 on success, or a negative errno on failure.  Caller proposes.
int OSDMonitor::prepare_command_osd_purge(
  int32_t id,
  stringstream& ss)
{
  assert(paxos->is_plugged());
  dout(10) << __func__ << " purging osd." << id << dendl;

  assert(!osdmap.is_up(id));

  /*
   * This may look a bit weird, but this is what's going to happen:
   *
   *  1. we make sure that removing from crush works
   *  2. we call `prepare_command_osd_destroy()`. If it returns an
   *     error, then we abort the whole operation, as no updates
   *     have been made. However, this function will have
   *     side-effects, thus we need to make sure that all operations
   *     performed henceforth will *always* succeed.
   *  3. we call `prepare_command_osd_remove()`. Although this
   *     function can return an error, it currently only checks if the
   *     osd is up - and we have made sure that it is not so, so there
   *     is no conflict, and it is effectively an update.
   *  4. finally, we call `do_osd_crush_remove()`, which will perform
   *     the crush update we delayed from before.
   */

  CrushWrapper newcrush;
  _get_pending_crush(newcrush);

  bool may_be_idempotent = false;

  // step 1: validate the crush removal, but don't stage it yet (step 4).
  int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
  if (err == -ENOENT) {
    err = 0;
    may_be_idempotent = true;
  } else if (err < 0) {
    ss << "error removing osd." << id << " from crush";
    return err;
  }

  // no point destroying the osd again if it has already been marked destroyed
  if (!osdmap.is_destroyed(id)) {
    // step 2: revoke auth/config-key data and mark destroyed.
    err = prepare_command_osd_destroy(id, ss);
    if (err < 0) {
      if (err == -ENOENT) {
        err = 0;
      } else {
        return err;
      }
    } else {
      may_be_idempotent = false;
    }
  }
  assert(0 == err);

  if (may_be_idempotent && !osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist and "
             << "we are idempotent." << dendl;
    return -ENOENT;
  }

  // step 3: remove from the osdmap proper.
  err = prepare_command_osd_remove(id);
  // we should not be busy, as we should have made sure this id is not up.
  assert(0 == err);

  // step 4: stage the crush update validated in step 1.
  do_osd_crush_remove(newcrush);
  return 0;
}
7628
7c673cae
FG
7629bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
7630 map<string,cmd_vartype> &cmdmap)
7631{
7632 op->mark_osdmon_event(__func__);
7633 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
7634 bool ret = false;
7635 stringstream ss;
7636 string rs;
7637 bufferlist rdata;
7638 int err = 0;
7639
7640 string format;
7641 cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
7642 boost::scoped_ptr<Formatter> f(Formatter::create(format));
7643
7644 string prefix;
7645 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
7646
7647 int64_t osdid;
7648 string name;
b32b8144
FG
7649 bool osdid_present = false;
7650 if (prefix != "osd pg-temp" &&
7651 prefix != "osd pg-upmap" &&
7652 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
7653 osdid_present = cmd_getval(g_ceph_context, cmdmap, "id", osdid);
7654 }
7c673cae
FG
7655 if (osdid_present) {
7656 ostringstream oss;
7657 oss << "osd." << osdid;
7658 name = oss.str();
7659 }
7660
7661 // Even if there's a pending state with changes that could affect
7662 // a command, considering that said state isn't yet committed, we
7663 // just don't care about those changes if the command currently being
7664 // handled acts as a no-op against the current committed state.
7665 // In a nutshell, we assume this command happens *before*.
7666 //
7667 // Let me make this clearer:
7668 //
7669 // - If we have only one client, and that client issues some
7670 // operation that would conflict with this operation but is
7671 // still on the pending state, then we would be sure that said
7672 // operation wouldn't have returned yet, so the client wouldn't
7673 // issue this operation (unless the client didn't wait for the
7674 // operation to finish, and that would be the client's own fault).
7675 //
7676 // - If we have more than one client, each client will observe
7677 // whatever is the state at the moment of the commit. So, if we
7678 // have two clients, one issuing an unlink and another issuing a
7679 // link, and if the link happens while the unlink is still on the
7680 // pending state, from the link's point-of-view this is a no-op.
7681 // If different clients are issuing conflicting operations and
7682 // they care about that, then the clients should make sure they
7683 // enforce some kind of concurrency mechanism -- from our
7684 // perspective that's what Douglas Adams would call an SEP.
7685 //
7686 // This should be used as a general guideline for most commands handled
7687 // in this function. Adapt as you see fit, but please bear in mind that
7688 // this is the expected behavior.
7689
7690
7691 if (prefix == "osd setcrushmap" ||
7692 (prefix == "osd crush set" && !osdid_present)) {
31f18b77
FG
7693 if (pending_inc.crush.length()) {
7694 dout(10) << __func__ << " waiting for pending crush update " << dendl;
7695 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
7696 return true;
7697 }
7c673cae
FG
7698 dout(10) << "prepare_command setting new crush map" << dendl;
7699 bufferlist data(m->get_data());
7700 CrushWrapper crush;
7701 try {
7702 bufferlist::iterator bl(data.begin());
7703 crush.decode(bl);
7704 }
7705 catch (const std::exception &e) {
7706 err = -EINVAL;
7707 ss << "Failed to parse crushmap: " << e.what();
7708 goto reply;
7709 }
31f18b77
FG
7710
7711 int64_t prior_version = 0;
7712 if (cmd_getval(g_ceph_context, cmdmap, "prior_version", prior_version)) {
7713 if (prior_version == osdmap.get_crush_version() - 1) {
7714 // see if we are a resend of the last update. this is imperfect
7715 // (multiple racing updaters may not both get reliable success)
7716 // but we expect crush updaters (via this interface) to be rare-ish.
7717 bufferlist current, proposed;
7718 osdmap.crush->encode(current, mon->get_quorum_con_features());
7719 crush.encode(proposed, mon->get_quorum_con_features());
7720 if (current.contents_equal(proposed)) {
7721 dout(10) << __func__
7722 << " proposed matches current and version equals previous"
7723 << dendl;
7724 err = 0;
7725 ss << osdmap.get_crush_version();
7726 goto reply;
7727 }
7728 }
7729 if (prior_version != osdmap.get_crush_version()) {
7730 err = -EPERM;
7731 ss << "prior_version " << prior_version << " != crush version "
7732 << osdmap.get_crush_version();
7733 goto reply;
7734 }
7735 }
7c673cae 7736
3efd9988 7737 if (crush.has_legacy_rule_ids()) {
31f18b77
FG
7738 err = -EINVAL;
7739 ss << "crush maps with ruleset != ruleid are no longer allowed";
7740 goto reply;
7741 }
7c673cae
FG
7742 if (!validate_crush_against_features(&crush, ss)) {
7743 err = -EINVAL;
7744 goto reply;
7745 }
31f18b77 7746
3efd9988
FG
7747 err = osdmap.validate_crush_rules(&crush, &ss);
7748 if (err < 0) {
7749 goto reply;
7c673cae
FG
7750 }
7751
224ce89b
WB
7752 if (g_conf->mon_osd_crush_smoke_test) {
7753 // sanity check: test some inputs to make sure this map isn't
7754 // totally broken
7755 dout(10) << " testing map" << dendl;
7756 stringstream ess;
7757 CrushTester tester(crush, ess);
b5b8bbf5 7758 tester.set_min_x(0);
224ce89b 7759 tester.set_max_x(50);
b5b8bbf5 7760 auto start = ceph::coarse_mono_clock::now();
224ce89b 7761 int r = tester.test_with_fork(g_conf->mon_lease);
b5b8bbf5 7762 auto duration = ceph::coarse_mono_clock::now() - start;
224ce89b
WB
7763 if (r < 0) {
7764 dout(10) << " tester.test_with_fork returns " << r
7765 << ": " << ess.str() << dendl;
7766 ss << "crush smoke test failed with " << r << ": " << ess.str();
7767 err = r;
7768 goto reply;
7769 }
b5b8bbf5
FG
7770 dout(10) << __func__ << " crush somke test duration: "
7771 << duration << ", result: " << ess.str() << dendl;
7c673cae
FG
7772 }
7773
7c673cae 7774 pending_inc.crush = data;
31f18b77 7775 ss << osdmap.get_crush_version() + 1;
7c673cae
FG
7776 goto update;
7777
3efd9988
FG
7778 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
7779 CrushWrapper newcrush;
7780 _get_pending_crush(newcrush);
7781 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
7782 int bid = -1 - b;
7783 if (newcrush.bucket_exists(bid) &&
7784 newcrush.get_bucket_alg(bid)) {
7785 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
7786 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
7787 }
7788 }
7789 if (!validate_crush_against_features(&newcrush, ss)) {
7790 err = -EINVAL;
7791 goto reply;
7792 }
7793 pending_inc.crush.clear();
7794 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7795 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7796 get_last_committed() + 1));
7797 return true;
7c673cae 7798 } else if (prefix == "osd crush set-device-class") {
224ce89b
WB
7799 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
7800 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
7801 << "luminous' before using crush device classes";
7802 err = -EPERM;
7c673cae
FG
7803 goto reply;
7804 }
7805
7806 string device_class;
7807 if (!cmd_getval(g_ceph_context, cmdmap, "class", device_class)) {
7808 err = -EINVAL; // no value!
7809 goto reply;
7810 }
7811
224ce89b
WB
7812 bool stop = false;
7813 vector<string> idvec;
7814 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
7c673cae
FG
7815 CrushWrapper newcrush;
7816 _get_pending_crush(newcrush);
224ce89b
WB
7817 set<int> updated;
7818 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
7819 set<int> osds;
7820 // wildcard?
7821 if (j == 0 &&
7822 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
7823 osdmap.get_all_osds(osds);
7824 stop = true;
7825 } else {
7826 // try traditional single osd way
7827 long osd = parse_osd_id(idvec[j].c_str(), &ss);
7828 if (osd < 0) {
7829 // ss has reason for failure
7830 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
7831 err = -EINVAL;
7832 continue;
7833 }
7834 osds.insert(osd);
7835 }
7c673cae 7836
224ce89b
WB
7837 for (auto &osd : osds) {
7838 if (!osdmap.exists(osd)) {
7839 ss << "osd." << osd << " does not exist. ";
7840 continue;
7841 }
7c673cae 7842
224ce89b
WB
7843 ostringstream oss;
7844 oss << "osd." << osd;
7845 string name = oss.str();
7c673cae 7846
3a9019d9
FG
7847 if (newcrush.get_max_devices() < osd + 1) {
7848 newcrush.set_max_devices(osd + 1);
7849 }
224ce89b
WB
7850 string action;
7851 if (newcrush.item_exists(osd)) {
7852 action = "updating";
7853 } else {
7854 action = "creating";
7855 newcrush.set_item_name(osd, name);
7856 }
7c673cae 7857
224ce89b
WB
7858 dout(5) << action << " crush item id " << osd << " name '" << name
7859 << "' device_class '" << device_class << "'"
7860 << dendl;
7861 err = newcrush.update_device_class(osd, device_class, name, &ss);
7862 if (err < 0) {
7863 goto reply;
7864 }
7865 if (err == 0 && !_have_pending_crush()) {
7866 if (!stop) {
7867 // for single osd only, wildcard makes too much noise
7868 ss << "set-device-class item id " << osd << " name '" << name
7869 << "' device_class '" << device_class << "': no change";
7870 }
7871 } else {
7872 updated.insert(osd);
7873 }
7874 }
7c673cae
FG
7875 }
7876
224ce89b
WB
7877 if (!updated.empty()) {
7878 pending_inc.crush.clear();
7879 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7880 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
7881 getline(ss, rs);
7882 wait_for_finished_proposal(op,
7883 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
7884 return true;
7885 }
7c673cae 7886
c07f9fc5
FG
7887 } else if (prefix == "osd crush rm-device-class") {
7888 bool stop = false;
7889 vector<string> idvec;
7890 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
7891 CrushWrapper newcrush;
7892 _get_pending_crush(newcrush);
7893 set<int> updated;
7894
7895 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
7896 set<int> osds;
7897
7898 // wildcard?
7899 if (j == 0 &&
7900 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
7901 osdmap.get_all_osds(osds);
7902 stop = true;
7903 } else {
7904 // try traditional single osd way
7905 long osd = parse_osd_id(idvec[j].c_str(), &ss);
7906 if (osd < 0) {
7907 // ss has reason for failure
7908 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
7909 err = -EINVAL;
7910 goto reply;
7911 }
7912 osds.insert(osd);
7913 }
7914
7915 for (auto &osd : osds) {
7916 if (!osdmap.exists(osd)) {
7917 ss << "osd." << osd << " does not exist. ";
7918 continue;
7919 }
7920
7921 auto class_name = newcrush.get_item_class(osd);
c07f9fc5
FG
7922 if (!class_name) {
7923 ss << "osd." << osd << " belongs to no class, ";
7924 continue;
7925 }
7926 // note that we do not verify if class_is_in_use here
7927 // in case the device is misclassified and user wants
7928 // to overridely reset...
7929
7930 err = newcrush.remove_device_class(g_ceph_context, osd, &ss);
7931 if (err < 0) {
7932 // ss has reason for failure
7933 goto reply;
7934 }
7935 updated.insert(osd);
7936 }
7937 }
7938
7939 if (!updated.empty()) {
7940 pending_inc.crush.clear();
7941 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7942 ss << "done removing class of osd(s): " << updated;
7943 getline(ss, rs);
7944 wait_for_finished_proposal(op,
7945 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
7946 return true;
7947 }
35e4c445
FG
7948 } else if (prefix == "osd crush class rename") {
7949 string srcname, dstname;
7950 if (!cmd_getval(g_ceph_context, cmdmap, "srcname", srcname)) {
7951 err = -EINVAL;
7952 goto reply;
7953 }
7954 if (!cmd_getval(g_ceph_context, cmdmap, "dstname", dstname)) {
7955 err = -EINVAL;
7956 goto reply;
7957 }
7958
7959 CrushWrapper newcrush;
7960 _get_pending_crush(newcrush);
181888fb
FG
7961 if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
7962 // suppose this is a replay and return success
7963 // so command is idempotent
7964 ss << "already renamed to '" << dstname << "'";
7965 err = 0;
35e4c445
FG
7966 goto reply;
7967 }
c07f9fc5 7968
35e4c445
FG
7969 err = newcrush.rename_class(srcname, dstname);
7970 if (err < 0) {
7971 ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
7972 << cpp_strerror(err);
7973 goto reply;
7974 }
7975
7976 pending_inc.crush.clear();
7977 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7978 ss << "rename class '" << srcname << "' to '" << dstname << "'";
7979 goto update;
7c673cae
FG
7980 } else if (prefix == "osd crush add-bucket") {
7981 // os crush add-bucket <name> <type>
7982 string name, typestr;
7983 cmd_getval(g_ceph_context, cmdmap, "name", name);
7984 cmd_getval(g_ceph_context, cmdmap, "type", typestr);
7985
7986 if (!_have_pending_crush() &&
7987 _get_stable_crush().name_exists(name)) {
7988 ss << "bucket '" << name << "' already exists";
7989 goto reply;
7990 }
7991
7992 CrushWrapper newcrush;
7993 _get_pending_crush(newcrush);
7994
7995 if (newcrush.name_exists(name)) {
7996 ss << "bucket '" << name << "' already exists";
7997 goto update;
7998 }
7999 int type = newcrush.get_type_id(typestr);
8000 if (type < 0) {
8001 ss << "type '" << typestr << "' does not exist";
8002 err = -EINVAL;
8003 goto reply;
8004 }
8005 if (type == 0) {
8006 ss << "type '" << typestr << "' is for devices, not buckets";
8007 err = -EINVAL;
8008 goto reply;
8009 }
8010 int bucketno;
8011 err = newcrush.add_bucket(0, 0,
8012 CRUSH_HASH_DEFAULT, type, 0, NULL,
8013 NULL, &bucketno);
8014 if (err < 0) {
8015 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
8016 goto reply;
8017 }
8018 err = newcrush.set_item_name(bucketno, name);
8019 if (err < 0) {
8020 ss << "error setting bucket name to '" << name << "'";
8021 goto reply;
8022 }
8023
8024 pending_inc.crush.clear();
8025 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8026 ss << "added bucket " << name << " type " << typestr
8027 << " to crush map";
8028 goto update;
8029 } else if (prefix == "osd crush rename-bucket") {
8030 string srcname, dstname;
8031 cmd_getval(g_ceph_context, cmdmap, "srcname", srcname);
8032 cmd_getval(g_ceph_context, cmdmap, "dstname", dstname);
8033
8034 err = crush_rename_bucket(srcname, dstname, &ss);
8035 if (err == -EALREADY) // equivalent to success for idempotency
8036 err = 0;
8037 if (err)
8038 goto reply;
8039 else
8040 goto update;
c07f9fc5
FG
8041 } else if (prefix == "osd crush weight-set create" ||
8042 prefix == "osd crush weight-set create-compat") {
8043 CrushWrapper newcrush;
8044 _get_pending_crush(newcrush);
8045 int64_t pool;
8046 int positions;
8047 if (newcrush.has_non_straw2_buckets()) {
8048 ss << "crush map contains one or more bucket(s) that are not straw2";
224ce89b
WB
8049 err = -EPERM;
8050 goto reply;
8051 }
c07f9fc5
FG
8052 if (prefix == "osd crush weight-set create") {
8053 if (osdmap.require_min_compat_client > 0 &&
8054 osdmap.require_min_compat_client < CEPH_RELEASE_LUMINOUS) {
8055 ss << "require_min_compat_client "
8056 << ceph_release_name(osdmap.require_min_compat_client)
8057 << " < luminous, which is required for per-pool weight-sets. "
8058 << "Try 'ceph osd set-require-min-compat-client luminous' "
8059 << "before using the new interface";
8060 err = -EPERM;
8061 goto reply;
8062 }
8063 string poolname, mode;
8064 cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
8065 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
8066 if (pool < 0) {
8067 ss << "pool '" << poolname << "' not found";
8068 err = -ENOENT;
8069 goto reply;
8070 }
8071 cmd_getval(g_ceph_context, cmdmap, "mode", mode);
8072 if (mode != "flat" && mode != "positional") {
8073 ss << "unrecognized weight-set mode '" << mode << "'";
8074 err = -EINVAL;
8075 goto reply;
8076 }
8077 positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
8078 } else {
8079 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
8080 positions = 1;
224ce89b 8081 }
c07f9fc5
FG
8082 newcrush.create_choose_args(pool, positions);
8083 pending_inc.crush.clear();
8084 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8085 goto update;
224ce89b 8086
c07f9fc5
FG
8087 } else if (prefix == "osd crush weight-set rm" ||
8088 prefix == "osd crush weight-set rm-compat") {
224ce89b
WB
8089 CrushWrapper newcrush;
8090 _get_pending_crush(newcrush);
c07f9fc5
FG
8091 int64_t pool;
8092 if (prefix == "osd crush weight-set rm") {
8093 string poolname;
8094 cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
8095 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
8096 if (pool < 0) {
8097 ss << "pool '" << poolname << "' not found";
8098 err = -ENOENT;
8099 goto reply;
8100 }
8101 } else {
8102 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
224ce89b 8103 }
c07f9fc5
FG
8104 newcrush.rm_choose_args(pool);
8105 pending_inc.crush.clear();
8106 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8107 goto update;
224ce89b 8108
c07f9fc5
FG
8109 } else if (prefix == "osd crush weight-set reweight" ||
8110 prefix == "osd crush weight-set reweight-compat") {
8111 string poolname, item;
8112 vector<double> weight;
8113 cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
8114 cmd_getval(g_ceph_context, cmdmap, "item", item);
8115 cmd_getval(g_ceph_context, cmdmap, "weight", weight);
8116 CrushWrapper newcrush;
8117 _get_pending_crush(newcrush);
8118 int64_t pool;
8119 if (prefix == "osd crush weight-set reweight") {
8120 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
8121 if (pool < 0) {
8122 ss << "pool '" << poolname << "' not found";
8123 err = -ENOENT;
8124 goto reply;
8125 }
8126 if (!newcrush.have_choose_args(pool)) {
8127 ss << "no weight-set for pool '" << poolname << "'";
8128 err = -ENOENT;
8129 goto reply;
8130 }
8131 auto arg_map = newcrush.choose_args_get(pool);
8132 int positions = newcrush.get_choose_args_positions(arg_map);
8133 if (weight.size() != (size_t)positions) {
8134 ss << "must specify exact " << positions << " weight values";
8135 err = -EINVAL;
8136 goto reply;
8137 }
8138 } else {
8139 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
8140 if (!newcrush.have_choose_args(pool)) {
8141 ss << "no backward-compatible weight-set";
8142 err = -ENOENT;
8143 goto reply;
8144 }
224ce89b 8145 }
c07f9fc5
FG
8146 if (!newcrush.name_exists(item)) {
8147 ss << "item '" << item << "' does not exist";
8148 err = -ENOENT;
224ce89b
WB
8149 goto reply;
8150 }
c07f9fc5
FG
8151 err = newcrush.choose_args_adjust_item_weightf(
8152 g_ceph_context,
8153 newcrush.choose_args_get(pool),
8154 newcrush.get_item_id(item),
8155 weight,
8156 &ss);
224ce89b 8157 if (err < 0) {
224ce89b
WB
8158 goto reply;
8159 }
c07f9fc5 8160 err = 0;
224ce89b
WB
8161 pending_inc.crush.clear();
8162 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
224ce89b 8163 goto update;
7c673cae
FG
8164 } else if (osdid_present &&
8165 (prefix == "osd crush set" || prefix == "osd crush add")) {
8166 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
8167 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
8168 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
8169
8170 if (!osdmap.exists(osdid)) {
8171 err = -ENOENT;
c07f9fc5 8172 ss << name << " does not exist. Create it before updating the crush map";
7c673cae
FG
8173 goto reply;
8174 }
8175
8176 double weight;
8177 if (!cmd_getval(g_ceph_context, cmdmap, "weight", weight)) {
8178 ss << "unable to parse weight value '"
8179 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
8180 err = -EINVAL;
8181 goto reply;
8182 }
8183
8184 string args;
8185 vector<string> argvec;
8186 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
8187 map<string,string> loc;
8188 CrushWrapper::parse_loc_map(argvec, &loc);
8189
8190 if (prefix == "osd crush set"
8191 && !_get_stable_crush().item_exists(osdid)) {
8192 err = -ENOENT;
8193 ss << "unable to set item id " << osdid << " name '" << name
8194 << "' weight " << weight << " at location " << loc
8195 << ": does not exist";
8196 goto reply;
8197 }
8198
8199 dout(5) << "adding/updating crush item id " << osdid << " name '"
8200 << name << "' weight " << weight << " at location "
8201 << loc << dendl;
8202 CrushWrapper newcrush;
8203 _get_pending_crush(newcrush);
8204
8205 string action;
8206 if (prefix == "osd crush set" ||
8207 newcrush.check_item_loc(g_ceph_context, osdid, loc, (int *)NULL)) {
8208 action = "set";
8209 err = newcrush.update_item(g_ceph_context, osdid, weight, name, loc);
8210 } else {
8211 action = "add";
8212 err = newcrush.insert_item(g_ceph_context, osdid, weight, name, loc);
8213 if (err == 0)
8214 err = 1;
8215 }
8216
8217 if (err < 0)
8218 goto reply;
8219
8220 if (err == 0 && !_have_pending_crush()) {
8221 ss << action << " item id " << osdid << " name '" << name << "' weight "
8222 << weight << " at location " << loc << ": no change";
8223 goto reply;
8224 }
8225
8226 pending_inc.crush.clear();
8227 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8228 ss << action << " item id " << osdid << " name '" << name << "' weight "
8229 << weight << " at location " << loc << " to crush map";
8230 getline(ss, rs);
8231 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8232 get_last_committed() + 1));
8233 return true;
8234
8235 } else if (prefix == "osd crush create-or-move") {
8236 do {
8237 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
8238 if (!osdmap.exists(osdid)) {
8239 err = -ENOENT;
8240 ss << name << " does not exist. create it before updating the crush map";
8241 goto reply;
8242 }
8243
8244 double weight;
8245 if (!cmd_getval(g_ceph_context, cmdmap, "weight", weight)) {
8246 ss << "unable to parse weight value '"
8247 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
8248 err = -EINVAL;
8249 goto reply;
8250 }
8251
8252 string args;
8253 vector<string> argvec;
8254 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
8255 map<string,string> loc;
8256 CrushWrapper::parse_loc_map(argvec, &loc);
8257
8258 dout(0) << "create-or-move crush item name '" << name << "' initial_weight " << weight
8259 << " at location " << loc << dendl;
8260
8261 CrushWrapper newcrush;
8262 _get_pending_crush(newcrush);
8263
8264 err = newcrush.create_or_move_item(g_ceph_context, osdid, weight, name, loc);
8265 if (err == 0) {
8266 ss << "create-or-move updated item name '" << name << "' weight " << weight
8267 << " at location " << loc << " to crush map";
8268 break;
8269 }
8270 if (err > 0) {
8271 pending_inc.crush.clear();
8272 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8273 ss << "create-or-move updating item name '" << name << "' weight " << weight
8274 << " at location " << loc << " to crush map";
8275 getline(ss, rs);
8276 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8277 get_last_committed() + 1));
8278 return true;
8279 }
8280 } while (false);
8281
8282 } else if (prefix == "osd crush move") {
8283 do {
8284 // osd crush move <name> <loc1> [<loc2> ...]
8285
8286 string args;
8287 vector<string> argvec;
8288 cmd_getval(g_ceph_context, cmdmap, "name", name);
8289 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
8290 map<string,string> loc;
8291 CrushWrapper::parse_loc_map(argvec, &loc);
8292
8293 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
8294 CrushWrapper newcrush;
8295 _get_pending_crush(newcrush);
8296
8297 if (!newcrush.name_exists(name)) {
8298 err = -ENOENT;
8299 ss << "item " << name << " does not exist";
8300 break;
8301 }
8302 int id = newcrush.get_item_id(name);
8303
8304 if (!newcrush.check_item_loc(g_ceph_context, id, loc, (int *)NULL)) {
8305 if (id >= 0) {
8306 err = newcrush.create_or_move_item(g_ceph_context, id, 0, name, loc);
8307 } else {
8308 err = newcrush.move_bucket(g_ceph_context, id, loc);
8309 }
8310 if (err >= 0) {
8311 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
8312 pending_inc.crush.clear();
8313 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8314 getline(ss, rs);
8315 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8316 get_last_committed() + 1));
8317 return true;
8318 }
8319 } else {
8320 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
8321 err = 0;
8322 }
8323 } while (false);
31f18b77
FG
8324 } else if (prefix == "osd crush swap-bucket") {
8325 string source, dest, force;
8326 cmd_getval(g_ceph_context, cmdmap, "source", source);
8327 cmd_getval(g_ceph_context, cmdmap, "dest", dest);
8328 cmd_getval(g_ceph_context, cmdmap, "force", force);
8329 CrushWrapper newcrush;
8330 _get_pending_crush(newcrush);
8331 if (!newcrush.name_exists(source)) {
8332 ss << "source item " << source << " does not exist";
8333 err = -ENOENT;
8334 goto reply;
8335 }
8336 if (!newcrush.name_exists(dest)) {
8337 ss << "dest item " << dest << " does not exist";
8338 err = -ENOENT;
8339 goto reply;
8340 }
8341 int sid = newcrush.get_item_id(source);
8342 int did = newcrush.get_item_id(dest);
8343 int sparent;
8344 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 &&
8345 force != "--yes-i-really-mean-it") {
8346 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
8347 err = -EPERM;
8348 goto reply;
8349 }
8350 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
8351 force != "--yes-i-really-mean-it") {
8352 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
8353 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
8354 << "; pass --yes-i-really-mean-it to proceed anyway";
8355 err = -EPERM;
8356 goto reply;
8357 }
8358 int r = newcrush.swap_bucket(g_ceph_context, sid, did);
8359 if (r < 0) {
8360 ss << "failed to swap bucket contents: " << cpp_strerror(r);
224ce89b 8361 err = r;
31f18b77
FG
8362 goto reply;
8363 }
8364 ss << "swapped bucket of " << source << " to " << dest;
8365 pending_inc.crush.clear();
8366 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8367 wait_for_finished_proposal(op,
8368 new Monitor::C_Command(mon, op, err, ss.str(),
8369 get_last_committed() + 1));
8370 return true;
8371 } else if (prefix == "osd crush link") {
8372 // osd crush link <name> <loc1> [<loc2> ...]
8373 string name;
8374 cmd_getval(g_ceph_context, cmdmap, "name", name);
8375 vector<string> argvec;
8376 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
8377 map<string,string> loc;
8378 CrushWrapper::parse_loc_map(argvec, &loc);
8379
8380 // Need an explicit check for name_exists because get_item_id returns
8381 // 0 on unfound.
8382 int id = osdmap.crush->get_item_id(name);
7c673cae
FG
8383 if (!osdmap.crush->name_exists(name)) {
8384 err = -ENOENT;
8385 ss << "item " << name << " does not exist";
8386 goto reply;
8387 } else {
8388 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
8389 }
8390 if (osdmap.crush->check_item_loc(g_ceph_context, id, loc, (int*) NULL)) {
8391 ss << "no need to move item id " << id << " name '" << name
8392 << "' to location " << loc << " in crush map";
8393 err = 0;
8394 goto reply;
8395 }
8396
8397 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
8398 CrushWrapper newcrush;
8399 _get_pending_crush(newcrush);
8400
8401 if (!newcrush.name_exists(name)) {
8402 err = -ENOENT;
8403 ss << "item " << name << " does not exist";
8404 goto reply;
8405 } else {
8406 int id = newcrush.get_item_id(name);
8407 if (!newcrush.check_item_loc(g_ceph_context, id, loc, (int *)NULL)) {
8408 err = newcrush.link_bucket(g_ceph_context, id, loc);
8409 if (err >= 0) {
8410 ss << "linked item id " << id << " name '" << name
8411 << "' to location " << loc << " in crush map";
8412 pending_inc.crush.clear();
8413 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8414 } else {
8415 ss << "cannot link item id " << id << " name '" << name
8416 << "' to location " << loc;
8417 goto reply;
8418 }
8419 } else {
8420 ss << "no need to move item id " << id << " name '" << name
8421 << "' to location " << loc << " in crush map";
8422 err = 0;
8423 }
8424 }
8425 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
8426 get_last_committed() + 1));
8427 return true;
8428 } else if (prefix == "osd crush rm" ||
8429 prefix == "osd crush remove" ||
8430 prefix == "osd crush unlink") {
8431 do {
8432 // osd crush rm <id> [ancestor]
8433 CrushWrapper newcrush;
8434 _get_pending_crush(newcrush);
8435
8436 string name;
8437 cmd_getval(g_ceph_context, cmdmap, "name", name);
8438
8439 if (!osdmap.crush->name_exists(name)) {
8440 err = 0;
8441 ss << "device '" << name << "' does not appear in the crush map";
8442 break;
8443 }
8444 if (!newcrush.name_exists(name)) {
8445 err = 0;
8446 ss << "device '" << name << "' does not appear in the crush map";
8447 getline(ss, rs);
8448 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8449 get_last_committed() + 1));
8450 return true;
8451 }
8452 int id = newcrush.get_item_id(name);
31f18b77
FG
8453 int ancestor = 0;
8454
7c673cae
FG
8455 bool unlink_only = prefix == "osd crush unlink";
8456 string ancestor_str;
8457 if (cmd_getval(g_ceph_context, cmdmap, "ancestor", ancestor_str)) {
8458 if (!newcrush.name_exists(ancestor_str)) {
8459 err = -ENOENT;
8460 ss << "ancestor item '" << ancestor_str
8461 << "' does not appear in the crush map";
8462 break;
8463 }
31f18b77 8464 ancestor = newcrush.get_item_id(ancestor_str);
7c673cae 8465 }
31f18b77
FG
8466
8467 err = prepare_command_osd_crush_remove(
8468 newcrush,
8469 id, ancestor,
8470 (ancestor < 0), unlink_only);
8471
7c673cae
FG
8472 if (err == -ENOENT) {
8473 ss << "item " << id << " does not appear in that position";
8474 err = 0;
8475 break;
8476 }
8477 if (err == 0) {
7c673cae
FG
8478 ss << "removed item id " << id << " name '" << name << "' from crush map";
8479 getline(ss, rs);
8480 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8481 get_last_committed() + 1));
8482 return true;
8483 }
8484 } while (false);
8485
8486 } else if (prefix == "osd crush reweight-all") {
7c673cae
FG
8487 CrushWrapper newcrush;
8488 _get_pending_crush(newcrush);
8489
8490 newcrush.reweight(g_ceph_context);
8491 pending_inc.crush.clear();
8492 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8493 ss << "reweighted crush hierarchy";
8494 getline(ss, rs);
8495 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8496 get_last_committed() + 1));
8497 return true;
8498 } else if (prefix == "osd crush reweight") {
8499 // osd crush reweight <name> <weight>
8500 CrushWrapper newcrush;
8501 _get_pending_crush(newcrush);
8502
8503 string name;
8504 cmd_getval(g_ceph_context, cmdmap, "name", name);
8505 if (!newcrush.name_exists(name)) {
8506 err = -ENOENT;
8507 ss << "device '" << name << "' does not appear in the crush map";
8508 goto reply;
8509 }
8510
8511 int id = newcrush.get_item_id(name);
8512 if (id < 0) {
8513 ss << "device '" << name << "' is not a leaf in the crush map";
8514 err = -EINVAL;
8515 goto reply;
8516 }
8517 double w;
8518 if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
8519 ss << "unable to parse weight value '"
8520 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
8521 err = -EINVAL;
8522 goto reply;
8523 }
8524
8525 err = newcrush.adjust_item_weightf(g_ceph_context, id, w);
8526 if (err < 0)
8527 goto reply;
8528 pending_inc.crush.clear();
8529 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8530 ss << "reweighted item id " << id << " name '" << name << "' to " << w
8531 << " in crush map";
8532 getline(ss, rs);
8533 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8534 get_last_committed() + 1));
8535 return true;
8536 } else if (prefix == "osd crush reweight-subtree") {
8537 // osd crush reweight <name> <weight>
8538 CrushWrapper newcrush;
8539 _get_pending_crush(newcrush);
8540
8541 string name;
8542 cmd_getval(g_ceph_context, cmdmap, "name", name);
8543 if (!newcrush.name_exists(name)) {
8544 err = -ENOENT;
8545 ss << "device '" << name << "' does not appear in the crush map";
8546 goto reply;
8547 }
8548
8549 int id = newcrush.get_item_id(name);
8550 if (id >= 0) {
8551 ss << "device '" << name << "' is not a subtree in the crush map";
8552 err = -EINVAL;
8553 goto reply;
8554 }
8555 double w;
8556 if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
8557 ss << "unable to parse weight value '"
8558 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
8559 err = -EINVAL;
8560 goto reply;
8561 }
8562
8563 err = newcrush.adjust_subtree_weightf(g_ceph_context, id, w);
8564 if (err < 0)
8565 goto reply;
8566 pending_inc.crush.clear();
8567 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8568 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
8569 << " in crush map";
8570 getline(ss, rs);
8571 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8572 get_last_committed() + 1));
8573 return true;
8574 } else if (prefix == "osd crush tunables") {
8575 CrushWrapper newcrush;
8576 _get_pending_crush(newcrush);
8577
8578 err = 0;
8579 string profile;
8580 cmd_getval(g_ceph_context, cmdmap, "profile", profile);
8581 if (profile == "legacy" || profile == "argonaut") {
8582 newcrush.set_tunables_legacy();
8583 } else if (profile == "bobtail") {
8584 newcrush.set_tunables_bobtail();
8585 } else if (profile == "firefly") {
8586 newcrush.set_tunables_firefly();
8587 } else if (profile == "hammer") {
8588 newcrush.set_tunables_hammer();
8589 } else if (profile == "jewel") {
8590 newcrush.set_tunables_jewel();
8591 } else if (profile == "optimal") {
8592 newcrush.set_tunables_optimal();
8593 } else if (profile == "default") {
8594 newcrush.set_tunables_default();
8595 } else {
8596 ss << "unrecognized profile '" << profile << "'";
8597 err = -EINVAL;
8598 goto reply;
8599 }
8600
8601 if (!validate_crush_against_features(&newcrush, ss)) {
8602 err = -EINVAL;
8603 goto reply;
8604 }
8605
8606 pending_inc.crush.clear();
8607 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8608 ss << "adjusted tunables profile to " << profile;
8609 getline(ss, rs);
8610 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8611 get_last_committed() + 1));
8612 return true;
8613 } else if (prefix == "osd crush set-tunable") {
8614 CrushWrapper newcrush;
8615 _get_pending_crush(newcrush);
8616
8617 err = 0;
8618 string tunable;
8619 cmd_getval(g_ceph_context, cmdmap, "tunable", tunable);
8620
8621 int64_t value = -1;
8622 if (!cmd_getval(g_ceph_context, cmdmap, "value", value)) {
8623 err = -EINVAL;
8624 ss << "failed to parse integer value " << cmd_vartype_stringify(cmdmap["value"]);
8625 goto reply;
8626 }
8627
8628 if (tunable == "straw_calc_version") {
224ce89b 8629 if (value != 0 && value != 1) {
7c673cae
FG
8630 ss << "value must be 0 or 1; got " << value;
8631 err = -EINVAL;
8632 goto reply;
8633 }
8634 newcrush.set_straw_calc_version(value);
8635 } else {
8636 ss << "unrecognized tunable '" << tunable << "'";
8637 err = -EINVAL;
8638 goto reply;
8639 }
8640
8641 if (!validate_crush_against_features(&newcrush, ss)) {
8642 err = -EINVAL;
8643 goto reply;
8644 }
8645
8646 pending_inc.crush.clear();
8647 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8648 ss << "adjusted tunable " << tunable << " to " << value;
8649 getline(ss, rs);
8650 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8651 get_last_committed() + 1));
8652 return true;
8653
8654 } else if (prefix == "osd crush rule create-simple") {
8655 string name, root, type, mode;
8656 cmd_getval(g_ceph_context, cmdmap, "name", name);
8657 cmd_getval(g_ceph_context, cmdmap, "root", root);
8658 cmd_getval(g_ceph_context, cmdmap, "type", type);
8659 cmd_getval(g_ceph_context, cmdmap, "mode", mode);
8660 if (mode == "")
8661 mode = "firstn";
8662
8663 if (osdmap.crush->rule_exists(name)) {
31f18b77
FG
8664 // The name is uniquely associated to a ruleid and the rule it contains
8665 // From the user point of view, the rule is more meaningfull.
8666 ss << "rule " << name << " already exists";
7c673cae
FG
8667 err = 0;
8668 goto reply;
8669 }
8670
8671 CrushWrapper newcrush;
8672 _get_pending_crush(newcrush);
8673
8674 if (newcrush.rule_exists(name)) {
31f18b77
FG
8675 // The name is uniquely associated to a ruleid and the rule it contains
8676 // From the user point of view, the rule is more meaningfull.
8677 ss << "rule " << name << " already exists";
7c673cae
FG
8678 err = 0;
8679 } else {
224ce89b 8680 int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
7c673cae
FG
8681 pg_pool_t::TYPE_REPLICATED, &ss);
8682 if (ruleno < 0) {
8683 err = ruleno;
8684 goto reply;
8685 }
8686
8687 pending_inc.crush.clear();
8688 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8689 }
8690 getline(ss, rs);
8691 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8692 get_last_committed() + 1));
8693 return true;
8694
224ce89b
WB
8695 } else if (prefix == "osd crush rule create-replicated") {
8696 string name, root, type, device_class;
8697 cmd_getval(g_ceph_context, cmdmap, "name", name);
8698 cmd_getval(g_ceph_context, cmdmap, "root", root);
8699 cmd_getval(g_ceph_context, cmdmap, "type", type);
8700 cmd_getval(g_ceph_context, cmdmap, "class", device_class);
8701
8702 if (!device_class.empty()) {
8703 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
8704 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
8705 << "luminous' before using crush device classes";
8706 err = -EPERM;
8707 goto reply;
8708 }
8709 }
8710
8711 if (osdmap.crush->rule_exists(name)) {
8712 // The name is uniquely associated to a ruleid and the rule it contains
8713 // From the user point of view, the rule is more meaningfull.
8714 ss << "rule " << name << " already exists";
8715 err = 0;
8716 goto reply;
8717 }
8718
8719 CrushWrapper newcrush;
8720 _get_pending_crush(newcrush);
8721
8722 if (newcrush.rule_exists(name)) {
8723 // The name is uniquely associated to a ruleid and the rule it contains
8724 // From the user point of view, the rule is more meaningfull.
8725 ss << "rule " << name << " already exists";
8726 err = 0;
8727 } else {
8728 int ruleno = newcrush.add_simple_rule(
8729 name, root, type, device_class,
8730 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
8731 if (ruleno < 0) {
8732 err = ruleno;
8733 goto reply;
8734 }
8735
8736 pending_inc.crush.clear();
8737 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8738 }
8739 getline(ss, rs);
8740 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8741 get_last_committed() + 1));
8742 return true;
8743
7c673cae
FG
8744 } else if (prefix == "osd erasure-code-profile rm") {
8745 string name;
8746 cmd_getval(g_ceph_context, cmdmap, "name", name);
8747
8748 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
8749 goto wait;
8750
8751 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
8752 err = -EBUSY;
8753 goto reply;
8754 }
8755
8756 if (osdmap.has_erasure_code_profile(name) ||
8757 pending_inc.new_erasure_code_profiles.count(name)) {
8758 if (osdmap.has_erasure_code_profile(name)) {
8759 pending_inc.old_erasure_code_profiles.push_back(name);
8760 } else {
8761 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
8762 pending_inc.new_erasure_code_profiles.erase(name);
8763 }
8764
8765 getline(ss, rs);
8766 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8767 get_last_committed() + 1));
8768 return true;
8769 } else {
8770 ss << "erasure-code-profile " << name << " does not exist";
8771 err = 0;
8772 goto reply;
8773 }
8774
8775 } else if (prefix == "osd erasure-code-profile set") {
8776 string name;
8777 cmd_getval(g_ceph_context, cmdmap, "name", name);
8778 vector<string> profile;
8779 cmd_getval(g_ceph_context, cmdmap, "profile", profile);
8780 bool force;
8781 if (profile.size() > 0 && profile.back() == "--force") {
8782 profile.pop_back();
8783 force = true;
8784 } else {
8785 force = false;
8786 }
8787 map<string,string> profile_map;
8788 err = parse_erasure_code_profile(profile, &profile_map, &ss);
8789 if (err)
8790 goto reply;
8791 if (profile_map.find("plugin") == profile_map.end()) {
8792 ss << "erasure-code-profile " << profile_map
8793 << " must contain a plugin entry" << std::endl;
8794 err = -EINVAL;
8795 goto reply;
8796 }
8797 string plugin = profile_map["plugin"];
8798
8799 if (pending_inc.has_erasure_code_profile(name)) {
8800 dout(20) << "erasure code profile " << name << " try again" << dendl;
8801 goto wait;
8802 } else {
8803 if (plugin == "isa" || plugin == "lrc") {
8804 err = check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2, ss);
8805 if (err == -EAGAIN)
8806 goto wait;
8807 if (err)
8808 goto reply;
8809 } else if (plugin == "shec") {
8810 err = check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3, ss);
8811 if (err == -EAGAIN)
8812 goto wait;
8813 if (err)
8814 goto reply;
8815 }
8816 err = normalize_profile(name, profile_map, force, &ss);
8817 if (err)
8818 goto reply;
8819
8820 if (osdmap.has_erasure_code_profile(name)) {
8821 ErasureCodeProfile existing_profile_map =
8822 osdmap.get_erasure_code_profile(name);
8823 err = normalize_profile(name, existing_profile_map, force, &ss);
8824 if (err)
8825 goto reply;
8826
8827 if (existing_profile_map == profile_map) {
8828 err = 0;
8829 goto reply;
8830 }
8831 if (!force) {
8832 err = -EPERM;
8833 ss << "will not override erasure code profile " << name
8834 << " because the existing profile "
8835 << existing_profile_map
8836 << " is different from the proposed profile "
8837 << profile_map;
8838 goto reply;
8839 }
8840 }
8841
8842 dout(20) << "erasure code profile set " << name << "="
8843 << profile_map << dendl;
8844 pending_inc.set_erasure_code_profile(name, profile_map);
8845 }
8846
8847 getline(ss, rs);
8848 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8849 get_last_committed() + 1));
8850 return true;
8851
8852 } else if (prefix == "osd crush rule create-erasure") {
8853 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
8854 if (err == -EAGAIN)
8855 goto wait;
8856 if (err)
8857 goto reply;
8858 string name, poolstr;
8859 cmd_getval(g_ceph_context, cmdmap, "name", name);
8860 string profile;
8861 cmd_getval(g_ceph_context, cmdmap, "profile", profile);
8862 if (profile == "")
8863 profile = "default";
8864 if (profile == "default") {
8865 if (!osdmap.has_erasure_code_profile(profile)) {
8866 if (pending_inc.has_erasure_code_profile(profile)) {
8867 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
8868 goto wait;
8869 }
8870
8871 map<string,string> profile_map;
8872 err = osdmap.get_erasure_code_profile_default(g_ceph_context,
8873 profile_map,
8874 &ss);
8875 if (err)
8876 goto reply;
8877 err = normalize_profile(name, profile_map, true, &ss);
8878 if (err)
8879 goto reply;
8880 dout(20) << "erasure code profile set " << profile << "="
8881 << profile_map << dendl;
8882 pending_inc.set_erasure_code_profile(profile, profile_map);
8883 goto wait;
8884 }
8885 }
8886
31f18b77
FG
8887 int rule;
8888 err = crush_rule_create_erasure(name, profile, &rule, &ss);
7c673cae
FG
8889 if (err < 0) {
8890 switch(err) {
8891 case -EEXIST: // return immediately
8892 ss << "rule " << name << " already exists";
8893 err = 0;
8894 goto reply;
8895 break;
8896 case -EALREADY: // wait for pending to be proposed
8897 ss << "rule " << name << " already exists";
8898 err = 0;
8899 break;
8900 default: // non recoverable error
8901 goto reply;
8902 break;
8903 }
8904 } else {
31f18b77 8905 ss << "created rule " << name << " at " << rule;
7c673cae
FG
8906 }
8907
8908 getline(ss, rs);
8909 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8910 get_last_committed() + 1));
8911 return true;
8912
8913 } else if (prefix == "osd crush rule rm") {
8914 string name;
8915 cmd_getval(g_ceph_context, cmdmap, "name", name);
8916
8917 if (!osdmap.crush->rule_exists(name)) {
8918 ss << "rule " << name << " does not exist";
8919 err = 0;
8920 goto reply;
8921 }
8922
8923 CrushWrapper newcrush;
8924 _get_pending_crush(newcrush);
8925
8926 if (!newcrush.rule_exists(name)) {
8927 ss << "rule " << name << " does not exist";
8928 err = 0;
8929 } else {
8930 int ruleno = newcrush.get_rule_id(name);
8931 assert(ruleno >= 0);
8932
8933 // make sure it is not in use.
8934 // FIXME: this is ok in some situations, but let's not bother with that
8935 // complexity now.
8936 int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
3efd9988 8937 if (osdmap.crush_rule_in_use(ruleset)) {
7c673cae
FG
8938 ss << "crush ruleset " << name << " " << ruleset << " is in use";
8939 err = -EBUSY;
8940 goto reply;
8941 }
8942
8943 err = newcrush.remove_rule(ruleno);
8944 if (err < 0) {
8945 goto reply;
8946 }
8947
8948 pending_inc.crush.clear();
8949 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8950 }
8951 getline(ss, rs);
8952 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8953 get_last_committed() + 1));
8954 return true;
8955
b5b8bbf5
FG
8956 } else if (prefix == "osd crush rule rename") {
8957 string srcname;
8958 string dstname;
8959 cmd_getval(g_ceph_context, cmdmap, "srcname", srcname);
8960 cmd_getval(g_ceph_context, cmdmap, "dstname", dstname);
8961 if (srcname.empty() || dstname.empty()) {
8962 ss << "must specify both source rule name and destination rule name";
8963 err = -EINVAL;
8964 goto reply;
8965 }
8966 if (srcname == dstname) {
8967 ss << "destination rule name is equal to source rule name";
8968 err = 0;
8969 goto reply;
8970 }
8971
8972 CrushWrapper newcrush;
8973 _get_pending_crush(newcrush);
181888fb
FG
8974 if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
8975 // srcname does not exist and dstname already exists
8976 // suppose this is a replay and return success
8977 // (so this command is idempotent)
8978 ss << "already renamed to '" << dstname << "'";
8979 err = 0;
8980 goto reply;
8981 }
8982
b5b8bbf5
FG
8983 err = newcrush.rename_rule(srcname, dstname, &ss);
8984 if (err < 0) {
8985 // ss has reason for failure
8986 goto reply;
8987 }
8988 pending_inc.crush.clear();
8989 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8990 getline(ss, rs);
8991 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8992 get_last_committed() + 1));
8993 return true;
8994
7c673cae
FG
8995 } else if (prefix == "osd setmaxosd") {
8996 int64_t newmax;
8997 if (!cmd_getval(g_ceph_context, cmdmap, "newmax", newmax)) {
8998 ss << "unable to parse 'newmax' value '"
8999 << cmd_vartype_stringify(cmdmap["newmax"]) << "'";
9000 err = -EINVAL;
9001 goto reply;
9002 }
9003
9004 if (newmax > g_conf->mon_max_osd) {
9005 err = -ERANGE;
9006 ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
9007 << g_conf->mon_max_osd << ")";
9008 goto reply;
9009 }
9010
9011 // Don't allow shrinking OSD number as this will cause data loss
9012 // and may cause kernel crashes.
9013 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
9014 if (newmax < osdmap.get_max_osd()) {
9015 // Check if the OSDs exist between current max and new value.
9016 // If there are any OSDs exist, then don't allow shrinking number
9017 // of OSDs.
9018 for (int i = newmax; i < osdmap.get_max_osd(); i++) {
9019 if (osdmap.exists(i)) {
9020 err = -EBUSY;
9021 ss << "cannot shrink max_osd to " << newmax
9022 << " because osd." << i << " (and possibly others) still in use";
9023 goto reply;
9024 }
9025 }
9026 }
9027
9028 pending_inc.new_max_osd = newmax;
9029 ss << "set new max_osd = " << pending_inc.new_max_osd;
9030 getline(ss, rs);
9031 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9032 get_last_committed() + 1));
9033 return true;
9034
9035 } else if (prefix == "osd set-full-ratio" ||
9036 prefix == "osd set-backfillfull-ratio" ||
9037 prefix == "osd set-nearfull-ratio") {
31f18b77 9038 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
224ce89b
WB
9039 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
9040 << "luminous' before using the new interface";
7c673cae
FG
9041 err = -EPERM;
9042 goto reply;
9043 }
9044 double n;
9045 if (!cmd_getval(g_ceph_context, cmdmap, "ratio", n)) {
9046 ss << "unable to parse 'ratio' value '"
224ce89b 9047 << cmd_vartype_stringify(cmdmap["ratio"]) << "'";
7c673cae
FG
9048 err = -EINVAL;
9049 goto reply;
9050 }
9051 if (prefix == "osd set-full-ratio")
9052 pending_inc.new_full_ratio = n;
9053 else if (prefix == "osd set-backfillfull-ratio")
9054 pending_inc.new_backfillfull_ratio = n;
9055 else if (prefix == "osd set-nearfull-ratio")
9056 pending_inc.new_nearfull_ratio = n;
9057 ss << prefix << " " << n;
9058 getline(ss, rs);
9059 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9060 get_last_committed() + 1));
9061 return true;
9062 } else if (prefix == "osd set-require-min-compat-client") {
31f18b77 9063 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
224ce89b
WB
9064 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
9065 << "luminous' before using the new interface";
7c673cae
FG
9066 err = -EPERM;
9067 goto reply;
9068 }
9069 string v;
9070 cmd_getval(g_ceph_context, cmdmap, "version", v);
31f18b77
FG
9071 int vno = ceph_release_from_name(v.c_str());
9072 if (vno <= 0) {
7c673cae
FG
9073 ss << "version " << v << " is not recognized";
9074 err = -EINVAL;
9075 goto reply;
9076 }
9077 OSDMap newmap;
9078 newmap.deepish_copy_from(osdmap);
9079 newmap.apply_incremental(pending_inc);
31f18b77
FG
9080 newmap.require_min_compat_client = vno;
9081 auto mvno = newmap.get_min_compat_client();
9082 if (vno < mvno) {
9083 ss << "osdmap current utilizes features that require "
9084 << ceph_release_name(mvno)
9085 << "; cannot set require_min_compat_client below that to "
9086 << ceph_release_name(vno);
7c673cae
FG
9087 err = -EPERM;
9088 goto reply;
9089 }
31f18b77
FG
9090 string sure;
9091 cmd_getval(g_ceph_context, cmdmap, "sure", sure);
9092 if (sure != "--yes-i-really-mean-it") {
9093 FeatureMap m;
9094 mon->get_combined_feature_map(&m);
9095 uint64_t features = ceph_release_features(vno);
9096 bool first = true;
9097 bool ok = true;
9098 for (int type : {
9099 CEPH_ENTITY_TYPE_CLIENT,
9100 CEPH_ENTITY_TYPE_MDS,
9101 CEPH_ENTITY_TYPE_MGR }) {
9102 auto p = m.m.find(type);
9103 if (p == m.m.end()) {
9104 continue;
9105 }
9106 for (auto& q : p->second) {
9107 uint64_t missing = ~q.first & features;
9108 if (missing) {
9109 if (first) {
9110 ss << "cannot set require_min_compat_client to " << v << ": ";
9111 } else {
9112 ss << "; ";
9113 }
9114 first = false;
9115 ss << q.second << " connected " << ceph_entity_type_name(type)
9116 << "(s) look like " << ceph_release_name(
9117 ceph_release_from_features(q.first))
9118 << " (missing 0x" << std::hex << missing << std::dec << ")";
9119 ok = false;
9120 }
9121 }
9122 }
9123 if (!ok) {
9124 ss << "; add --yes-i-really-mean-it to do it anyway";
9125 err = -EPERM;
9126 goto reply;
9127 }
9128 }
9129 ss << "set require_min_compat_client to " << ceph_release_name(vno);
9130 pending_inc.new_require_min_compat_client = vno;
7c673cae
FG
9131 getline(ss, rs);
9132 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9133 get_last_committed() + 1));
9134 return true;
9135 } else if (prefix == "osd pause") {
9136 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
9137
9138 } else if (prefix == "osd unpause") {
9139 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
9140
9141 } else if (prefix == "osd set") {
3efd9988
FG
9142 string sure;
9143 cmd_getval(g_ceph_context, cmdmap, "sure", sure);
7c673cae
FG
9144 string key;
9145 cmd_getval(g_ceph_context, cmdmap, "key", key);
9146 if (key == "full")
9147 return prepare_set_flag(op, CEPH_OSDMAP_FULL);
9148 else if (key == "pause")
9149 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
9150 else if (key == "noup")
9151 return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
9152 else if (key == "nodown")
9153 return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
9154 else if (key == "noout")
9155 return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
9156 else if (key == "noin")
9157 return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
9158 else if (key == "nobackfill")
9159 return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
9160 else if (key == "norebalance")
9161 return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
9162 else if (key == "norecover")
9163 return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
9164 else if (key == "noscrub")
9165 return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
9166 else if (key == "nodeep-scrub")
9167 return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
9168 else if (key == "notieragent")
9169 return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
9170 else if (key == "sortbitwise") {
3efd9988
FG
9171 if (!osdmap.get_num_up_osds() && sure != "--yes-i-really-mean-it") {
9172 ss << "Not advisable to continue since no OSDs are up. Pass "
9173 << "--yes-i-really-mean-it if you really wish to continue.";
9174 err = -EPERM;
9175 goto reply;
9176 }
9177 if ((osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)
9178 || sure == "--yes-i-really-mean-it") {
7c673cae
FG
9179 return prepare_set_flag(op, CEPH_OSDMAP_SORTBITWISE);
9180 } else {
9181 ss << "not all up OSDs have OSD_BITWISE_HOBJ_SORT feature";
9182 err = -EPERM;
31f18b77 9183 goto reply;
7c673cae 9184 }
c07f9fc5 9185 } else if (key == "recovery_deletes") {
3efd9988
FG
9186 if (!osdmap.get_num_up_osds() && sure != "--yes-i-really-mean-it") {
9187 ss << "Not advisable to continue since no OSDs are up. Pass "
9188 << "--yes-i-really-mean-it if you really wish to continue.";
9189 err = -EPERM;
9190 goto reply;
9191 }
9192 if (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_RECOVERY_DELETES)
9193 || sure == "--yes-i-really-mean-it") {
c07f9fc5
FG
9194 return prepare_set_flag(op, CEPH_OSDMAP_RECOVERY_DELETES);
9195 } else {
9196 ss << "not all up OSDs have OSD_RECOVERY_DELETES feature";
9197 err = -EPERM;
9198 goto reply;
9199 }
7c673cae 9200 } else if (key == "require_jewel_osds") {
3efd9988
FG
9201 if (!osdmap.get_num_up_osds() && sure != "--yes-i-really-mean-it") {
9202 ss << "Not advisable to continue since no OSDs are up. Pass "
9203 << "--yes-i-really-mean-it if you really wish to continue.";
9204 err = -EPERM;
9205 goto reply;
9206 }
7c673cae
FG
9207 if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
9208 ss << "the sortbitwise flag must be set before require_jewel_osds";
9209 err = -EPERM;
31f18b77
FG
9210 goto reply;
9211 } else if (osdmap.require_osd_release >= CEPH_RELEASE_JEWEL) {
9212 ss << "require_osd_release is already >= jewel";
9213 err = 0;
9214 goto reply;
3efd9988
FG
9215 } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_JEWEL)
9216 || sure == "--yes-i-really-mean-it") {
7c673cae
FG
9217 return prepare_set_flag(op, CEPH_OSDMAP_REQUIRE_JEWEL);
9218 } else {
9219 ss << "not all up OSDs have CEPH_FEATURE_SERVER_JEWEL feature";
9220 err = -EPERM;
9221 }
9222 } else if (key == "require_kraken_osds") {
3efd9988
FG
9223 if (!osdmap.get_num_up_osds() && sure != "--yes-i-really-mean-it") {
9224 ss << "Not advisable to continue since no OSDs are up. Pass "
9225 << "--yes-i-really-mean-it if you really wish to continue.";
9226 err = -EPERM;
9227 goto reply;
9228 }
7c673cae
FG
9229 if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
9230 ss << "the sortbitwise flag must be set before require_kraken_osds";
9231 err = -EPERM;
31f18b77
FG
9232 goto reply;
9233 } else if (osdmap.require_osd_release >= CEPH_RELEASE_KRAKEN) {
9234 ss << "require_osd_release is already >= kraken";
9235 err = 0;
9236 goto reply;
3efd9988
FG
9237 } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_KRAKEN)
9238 || sure == "--yes-i-really-mean-it") {
7c673cae
FG
9239 bool r = prepare_set_flag(op, CEPH_OSDMAP_REQUIRE_KRAKEN);
9240 // ensure JEWEL is also set
9241 pending_inc.new_flags |= CEPH_OSDMAP_REQUIRE_JEWEL;
9242 return r;
9243 } else {
9244 ss << "not all up OSDs have CEPH_FEATURE_SERVER_KRAKEN feature";
9245 err = -EPERM;
9246 }
7c673cae
FG
9247 } else {
9248 ss << "unrecognized flag '" << key << "'";
9249 err = -EINVAL;
9250 }
9251
9252 } else if (prefix == "osd unset") {
9253 string key;
9254 cmd_getval(g_ceph_context, cmdmap, "key", key);
9255 if (key == "full")
9256 return prepare_unset_flag(op, CEPH_OSDMAP_FULL);
9257 else if (key == "pause")
9258 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
9259 else if (key == "noup")
9260 return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
9261 else if (key == "nodown")
9262 return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
9263 else if (key == "noout")
9264 return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
9265 else if (key == "noin")
9266 return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
9267 else if (key == "nobackfill")
9268 return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
9269 else if (key == "norebalance")
9270 return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
9271 else if (key == "norecover")
9272 return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
9273 else if (key == "noscrub")
9274 return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
9275 else if (key == "nodeep-scrub")
9276 return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
9277 else if (key == "notieragent")
9278 return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
224ce89b 9279 else {
7c673cae
FG
9280 ss << "unrecognized flag '" << key << "'";
9281 err = -EINVAL;
9282 }
9283
31f18b77
FG
9284 } else if (prefix == "osd require-osd-release") {
9285 string release;
9286 cmd_getval(g_ceph_context, cmdmap, "release", release);
3efd9988
FG
9287 string sure;
9288 cmd_getval(g_ceph_context, cmdmap, "sure", sure);
31f18b77
FG
9289 if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
9290 ss << "the sortbitwise flag must be set first";
9291 err = -EPERM;
9292 goto reply;
9293 }
9294 int rel = ceph_release_from_name(release.c_str());
9295 if (rel <= 0) {
9296 ss << "unrecognized release " << release;
9297 err = -EINVAL;
9298 goto reply;
9299 }
9300 if (rel < CEPH_RELEASE_LUMINOUS) {
9301 ss << "use this command only for luminous and later";
9302 err = -EINVAL;
9303 goto reply;
9304 }
d2e6a577
FG
9305 if (rel == osdmap.require_osd_release) {
9306 // idempotent
9307 err = 0;
9308 goto reply;
9309 }
31f18b77
FG
9310 if (rel == CEPH_RELEASE_LUMINOUS) {
9311 if (!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_LUMINOUS)) {
9312 ss << "not all up OSDs have CEPH_FEATURE_SERVER_LUMINOUS feature";
9313 err = -EPERM;
9314 goto reply;
9315 }
9316 } else {
9317 ss << "not supported for this release yet";
9318 err = -EPERM;
9319 goto reply;
9320 }
9321 if (rel < osdmap.require_osd_release) {
9322 ss << "require_osd_release cannot be lowered once it has been set";
9323 err = -EPERM;
9324 goto reply;
9325 }
9326 pending_inc.new_require_osd_release = rel;
c07f9fc5
FG
9327 if (rel >= CEPH_RELEASE_LUMINOUS &&
9328 !osdmap.test_flag(CEPH_OSDMAP_RECOVERY_DELETES)) {
9329 return prepare_set_flag(op, CEPH_OSDMAP_RECOVERY_DELETES);
9330 }
31f18b77 9331 goto update;
7c673cae
FG
9332 } else if (prefix == "osd cluster_snap") {
9333 // ** DISABLE THIS FOR NOW **
9334 ss << "cluster snapshot currently disabled (broken implementation)";
9335 // ** DISABLE THIS FOR NOW **
9336
9337 } else if (prefix == "osd down" ||
9338 prefix == "osd out" ||
9339 prefix == "osd in" ||
9340 prefix == "osd rm") {
9341
9342 bool any = false;
31f18b77
FG
9343 bool stop = false;
9344 bool verbose = true;
7c673cae
FG
9345
9346 vector<string> idvec;
9347 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
31f18b77
FG
9348 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9349 set<int> osds;
9350
9351 // wildcard?
9352 if (j == 0 &&
9353 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9354 if (prefix == "osd in") {
9355 // touch out osds only
9356 osdmap.get_out_osds(osds);
9357 } else {
9358 osdmap.get_all_osds(osds);
9359 }
9360 stop = true;
9361 verbose = false; // so the output is less noisy.
9362 } else {
9363 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9364 if (osd < 0) {
9365 ss << "invalid osd id" << osd;
9366 err = -EINVAL;
9367 continue;
9368 } else if (!osdmap.exists(osd)) {
9369 ss << "osd." << osd << " does not exist. ";
9370 continue;
9371 }
9372
9373 osds.insert(osd);
7c673cae 9374 }
31f18b77
FG
9375
9376 for (auto &osd : osds) {
9377 if (prefix == "osd down") {
9378 if (osdmap.is_down(osd)) {
9379 if (verbose)
9380 ss << "osd." << osd << " is already down. ";
9381 } else {
9382 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
9383 ss << "marked down osd." << osd << ". ";
9384 any = true;
9385 }
9386 } else if (prefix == "osd out") {
9387 if (osdmap.is_out(osd)) {
9388 if (verbose)
9389 ss << "osd." << osd << " is already out. ";
9390 } else {
9391 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
9392 if (osdmap.osd_weight[osd]) {
9393 if (pending_inc.new_xinfo.count(osd) == 0) {
9394 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
9395 }
9396 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
7c673cae 9397 }
31f18b77 9398 ss << "marked out osd." << osd << ". ";
224ce89b
WB
9399 std::ostringstream msg;
9400 msg << "Client " << op->get_session()->entity_name
9401 << " marked osd." << osd << " out";
9402 if (osdmap.is_up(osd)) {
9403 msg << ", while it was still marked up";
9404 } else {
3efd9988
FG
9405 auto period = ceph_clock_now() - down_pending_out[osd];
9406 msg << ", after it was down for " << int(period.sec())
224ce89b
WB
9407 << " seconds";
9408 }
9409
9410 mon->clog->info() << msg.str();
31f18b77 9411 any = true;
7c673cae 9412 }
31f18b77
FG
9413 } else if (prefix == "osd in") {
9414 if (osdmap.is_in(osd)) {
9415 if (verbose)
9416 ss << "osd." << osd << " is already in. ";
9417 } else {
9418 if (osdmap.osd_xinfo[osd].old_weight > 0) {
9419 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
9420 if (pending_inc.new_xinfo.count(osd) == 0) {
9421 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
9422 }
9423 pending_inc.new_xinfo[osd].old_weight = 0;
9424 } else {
9425 pending_inc.new_weight[osd] = CEPH_OSD_IN;
7c673cae 9426 }
31f18b77
FG
9427 ss << "marked in osd." << osd << ". ";
9428 any = true;
9429 }
9430 } else if (prefix == "osd rm") {
9431 err = prepare_command_osd_remove(osd);
9432
9433 if (err == -EBUSY) {
9434 if (any)
9435 ss << ", ";
9436 ss << "osd." << osd << " is still up; must be down before removal. ";
7c673cae 9437 } else {
31f18b77
FG
9438 assert(err == 0);
9439 if (any) {
9440 ss << ", osd." << osd;
9441 } else {
9442 ss << "removed osd." << osd;
9443 }
9444 any = true;
7c673cae 9445 }
31f18b77
FG
9446 }
9447 }
9448 }
9449 if (any) {
9450 getline(ss, rs);
9451 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
9452 get_last_committed() + 1));
9453 return true;
9454 }
9455 } else if (prefix == "osd add-noup" ||
9456 prefix == "osd add-nodown" ||
9457 prefix == "osd add-noin" ||
9458 prefix == "osd add-noout") {
9459
9460 enum {
9461 OP_NOUP,
9462 OP_NODOWN,
9463 OP_NOIN,
9464 OP_NOOUT,
9465 } option;
9466
9467 if (prefix == "osd add-noup") {
9468 option = OP_NOUP;
9469 } else if (prefix == "osd add-nodown") {
9470 option = OP_NODOWN;
9471 } else if (prefix == "osd add-noin") {
9472 option = OP_NOIN;
9473 } else {
9474 option = OP_NOOUT;
9475 }
9476
9477 bool any = false;
9478 bool stop = false;
9479
9480 vector<string> idvec;
9481 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
9482 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9483
9484 set<int> osds;
9485
9486 // wildcard?
9487 if (j == 0 &&
9488 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9489 osdmap.get_all_osds(osds);
9490 stop = true;
9491 } else {
9492 // try traditional single osd way
9493
9494 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9495 if (osd < 0) {
9496 // ss has reason for failure
9497 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9498 err = -EINVAL;
9499 continue;
9500 }
9501
9502 osds.insert(osd);
9503 }
9504
9505 for (auto &osd : osds) {
9506
9507 if (!osdmap.exists(osd)) {
9508 ss << "osd." << osd << " does not exist. ";
9509 continue;
9510 }
9511
9512 switch (option) {
9513 case OP_NOUP:
9514 if (osdmap.is_up(osd)) {
9515 ss << "osd." << osd << " is already up. ";
9516 continue;
9517 }
9518
9519 if (osdmap.is_noup(osd)) {
9520 if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP))
9521 any = true;
7c673cae 9522 } else {
31f18b77
FG
9523 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
9524 any = true;
7c673cae 9525 }
31f18b77
FG
9526
9527 break;
9528
9529 case OP_NODOWN:
9530 if (osdmap.is_down(osd)) {
9531 ss << "osd." << osd << " is already down. ";
9532 continue;
9533 }
9534
9535 if (osdmap.is_nodown(osd)) {
9536 if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN))
9537 any = true;
9538 } else {
9539 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
9540 any = true;
9541 }
9542
9543 break;
9544
9545 case OP_NOIN:
9546 if (osdmap.is_in(osd)) {
9547 ss << "osd." << osd << " is already in. ";
9548 continue;
9549 }
9550
9551 if (osdmap.is_noin(osd)) {
9552 if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN))
9553 any = true;
9554 } else {
9555 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
9556 any = true;
9557 }
9558
9559 break;
9560
9561 case OP_NOOUT:
9562 if (osdmap.is_out(osd)) {
9563 ss << "osd." << osd << " is already out. ";
9564 continue;
9565 }
9566
9567 if (osdmap.is_noout(osd)) {
9568 if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT))
9569 any = true;
9570 } else {
9571 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
9572 any = true;
9573 }
9574
9575 break;
9576
9577 default:
9578 assert(0 == "invalid option");
9579 }
7c673cae
FG
9580 }
9581 }
31f18b77 9582
7c673cae
FG
9583 if (any) {
9584 getline(ss, rs);
9585 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
31f18b77
FG
9586 get_last_committed() + 1));
9587 return true;
9588 }
9589 } else if (prefix == "osd rm-noup" ||
9590 prefix == "osd rm-nodown" ||
9591 prefix == "osd rm-noin" ||
9592 prefix == "osd rm-noout") {
9593
9594 enum {
9595 OP_NOUP,
9596 OP_NODOWN,
9597 OP_NOIN,
9598 OP_NOOUT,
9599 } option;
9600
9601 if (prefix == "osd rm-noup") {
9602 option = OP_NOUP;
9603 } else if (prefix == "osd rm-nodown") {
9604 option = OP_NODOWN;
9605 } else if (prefix == "osd rm-noin") {
9606 option = OP_NOIN;
9607 } else {
9608 option = OP_NOOUT;
9609 }
9610
9611 bool any = false;
9612 bool stop = false;
9613
9614 vector<string> idvec;
9615 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
9616
9617 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9618
9619 vector<int> osds;
9620
9621 // wildcard?
9622 if (j == 0 &&
9623 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9624
9625 // touch previous noup/nodown/noin/noout osds only
9626 switch (option) {
9627 case OP_NOUP:
9628 osdmap.get_noup_osds(&osds);
9629 break;
9630 case OP_NODOWN:
9631 osdmap.get_nodown_osds(&osds);
9632 break;
9633 case OP_NOIN:
9634 osdmap.get_noin_osds(&osds);
9635 break;
9636 case OP_NOOUT:
9637 osdmap.get_noout_osds(&osds);
9638 break;
9639 default:
9640 assert(0 == "invalid option");
9641 }
9642
9643 // cancel any pending noup/nodown/noin/noout requests too
9644 vector<int> pending_state_osds;
9645 (void) pending_inc.get_pending_state_osds(&pending_state_osds);
9646 for (auto &p : pending_state_osds) {
9647
9648 switch (option) {
9649 case OP_NOUP:
9650 if (!osdmap.is_noup(p) &&
9651 pending_inc.pending_osd_state_clear(p, CEPH_OSD_NOUP)) {
9652 any = true;
9653 }
9654 break;
9655
9656 case OP_NODOWN:
9657 if (!osdmap.is_nodown(p) &&
9658 pending_inc.pending_osd_state_clear(p, CEPH_OSD_NODOWN)) {
9659 any = true;
9660 }
9661 break;
9662
9663 case OP_NOIN:
9664 if (!osdmap.is_noin(p) &&
9665 pending_inc.pending_osd_state_clear(p, CEPH_OSD_NOIN)) {
9666 any = true;
9667 }
9668 break;
9669
9670 case OP_NOOUT:
9671 if (!osdmap.is_noout(p) &&
9672 pending_inc.pending_osd_state_clear(p, CEPH_OSD_NOOUT)) {
9673 any = true;
9674 }
9675 break;
9676
9677 default:
9678 assert(0 == "invalid option");
9679 }
9680 }
9681
9682 stop = true;
9683 } else {
9684 // try traditional single osd way
9685
9686 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9687 if (osd < 0) {
9688 // ss has reason for failure
9689 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9690 err = -EINVAL;
9691 continue;
9692 }
9693
9694 osds.push_back(osd);
9695 }
9696
9697 for (auto &osd : osds) {
9698
9699 if (!osdmap.exists(osd)) {
9700 ss << "osd." << osd << " does not exist. ";
9701 continue;
9702 }
9703
9704 switch (option) {
9705 case OP_NOUP:
9706 if (osdmap.is_noup(osd)) {
9707 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
9708 any = true;
9709 } else if (pending_inc.pending_osd_state_clear(
9710 osd, CEPH_OSD_NOUP)) {
9711 any = true;
9712 }
9713 break;
9714
9715 case OP_NODOWN:
9716 if (osdmap.is_nodown(osd)) {
9717 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
9718 any = true;
9719 } else if (pending_inc.pending_osd_state_clear(
9720 osd, CEPH_OSD_NODOWN)) {
9721 any = true;
9722 }
9723 break;
9724
9725 case OP_NOIN:
9726 if (osdmap.is_noin(osd)) {
9727 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
9728 any = true;
9729 } else if (pending_inc.pending_osd_state_clear(
9730 osd, CEPH_OSD_NOIN)) {
9731 any = true;
9732 }
9733 break;
9734
9735 case OP_NOOUT:
9736 if (osdmap.is_noout(osd)) {
9737 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
9738 any = true;
9739 } else if (pending_inc.pending_osd_state_clear(
9740 osd, CEPH_OSD_NOOUT)) {
9741 any = true;
9742 }
9743 break;
9744
9745 default:
9746 assert(0 == "invalid option");
9747 }
9748 }
9749 }
9750
9751 if (any) {
9752 getline(ss, rs);
9753 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
9754 get_last_committed() + 1));
7c673cae
FG
9755 return true;
9756 }
9757 } else if (prefix == "osd pg-temp") {
9758 string pgidstr;
9759 if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
9760 ss << "unable to parse 'pgid' value '"
9761 << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
9762 err = -EINVAL;
9763 goto reply;
9764 }
9765 pg_t pgid;
9766 if (!pgid.parse(pgidstr.c_str())) {
9767 ss << "invalid pgid '" << pgidstr << "'";
9768 err = -EINVAL;
9769 goto reply;
9770 }
9771 if (!osdmap.pg_exists(pgid)) {
9772 ss << "pg " << pgid << " does not exist";
9773 err = -ENOENT;
9774 goto reply;
9775 }
9776 if (pending_inc.new_pg_temp.count(pgid)) {
9777 dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
9778 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9779 return true;
9780 }
9781
9782 vector<int64_t> id_vec;
9783 vector<int32_t> new_pg_temp;
9784 if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
9785 ss << "unable to parse 'id' value(s) '"
9786 << cmd_vartype_stringify(cmdmap["id"]) << "'";
9787 err = -EINVAL;
9788 goto reply;
9789 }
9790 for (auto osd : id_vec) {
9791 if (!osdmap.exists(osd)) {
9792 ss << "osd." << osd << " does not exist";
9793 err = -ENOENT;
9794 goto reply;
9795 }
9796 new_pg_temp.push_back(osd);
9797 }
9798
224ce89b
WB
9799 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
9800 if ((int)new_pg_temp.size() < pool_min_size) {
9801 ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
9802 << pool_min_size << ")";
9803 err = -EINVAL;
9804 goto reply;
9805 }
9806
9807 int pool_size = osdmap.get_pg_pool_size(pgid);
9808 if ((int)new_pg_temp.size() > pool_size) {
9809 ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
9810 << pool_size << ")";
9811 err = -EINVAL;
9812 goto reply;
9813 }
9814
7c673cae
FG
9815 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
9816 new_pg_temp.begin(), new_pg_temp.end());
9817 ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
9818 goto update;
9819 } else if (prefix == "osd primary-temp") {
9820 string pgidstr;
9821 if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
9822 ss << "unable to parse 'pgid' value '"
9823 << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
9824 err = -EINVAL;
9825 goto reply;
9826 }
9827 pg_t pgid;
9828 if (!pgid.parse(pgidstr.c_str())) {
9829 ss << "invalid pgid '" << pgidstr << "'";
9830 err = -EINVAL;
9831 goto reply;
9832 }
9833 if (!osdmap.pg_exists(pgid)) {
9834 ss << "pg " << pgid << " does not exist";
9835 err = -ENOENT;
9836 goto reply;
9837 }
9838
9839 int64_t osd;
9840 if (!cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
9841 ss << "unable to parse 'id' value '"
9842 << cmd_vartype_stringify(cmdmap["id"]) << "'";
9843 err = -EINVAL;
9844 goto reply;
9845 }
9846 if (osd != -1 && !osdmap.exists(osd)) {
9847 ss << "osd." << osd << " does not exist";
9848 err = -ENOENT;
9849 goto reply;
9850 }
9851
31f18b77
FG
9852 if (osdmap.require_min_compat_client > 0 &&
9853 osdmap.require_min_compat_client < CEPH_RELEASE_FIREFLY) {
9854 ss << "require_min_compat_client "
9855 << ceph_release_name(osdmap.require_min_compat_client)
7c673cae
FG
9856 << " < firefly, which is required for primary-temp";
9857 err = -EPERM;
9858 goto reply;
9859 } else if (!g_conf->mon_osd_allow_primary_temp) {
9860 ss << "you must enable 'mon osd allow primary temp = true' on the mons before you can set primary_temp mappings. note that this is for developers only: older clients/OSDs will break and there is no feature bit infrastructure in place.";
9861 err = -EPERM;
9862 goto reply;
9863 }
9864
9865 pending_inc.new_primary_temp[pgid] = osd;
9866 ss << "set " << pgid << " primary_temp mapping to " << osd;
9867 goto update;
224ce89b
WB
9868 } else if (prefix == "osd pg-upmap" ||
9869 prefix == "osd rm-pg-upmap" ||
9870 prefix == "osd pg-upmap-items" ||
9871 prefix == "osd rm-pg-upmap-items") {
31f18b77 9872 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
224ce89b
WB
9873 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
9874 << "luminous' before using the new interface";
7c673cae
FG
9875 err = -EPERM;
9876 goto reply;
9877 }
31f18b77
FG
9878 if (osdmap.require_min_compat_client < CEPH_RELEASE_LUMINOUS) {
9879 ss << "min_compat_client "
9880 << ceph_release_name(osdmap.require_min_compat_client)
224ce89b
WB
9881 << " < luminous, which is required for pg-upmap. "
9882 << "Try 'ceph osd set-require-min-compat-client luminous' "
9883 << "before using the new interface";
7c673cae
FG
9884 err = -EPERM;
9885 goto reply;
9886 }
9887 err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
9888 if (err == -EAGAIN)
9889 goto wait;
9890 if (err < 0)
9891 goto reply;
9892 string pgidstr;
9893 if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
9894 ss << "unable to parse 'pgid' value '"
9895 << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
9896 err = -EINVAL;
9897 goto reply;
9898 }
9899 pg_t pgid;
9900 if (!pgid.parse(pgidstr.c_str())) {
9901 ss << "invalid pgid '" << pgidstr << "'";
9902 err = -EINVAL;
9903 goto reply;
9904 }
9905 if (!osdmap.pg_exists(pgid)) {
9906 ss << "pg " << pgid << " does not exist";
9907 err = -ENOENT;
9908 goto reply;
9909 }
94b18763
FG
9910 if (pending_inc.old_pools.count(pgid.pool())) {
9911 ss << "pool of " << pgid << " is pending removal";
9912 err = -ENOENT;
9913 getline(ss, rs);
9914 wait_for_finished_proposal(op,
9915 new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
9916 return true;
9917 }
224ce89b
WB
9918
9919 enum {
9920 OP_PG_UPMAP,
9921 OP_RM_PG_UPMAP,
9922 OP_PG_UPMAP_ITEMS,
9923 OP_RM_PG_UPMAP_ITEMS,
9924 } option;
9925
9926 if (prefix == "osd pg-upmap") {
9927 option = OP_PG_UPMAP;
9928 } else if (prefix == "osd rm-pg-upmap") {
9929 option = OP_RM_PG_UPMAP;
9930 } else if (prefix == "osd pg-upmap-items") {
9931 option = OP_PG_UPMAP_ITEMS;
9932 } else {
9933 option = OP_RM_PG_UPMAP_ITEMS;
7c673cae 9934 }
224ce89b
WB
9935
9936 // check pending upmap changes
9937 switch (option) {
9938 case OP_PG_UPMAP: // fall through
9939 case OP_RM_PG_UPMAP:
9940 if (pending_inc.new_pg_upmap.count(pgid) ||
9941 pending_inc.old_pg_upmap.count(pgid)) {
9942 dout(10) << __func__ << " waiting for pending update on "
9943 << pgid << dendl;
9944 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9945 return true;
7c673cae 9946 }
224ce89b 9947 break;
7c673cae 9948
224ce89b
WB
9949 case OP_PG_UPMAP_ITEMS: // fall through
9950 case OP_RM_PG_UPMAP_ITEMS:
9951 if (pending_inc.new_pg_upmap_items.count(pgid) ||
9952 pending_inc.old_pg_upmap_items.count(pgid)) {
9953 dout(10) << __func__ << " waiting for pending update on "
9954 << pgid << dendl;
9955 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9956 return true;
9957 }
9958 break;
7c673cae 9959
224ce89b
WB
9960 default:
9961 assert(0 == "invalid option");
7c673cae 9962 }
224ce89b
WB
9963
9964 switch (option) {
9965 case OP_PG_UPMAP:
9966 {
9967 vector<int64_t> id_vec;
9968 if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
9969 ss << "unable to parse 'id' value(s) '"
9970 << cmd_vartype_stringify(cmdmap["id"]) << "'";
9971 err = -EINVAL;
9972 goto reply;
9973 }
9974
9975 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
9976 if ((int)id_vec.size() < pool_min_size) {
9977 ss << "num of osds (" << id_vec.size() <<") < pool min size ("
9978 << pool_min_size << ")";
9979 err = -EINVAL;
9980 goto reply;
9981 }
9982
9983 int pool_size = osdmap.get_pg_pool_size(pgid);
9984 if ((int)id_vec.size() > pool_size) {
9985 ss << "num of osds (" << id_vec.size() <<") > pool size ("
9986 << pool_size << ")";
9987 err = -EINVAL;
9988 goto reply;
9989 }
9990
9991 vector<int32_t> new_pg_upmap;
9992 for (auto osd : id_vec) {
9993 if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
9994 ss << "osd." << osd << " does not exist";
9995 err = -ENOENT;
9996 goto reply;
9997 }
9998 auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
9999 if (it != new_pg_upmap.end()) {
10000 ss << "osd." << osd << " already exists, ";
10001 continue;
10002 }
10003 new_pg_upmap.push_back(osd);
10004 }
10005
10006 if (new_pg_upmap.empty()) {
10007 ss << "no valid upmap items(pairs) is specified";
10008 err = -EINVAL;
10009 goto reply;
10010 }
10011
10012 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
10013 new_pg_upmap.begin(), new_pg_upmap.end());
10014 ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
7c673cae 10015 }
224ce89b
WB
10016 break;
10017
10018 case OP_RM_PG_UPMAP:
10019 {
10020 pending_inc.old_pg_upmap.insert(pgid);
10021 ss << "clear " << pgid << " pg_upmap mapping";
7c673cae 10022 }
224ce89b 10023 break;
7c673cae 10024
224ce89b
WB
10025 case OP_PG_UPMAP_ITEMS:
10026 {
10027 vector<int64_t> id_vec;
10028 if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
10029 ss << "unable to parse 'id' value(s) '"
10030 << cmd_vartype_stringify(cmdmap["id"]) << "'";
10031 err = -EINVAL;
10032 goto reply;
10033 }
10034
10035 if (id_vec.size() % 2) {
10036 ss << "you must specify pairs of osd ids to be remapped";
10037 err = -EINVAL;
10038 goto reply;
10039 }
10040
10041 int pool_size = osdmap.get_pg_pool_size(pgid);
10042 if ((int)(id_vec.size() / 2) > pool_size) {
10043 ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
10044 << pool_size << ")";
10045 err = -EINVAL;
10046 goto reply;
10047 }
10048
10049 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
10050 ostringstream items;
10051 items << "[";
10052 for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
10053 int from = *p++;
10054 int to = *p;
10055 if (from == to) {
10056 ss << "from osd." << from << " == to osd." << to << ", ";
10057 continue;
10058 }
10059 if (!osdmap.exists(from)) {
10060 ss << "osd." << from << " does not exist";
10061 err = -ENOENT;
10062 goto reply;
10063 }
10064 if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
10065 ss << "osd." << to << " does not exist";
10066 err = -ENOENT;
10067 goto reply;
10068 }
c07f9fc5
FG
10069 pair<int32_t,int32_t> entry = make_pair(from, to);
10070 auto it = std::find(new_pg_upmap_items.begin(),
10071 new_pg_upmap_items.end(), entry);
10072 if (it != new_pg_upmap_items.end()) {
10073 ss << "osd." << from << " -> osd." << to << " already exists, ";
10074 continue;
10075 }
10076 new_pg_upmap_items.push_back(entry);
224ce89b
WB
10077 items << from << "->" << to << ",";
10078 }
10079 string out(items.str());
10080 out.resize(out.size() - 1); // drop last ','
10081 out += "]";
10082
10083 if (new_pg_upmap_items.empty()) {
10084 ss << "no valid upmap items(pairs) is specified";
10085 err = -EINVAL;
10086 goto reply;
10087 }
10088
10089 pending_inc.new_pg_upmap_items[pgid] =
10090 mempool::osdmap::vector<pair<int32_t,int32_t>>(
10091 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
10092 ss << "set " << pgid << " pg_upmap_items mapping to " << out;
10093 }
10094 break;
10095
10096 case OP_RM_PG_UPMAP_ITEMS:
10097 {
10098 pending_inc.old_pg_upmap_items.insert(pgid);
10099 ss << "clear " << pgid << " pg_upmap_items mapping";
10100 }
10101 break;
10102
10103 default:
10104 assert(0 == "invalid option");
7c673cae
FG
10105 }
10106
7c673cae
FG
10107 goto update;
10108 } else if (prefix == "osd primary-affinity") {
10109 int64_t id;
10110 if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
10111 ss << "invalid osd id value '"
10112 << cmd_vartype_stringify(cmdmap["id"]) << "'";
10113 err = -EINVAL;
10114 goto reply;
10115 }
10116 double w;
10117 if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
10118 ss << "unable to parse 'weight' value '"
10119 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
10120 err = -EINVAL;
10121 goto reply;
10122 }
10123 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
10124 if (ww < 0L) {
10125 ss << "weight must be >= 0";
10126 err = -EINVAL;
10127 goto reply;
10128 }
31f18b77
FG
10129 if (osdmap.require_min_compat_client > 0 &&
10130 osdmap.require_min_compat_client < CEPH_RELEASE_FIREFLY) {
10131 ss << "require_min_compat_client "
10132 << ceph_release_name(osdmap.require_min_compat_client)
7c673cae
FG
10133 << " < firefly, which is required for primary-affinity";
10134 err = -EPERM;
10135 goto reply;
10136 } else if (!g_conf->mon_osd_allow_primary_affinity) {
10137 ss << "you must enable 'mon osd allow primary affinity = true' on the mons before you can adjust primary-affinity. note that older clients will no longer be able to communicate with the cluster.";
10138 err = -EPERM;
10139 goto reply;
10140 }
10141 err = check_cluster_features(CEPH_FEATURE_OSD_PRIMARY_AFFINITY, ss);
10142 if (err == -EAGAIN)
10143 goto wait;
10144 if (err < 0)
10145 goto reply;
10146 if (osdmap.exists(id)) {
10147 pending_inc.new_primary_affinity[id] = ww;
10148 ss << "set osd." << id << " primary-affinity to " << w << " (" << ios::hex << ww << ios::dec << ")";
10149 getline(ss, rs);
10150 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10151 get_last_committed() + 1));
10152 return true;
10153 } else {
10154 ss << "osd." << id << " does not exist";
10155 err = -ENOENT;
10156 goto reply;
10157 }
10158 } else if (prefix == "osd reweight") {
10159 int64_t id;
10160 if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
10161 ss << "unable to parse osd id value '"
10162 << cmd_vartype_stringify(cmdmap["id"]) << "'";
10163 err = -EINVAL;
10164 goto reply;
10165 }
10166 double w;
10167 if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
10168 ss << "unable to parse weight value '"
10169 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
10170 err = -EINVAL;
10171 goto reply;
10172 }
10173 long ww = (int)((double)CEPH_OSD_IN*w);
10174 if (ww < 0L) {
10175 ss << "weight must be >= 0";
10176 err = -EINVAL;
10177 goto reply;
10178 }
10179 if (osdmap.exists(id)) {
10180 pending_inc.new_weight[id] = ww;
10181 ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
10182 getline(ss, rs);
10183 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10184 get_last_committed() + 1));
10185 return true;
10186 } else {
10187 ss << "osd." << id << " does not exist";
10188 err = -ENOENT;
10189 goto reply;
10190 }
10191 } else if (prefix == "osd reweightn") {
10192 map<int32_t, uint32_t> weights;
10193 err = parse_reweights(g_ceph_context, cmdmap, osdmap, &weights);
10194 if (err) {
10195 ss << "unable to parse 'weights' value '"
10196 << cmd_vartype_stringify(cmdmap["weights"]) << "'";
10197 goto reply;
10198 }
10199 pending_inc.new_weight.insert(weights.begin(), weights.end());
10200 wait_for_finished_proposal(
10201 op,
10202 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
224ce89b 10203 return true;
7c673cae
FG
10204 } else if (prefix == "osd lost") {
10205 int64_t id;
10206 if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
10207 ss << "unable to parse osd id value '"
10208 << cmd_vartype_stringify(cmdmap["id"]) << "'";
10209 err = -EINVAL;
10210 goto reply;
10211 }
10212 string sure;
10213 if (!cmd_getval(g_ceph_context, cmdmap, "sure", sure) || sure != "--yes-i-really-mean-it") {
10214 ss << "are you SURE? this might mean real, permanent data loss. pass "
10215 "--yes-i-really-mean-it if you really do.";
10216 err = -EPERM;
10217 goto reply;
10218 } else if (!osdmap.exists(id)) {
10219 ss << "osd." << id << " does not exist";
10220 err = -ENOENT;
10221 goto reply;
10222 } else if (!osdmap.is_down(id)) {
10223 ss << "osd." << id << " is not down";
10224 err = -EBUSY;
10225 goto reply;
10226 } else {
10227 epoch_t e = osdmap.get_info(id).down_at;
10228 pending_inc.new_lost[id] = e;
10229 ss << "marked osd lost in epoch " << e;
10230 getline(ss, rs);
10231 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10232 get_last_committed() + 1));
10233 return true;
10234 }
10235
31f18b77
FG
10236 } else if (prefix == "osd destroy" || prefix == "osd purge") {
10237 /* Destroying an OSD means that we don't expect to further make use of
10238 * the OSDs data (which may even become unreadable after this operation),
10239 * and that we are okay with scrubbing all its cephx keys and config-key
10240 * data (which may include lockbox keys, thus rendering the osd's data
10241 * unreadable).
10242 *
10243 * The OSD will not be removed. Instead, we will mark it as destroyed,
10244 * such that a subsequent call to `create` will not reuse the osd id.
10245 * This will play into being able to recreate the OSD, at the same
10246 * crush location, with minimal data movement.
10247 */
10248
10249 // make sure authmon is writeable.
10250 if (!mon->authmon()->is_writeable()) {
10251 dout(10) << __func__ << " waiting for auth mon to be writeable for "
10252 << "osd destroy" << dendl;
10253 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
10254 return false;
10255 }
10256
10257 int64_t id;
10258 if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
10259 ss << "unable to parse osd id value '"
10260 << cmd_vartype_stringify(cmdmap["id"]) << "";
10261 err = -EINVAL;
10262 goto reply;
10263 }
10264
10265 bool is_destroy = (prefix == "osd destroy");
10266 if (!is_destroy) {
10267 assert("osd purge" == prefix);
10268 }
10269
10270 string sure;
10271 if (!cmd_getval(g_ceph_context, cmdmap, "sure", sure) ||
10272 sure != "--yes-i-really-mean-it") {
10273 ss << "Are you SURE? This will mean real, permanent data loss, as well "
10274 << "as cephx and lockbox keys. Pass --yes-i-really-mean-it if you "
10275 << "really do.";
10276 err = -EPERM;
10277 goto reply;
d2e6a577 10278 } else if (!osdmap.exists(id)) {
31f18b77 10279 ss << "osd." << id << " does not exist";
d2e6a577 10280 err = 0; // idempotent
31f18b77
FG
10281 goto reply;
10282 } else if (osdmap.is_up(id)) {
10283 ss << "osd." << id << " is not `down`.";
10284 err = -EBUSY;
10285 goto reply;
10286 } else if (is_destroy && osdmap.is_destroyed(id)) {
10287 ss << "destroyed osd." << id;
10288 err = 0;
10289 goto reply;
10290 }
10291
10292 bool goto_reply = false;
10293
10294 paxos->plug();
10295 if (is_destroy) {
10296 err = prepare_command_osd_destroy(id, ss);
10297 // we checked above that it should exist.
10298 assert(err != -ENOENT);
10299 } else {
10300 err = prepare_command_osd_purge(id, ss);
10301 if (err == -ENOENT) {
10302 err = 0;
10303 ss << "osd." << id << " does not exist.";
10304 goto_reply = true;
10305 }
10306 }
10307 paxos->unplug();
10308
10309 if (err < 0 || goto_reply) {
10310 goto reply;
10311 }
10312
10313 if (is_destroy) {
10314 ss << "destroyed osd." << id;
10315 } else {
10316 ss << "purged osd." << id;
10317 }
10318
10319 getline(ss, rs);
10320 wait_for_finished_proposal(op,
10321 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
10322 force_immediate_propose();
10323 return true;
10324
10325 } else if (prefix == "osd new") {
10326
10327 // make sure authmon is writeable.
10328 if (!mon->authmon()->is_writeable()) {
10329 dout(10) << __func__ << " waiting for auth mon to be writeable for "
224ce89b 10330 << "osd new" << dendl;
31f18b77
FG
10331 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
10332 return false;
10333 }
10334
3a9019d9 10335 map<string,string> param_map;
31f18b77
FG
10336
10337 bufferlist bl = m->get_data();
3a9019d9
FG
10338 string param_json = bl.to_str();
10339 dout(20) << __func__ << " osd new json = " << param_json << dendl;
31f18b77 10340
3a9019d9 10341 err = get_json_str_map(param_json, ss, &param_map);
31f18b77
FG
10342 if (err < 0)
10343 goto reply;
10344
3a9019d9 10345 dout(20) << __func__ << " osd new params " << param_map << dendl;
31f18b77
FG
10346
10347 paxos->plug();
3a9019d9 10348 err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
31f18b77
FG
10349 paxos->unplug();
10350
10351 if (err < 0) {
10352 goto reply;
10353 }
10354
10355 if (f) {
10356 f->flush(rdata);
10357 } else {
10358 rdata.append(ss);
10359 }
10360
10361 if (err == EEXIST) {
10362 // idempotent operation
10363 err = 0;
10364 goto reply;
10365 }
10366
10367 wait_for_finished_proposal(op,
10368 new Monitor::C_Command(mon, op, 0, rs, rdata,
10369 get_last_committed() + 1));
10370 force_immediate_propose();
10371 return true;
10372
7c673cae 10373 } else if (prefix == "osd create") {
7c673cae
FG
10374
10375 // optional id provided?
31f18b77
FG
10376 int64_t id = -1, cmd_id = -1;
10377 if (cmd_getval(g_ceph_context, cmdmap, "id", cmd_id)) {
10378 if (cmd_id < 0) {
10379 ss << "invalid osd id value '" << cmd_id << "'";
7c673cae
FG
10380 err = -EINVAL;
10381 goto reply;
10382 }
31f18b77 10383 dout(10) << " osd create got id " << cmd_id << dendl;
7c673cae
FG
10384 }
10385
7c673cae
FG
10386 uuid_d uuid;
10387 string uuidstr;
10388 if (cmd_getval(g_ceph_context, cmdmap, "uuid", uuidstr)) {
10389 if (!uuid.parse(uuidstr.c_str())) {
31f18b77
FG
10390 ss << "invalid uuid value '" << uuidstr << "'";
10391 err = -EINVAL;
10392 goto reply;
7c673cae 10393 }
31f18b77
FG
10394 // we only care about the id if we also have the uuid, to
10395 // ensure the operation's idempotency.
10396 id = cmd_id;
7c673cae
FG
10397 }
10398
31f18b77
FG
10399 int32_t new_id = -1;
10400 err = prepare_command_osd_create(id, uuid, &new_id, ss);
10401 if (err < 0) {
10402 if (err == -EAGAIN) {
10403 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10404 return true;
10405 }
10406 // a check has failed; reply to the user.
10407 goto reply;
10408
10409 } else if (err == EEXIST) {
10410 // this is an idempotent operation; we can go ahead and reply.
10411 if (f) {
10412 f->open_object_section("created_osd");
10413 f->dump_int("osdid", new_id);
10414 f->close_section();
10415 f->flush(rdata);
10416 } else {
10417 ss << new_id;
10418 rdata.append(ss);
7c673cae 10419 }
31f18b77
FG
10420 err = 0;
10421 goto reply;
7c673cae
FG
10422 }
10423
3a9019d9
FG
10424 string empty_device_class;
10425 do_osd_create(id, uuid, empty_device_class, &new_id);
31f18b77 10426
7c673cae
FG
10427 if (f) {
10428 f->open_object_section("created_osd");
31f18b77 10429 f->dump_int("osdid", new_id);
7c673cae
FG
10430 f->close_section();
10431 f->flush(rdata);
10432 } else {
31f18b77 10433 ss << new_id;
7c673cae
FG
10434 rdata.append(ss);
10435 }
31f18b77
FG
10436 wait_for_finished_proposal(op,
10437 new Monitor::C_Command(mon, op, 0, rs, rdata,
10438 get_last_committed() + 1));
7c673cae
FG
10439 return true;
10440
10441 } else if (prefix == "osd blacklist clear") {
10442 pending_inc.new_blacklist.clear();
10443 std::list<std::pair<entity_addr_t,utime_t > > blacklist;
10444 osdmap.get_blacklist(&blacklist);
10445 for (const auto &entry : blacklist) {
10446 pending_inc.old_blacklist.push_back(entry.first);
10447 }
10448 ss << " removed all blacklist entries";
10449 getline(ss, rs);
10450 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10451 get_last_committed() + 1));
10452 return true;
10453 } else if (prefix == "osd blacklist") {
10454 string addrstr;
10455 cmd_getval(g_ceph_context, cmdmap, "addr", addrstr);
10456 entity_addr_t addr;
10457 if (!addr.parse(addrstr.c_str(), 0)) {
10458 ss << "unable to parse address " << addrstr;
10459 err = -EINVAL;
10460 goto reply;
10461 }
10462 else {
10463 string blacklistop;
10464 cmd_getval(g_ceph_context, cmdmap, "blacklistop", blacklistop);
10465 if (blacklistop == "add") {
10466 utime_t expires = ceph_clock_now();
10467 double d;
10468 // default one hour
224ce89b
WB
10469 cmd_getval(g_ceph_context, cmdmap, "expire", d,
10470 g_conf->mon_osd_blacklist_default_expire);
7c673cae
FG
10471 expires += d;
10472
10473 pending_inc.new_blacklist[addr] = expires;
224ce89b
WB
10474
10475 {
10476 // cancel any pending un-blacklisting request too
10477 auto it = std::find(pending_inc.old_blacklist.begin(),
10478 pending_inc.old_blacklist.end(), addr);
10479 if (it != pending_inc.old_blacklist.end()) {
10480 pending_inc.old_blacklist.erase(it);
10481 }
10482 }
10483
7c673cae
FG
10484 ss << "blacklisting " << addr << " until " << expires << " (" << d << " sec)";
10485 getline(ss, rs);
10486 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10487 get_last_committed() + 1));
10488 return true;
10489 } else if (blacklistop == "rm") {
10490 if (osdmap.is_blacklisted(addr) ||
10491 pending_inc.new_blacklist.count(addr)) {
10492 if (osdmap.is_blacklisted(addr))
10493 pending_inc.old_blacklist.push_back(addr);
10494 else
10495 pending_inc.new_blacklist.erase(addr);
10496 ss << "un-blacklisting " << addr;
10497 getline(ss, rs);
10498 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10499 get_last_committed() + 1));
10500 return true;
10501 }
10502 ss << addr << " isn't blacklisted";
10503 err = 0;
10504 goto reply;
10505 }
10506 }
10507 } else if (prefix == "osd pool mksnap") {
10508 string poolstr;
10509 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10510 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
10511 if (pool < 0) {
10512 ss << "unrecognized pool '" << poolstr << "'";
10513 err = -ENOENT;
10514 goto reply;
10515 }
10516 string snapname;
10517 cmd_getval(g_ceph_context, cmdmap, "snap", snapname);
10518 const pg_pool_t *p = osdmap.get_pg_pool(pool);
10519 if (p->is_unmanaged_snaps_mode()) {
10520 ss << "pool " << poolstr << " is in unmanaged snaps mode";
10521 err = -EINVAL;
10522 goto reply;
10523 } else if (p->snap_exists(snapname.c_str())) {
10524 ss << "pool " << poolstr << " snap " << snapname << " already exists";
10525 err = 0;
10526 goto reply;
10527 } else if (p->is_tier()) {
10528 ss << "pool " << poolstr << " is a cache tier";
10529 err = -EINVAL;
10530 goto reply;
10531 }
10532 pg_pool_t *pp = 0;
10533 if (pending_inc.new_pools.count(pool))
10534 pp = &pending_inc.new_pools[pool];
10535 if (!pp) {
10536 pp = &pending_inc.new_pools[pool];
10537 *pp = *p;
10538 }
10539 if (pp->snap_exists(snapname.c_str())) {
10540 ss << "pool " << poolstr << " snap " << snapname << " already exists";
10541 } else {
10542 pp->add_snap(snapname.c_str(), ceph_clock_now());
10543 pp->set_snap_epoch(pending_inc.epoch);
10544 ss << "created pool " << poolstr << " snap " << snapname;
10545 }
10546 getline(ss, rs);
10547 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10548 get_last_committed() + 1));
10549 return true;
10550 } else if (prefix == "osd pool rmsnap") {
10551 string poolstr;
10552 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10553 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
10554 if (pool < 0) {
10555 ss << "unrecognized pool '" << poolstr << "'";
10556 err = -ENOENT;
10557 goto reply;
10558 }
10559 string snapname;
10560 cmd_getval(g_ceph_context, cmdmap, "snap", snapname);
10561 const pg_pool_t *p = osdmap.get_pg_pool(pool);
10562 if (p->is_unmanaged_snaps_mode()) {
10563 ss << "pool " << poolstr << " is in unmanaged snaps mode";
10564 err = -EINVAL;
10565 goto reply;
10566 } else if (!p->snap_exists(snapname.c_str())) {
10567 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
10568 err = 0;
10569 goto reply;
10570 }
10571 pg_pool_t *pp = 0;
10572 if (pending_inc.new_pools.count(pool))
10573 pp = &pending_inc.new_pools[pool];
10574 if (!pp) {
10575 pp = &pending_inc.new_pools[pool];
10576 *pp = *p;
10577 }
10578 snapid_t sn = pp->snap_exists(snapname.c_str());
10579 if (sn) {
10580 pp->remove_snap(sn);
10581 pp->set_snap_epoch(pending_inc.epoch);
10582 ss << "removed pool " << poolstr << " snap " << snapname;
10583 } else {
10584 ss << "already removed pool " << poolstr << " snap " << snapname;
10585 }
10586 getline(ss, rs);
10587 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10588 get_last_committed() + 1));
10589 return true;
10590 } else if (prefix == "osd pool create") {
10591 int64_t pg_num;
10592 int64_t pgp_num;
10593 cmd_getval(g_ceph_context, cmdmap, "pg_num", pg_num, int64_t(0));
10594 cmd_getval(g_ceph_context, cmdmap, "pgp_num", pgp_num, pg_num);
10595
10596 string pool_type_str;
10597 cmd_getval(g_ceph_context, cmdmap, "pool_type", pool_type_str);
10598 if (pool_type_str.empty())
224ce89b 10599 pool_type_str = g_conf->osd_pool_default_type;
7c673cae
FG
10600
10601 string poolstr;
10602 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10603 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10604 if (pool_id >= 0) {
10605 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10606 if (pool_type_str != p->get_type_name()) {
10607 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
10608 err = -EINVAL;
10609 } else {
10610 ss << "pool '" << poolstr << "' already exists";
10611 err = 0;
10612 }
10613 goto reply;
10614 }
10615
10616 int pool_type;
10617 if (pool_type_str == "replicated") {
10618 pool_type = pg_pool_t::TYPE_REPLICATED;
10619 } else if (pool_type_str == "erasure") {
10620 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2 |
10621 CEPH_FEATURE_OSD_ERASURE_CODES,
10622 ss);
10623 if (err == -EAGAIN)
10624 goto wait;
10625 if (err)
10626 goto reply;
10627 pool_type = pg_pool_t::TYPE_ERASURE;
10628 } else {
10629 ss << "unknown pool type '" << pool_type_str << "'";
10630 err = -EINVAL;
10631 goto reply;
10632 }
10633
31f18b77 10634 bool implicit_rule_creation = false;
94b18763 10635 int64_t expected_num_objects = 0;
31f18b77
FG
10636 string rule_name;
10637 cmd_getval(g_ceph_context, cmdmap, "rule", rule_name);
7c673cae
FG
10638 string erasure_code_profile;
10639 cmd_getval(g_ceph_context, cmdmap, "erasure_code_profile", erasure_code_profile);
10640
10641 if (pool_type == pg_pool_t::TYPE_ERASURE) {
10642 if (erasure_code_profile == "")
10643 erasure_code_profile = "default";
10644 //handle the erasure code profile
10645 if (erasure_code_profile == "default") {
10646 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
10647 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
10648 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
10649 goto wait;
10650 }
10651
10652 map<string,string> profile_map;
10653 err = osdmap.get_erasure_code_profile_default(g_ceph_context,
10654 profile_map,
10655 &ss);
10656 if (err)
10657 goto reply;
10658 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
10659 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
10660 goto wait;
10661 }
10662 }
31f18b77
FG
10663 if (rule_name == "") {
10664 implicit_rule_creation = true;
7c673cae 10665 if (erasure_code_profile == "default") {
31f18b77 10666 rule_name = "erasure-code";
7c673cae 10667 } else {
31f18b77 10668 dout(1) << "implicitly use rule named after the pool: "
7c673cae 10669 << poolstr << dendl;
31f18b77 10670 rule_name = poolstr;
7c673cae
FG
10671 }
10672 }
94b18763
FG
10673 cmd_getval(g_ceph_context, cmdmap, "expected_num_objects",
10674 expected_num_objects, int64_t(0));
7c673cae 10675 } else {
31f18b77 10676 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
94b18763
FG
10677 // and put expected_num_objects to rule field
10678 if (erasure_code_profile != "") { // cmd is from CLI
10679 if (rule_name != "") {
10680 string interr;
10681 expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
10682 if (interr.length()) {
10683 ss << "error parsing integer value '" << rule_name << "': " << interr;
10684 err = -EINVAL;
10685 goto reply;
10686 }
10687 }
10688 rule_name = erasure_code_profile;
10689 } else { // cmd is well-formed
10690 cmd_getval(g_ceph_context, cmdmap, "expected_num_objects",
10691 expected_num_objects, int64_t(0));
10692 }
7c673cae
FG
10693 }
10694
31f18b77
FG
10695 if (!implicit_rule_creation && rule_name != "") {
10696 int rule;
10697 err = get_crush_rule(rule_name, &rule, &ss);
7c673cae
FG
10698 if (err == -EAGAIN) {
10699 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10700 return true;
10701 }
10702 if (err)
10703 goto reply;
10704 }
10705
7c673cae
FG
10706 if (expected_num_objects < 0) {
10707 ss << "'expected_num_objects' must be non-negative";
10708 err = -EINVAL;
10709 goto reply;
10710 }
10711
10712 int64_t fast_read_param;
10713 cmd_getval(g_ceph_context, cmdmap, "fast_read", fast_read_param, int64_t(-1));
10714 FastReadType fast_read = FAST_READ_DEFAULT;
10715 if (fast_read_param == 0)
10716 fast_read = FAST_READ_OFF;
10717 else if (fast_read_param > 0)
10718 fast_read = FAST_READ_ON;
10719
10720 err = prepare_new_pool(poolstr, 0, // auid=0 for admin created pool
10721 -1, // default crush rule
31f18b77 10722 rule_name,
7c673cae
FG
10723 pg_num, pgp_num,
10724 erasure_code_profile, pool_type,
10725 (uint64_t)expected_num_objects,
10726 fast_read,
10727 &ss);
10728 if (err < 0) {
10729 switch(err) {
10730 case -EEXIST:
10731 ss << "pool '" << poolstr << "' already exists";
10732 break;
10733 case -EAGAIN:
10734 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10735 return true;
10736 case -ERANGE:
10737 goto reply;
10738 default:
10739 goto reply;
10740 break;
10741 }
10742 } else {
10743 ss << "pool '" << poolstr << "' created";
10744 }
10745 getline(ss, rs);
10746 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10747 get_last_committed() + 1));
10748 return true;
10749
10750 } else if (prefix == "osd pool delete" ||
10751 prefix == "osd pool rm") {
10752 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
10753 string poolstr, poolstr2, sure;
10754 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10755 cmd_getval(g_ceph_context, cmdmap, "pool2", poolstr2);
10756 cmd_getval(g_ceph_context, cmdmap, "sure", sure);
10757 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
10758 if (pool < 0) {
10759 ss << "pool '" << poolstr << "' does not exist";
10760 err = 0;
10761 goto reply;
10762 }
10763
10764 bool force_no_fake = sure == "--yes-i-really-really-mean-it-not-faking";
10765 if (poolstr2 != poolstr ||
10766 (sure != "--yes-i-really-really-mean-it" && !force_no_fake)) {
10767 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
10768 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
10769 << "followed by --yes-i-really-really-mean-it.";
10770 err = -EPERM;
10771 goto reply;
10772 }
10773 err = _prepare_remove_pool(pool, &ss, force_no_fake);
10774 if (err == -EAGAIN) {
10775 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10776 return true;
10777 }
10778 if (err < 0)
10779 goto reply;
10780 goto update;
10781 } else if (prefix == "osd pool rename") {
10782 string srcpoolstr, destpoolstr;
10783 cmd_getval(g_ceph_context, cmdmap, "srcpool", srcpoolstr);
10784 cmd_getval(g_ceph_context, cmdmap, "destpool", destpoolstr);
10785 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
10786 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
10787
10788 if (pool_src < 0) {
10789 if (pool_dst >= 0) {
10790 // src pool doesn't exist, dst pool does exist: to ensure idempotency
10791 // of operations, assume this rename succeeded, as it is not changing
10792 // the current state. Make sure we output something understandable
10793 // for whoever is issuing the command, if they are paying attention,
10794 // in case it was not intentional; or to avoid a "wtf?" and a bug
10795 // report in case it was intentional, while expecting a failure.
10796 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
10797 << destpoolstr << "' does -- assuming successful rename";
10798 err = 0;
10799 } else {
10800 ss << "unrecognized pool '" << srcpoolstr << "'";
10801 err = -ENOENT;
10802 }
10803 goto reply;
10804 } else if (pool_dst >= 0) {
10805 // source pool exists and so does the destination pool
10806 ss << "pool '" << destpoolstr << "' already exists";
10807 err = -EEXIST;
10808 goto reply;
10809 }
10810
10811 int ret = _prepare_rename_pool(pool_src, destpoolstr);
10812 if (ret == 0) {
10813 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
10814 } else {
10815 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
10816 << cpp_strerror(ret);
10817 }
10818 getline(ss, rs);
10819 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
10820 get_last_committed() + 1));
10821 return true;
10822
10823 } else if (prefix == "osd pool set") {
10824 err = prepare_command_pool_set(cmdmap, ss);
10825 if (err == -EAGAIN)
10826 goto wait;
10827 if (err < 0)
10828 goto reply;
10829
10830 getline(ss, rs);
10831 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10832 get_last_committed() + 1));
10833 return true;
10834 } else if (prefix == "osd tier add") {
10835 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
10836 if (err == -EAGAIN)
10837 goto wait;
10838 if (err)
10839 goto reply;
10840 string poolstr;
10841 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10842 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10843 if (pool_id < 0) {
10844 ss << "unrecognized pool '" << poolstr << "'";
10845 err = -ENOENT;
10846 goto reply;
10847 }
10848 string tierpoolstr;
10849 cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
10850 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
10851 if (tierpool_id < 0) {
10852 ss << "unrecognized pool '" << tierpoolstr << "'";
10853 err = -ENOENT;
10854 goto reply;
10855 }
10856 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10857 assert(p);
10858 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
10859 assert(tp);
10860
10861 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
10862 goto reply;
10863 }
10864
10865 // make sure new tier is empty
10866 string force_nonempty;
10867 cmd_getval(g_ceph_context, cmdmap, "force_nonempty", force_nonempty);
31f18b77
FG
10868 const pool_stat_t *pstats = mon->pgservice->get_pool_stat(tierpool_id);
10869 if (pstats && pstats->stats.sum.num_objects != 0 &&
7c673cae
FG
10870 force_nonempty != "--force-nonempty") {
10871 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
10872 err = -ENOTEMPTY;
10873 goto reply;
10874 }
10875 if (tp->ec_pool()) {
10876 ss << "tier pool '" << tierpoolstr
10877 << "' is an ec pool, which cannot be a tier";
10878 err = -ENOTSUP;
10879 goto reply;
10880 }
10881 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
10882 ((force_nonempty != "--force-nonempty") ||
10883 (!g_conf->mon_debug_unsafe_allow_tier_with_nonempty_snaps))) {
10884 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
10885 err = -ENOTEMPTY;
10886 goto reply;
10887 }
10888 // go
10889 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
10890 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
10891 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
10892 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10893 return true;
10894 }
10895 np->tiers.insert(tierpool_id);
10896 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
10897 ntp->tier_of = pool_id;
10898 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
10899 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
10900 get_last_committed() + 1));
10901 return true;
10902 } else if (prefix == "osd tier remove" ||
10903 prefix == "osd tier rm") {
10904 string poolstr;
10905 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10906 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10907 if (pool_id < 0) {
10908 ss << "unrecognized pool '" << poolstr << "'";
10909 err = -ENOENT;
10910 goto reply;
10911 }
10912 string tierpoolstr;
10913 cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
10914 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
10915 if (tierpool_id < 0) {
10916 ss << "unrecognized pool '" << tierpoolstr << "'";
10917 err = -ENOENT;
10918 goto reply;
10919 }
10920 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10921 assert(p);
10922 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
10923 assert(tp);
10924
10925 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
10926 goto reply;
10927 }
10928
10929 if (p->tiers.count(tierpool_id) == 0) {
10930 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
10931 err = 0;
10932 goto reply;
10933 }
10934 if (tp->tier_of != pool_id) {
10935 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
10936 << osdmap.get_pool_name(tp->tier_of) << "': "
10937 // be scary about it; this is an inconsistency and bells must go off
10938 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
10939 err = -EINVAL;
10940 goto reply;
10941 }
10942 if (p->read_tier == tierpool_id) {
10943 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
10944 err = -EBUSY;
10945 goto reply;
10946 }
10947 // go
10948 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
10949 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
10950 if (np->tiers.count(tierpool_id) == 0 ||
10951 ntp->tier_of != pool_id ||
10952 np->read_tier == tierpool_id) {
10953 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10954 return true;
10955 }
10956 np->tiers.erase(tierpool_id);
10957 ntp->clear_tier();
10958 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
10959 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
10960 get_last_committed() + 1));
10961 return true;
10962 } else if (prefix == "osd tier set-overlay") {
10963 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
10964 if (err == -EAGAIN)
10965 goto wait;
10966 if (err)
10967 goto reply;
10968 string poolstr;
10969 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10970 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10971 if (pool_id < 0) {
10972 ss << "unrecognized pool '" << poolstr << "'";
10973 err = -ENOENT;
10974 goto reply;
10975 }
10976 string overlaypoolstr;
10977 cmd_getval(g_ceph_context, cmdmap, "overlaypool", overlaypoolstr);
10978 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
10979 if (overlaypool_id < 0) {
10980 ss << "unrecognized pool '" << overlaypoolstr << "'";
10981 err = -ENOENT;
10982 goto reply;
10983 }
10984 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10985 assert(p);
10986 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
10987 assert(overlay_p);
10988 if (p->tiers.count(overlaypool_id) == 0) {
10989 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
10990 err = -EINVAL;
10991 goto reply;
10992 }
10993 if (p->read_tier == overlaypool_id) {
10994 err = 0;
10995 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
10996 goto reply;
10997 }
10998 if (p->has_read_tier()) {
10999 ss << "pool '" << poolstr << "' has overlay '"
11000 << osdmap.get_pool_name(p->read_tier)
11001 << "'; please remove-overlay first";
11002 err = -EINVAL;
11003 goto reply;
11004 }
11005
11006 // go
11007 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
11008 np->read_tier = overlaypool_id;
11009 np->write_tier = overlaypool_id;
11010 np->set_last_force_op_resend(pending_inc.epoch);
11011 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
11012 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
11013 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
11014 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
11015 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
11016 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
11017 get_last_committed() + 1));
11018 return true;
11019 } else if (prefix == "osd tier remove-overlay" ||
11020 prefix == "osd tier rm-overlay") {
11021 string poolstr;
11022 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
11023 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11024 if (pool_id < 0) {
11025 ss << "unrecognized pool '" << poolstr << "'";
11026 err = -ENOENT;
11027 goto reply;
11028 }
11029 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11030 assert(p);
11031 if (!p->has_read_tier()) {
11032 err = 0;
11033 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
11034 goto reply;
11035 }
11036
11037 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
11038 goto reply;
11039 }
11040
11041 // go
11042 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
11043 if (np->has_read_tier()) {
11044 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
11045 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
11046 nop->set_last_force_op_resend(pending_inc.epoch);
11047 }
11048 if (np->has_write_tier()) {
11049 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
11050 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
11051 nop->set_last_force_op_resend(pending_inc.epoch);
11052 }
11053 np->clear_read_tier();
11054 np->clear_write_tier();
11055 np->set_last_force_op_resend(pending_inc.epoch);
11056 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
11057 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
11058 get_last_committed() + 1));
11059 return true;
11060 } else if (prefix == "osd tier cache-mode") {
11061 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
11062 if (err == -EAGAIN)
11063 goto wait;
11064 if (err)
11065 goto reply;
11066 string poolstr;
11067 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
11068 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11069 if (pool_id < 0) {
11070 ss << "unrecognized pool '" << poolstr << "'";
11071 err = -ENOENT;
11072 goto reply;
11073 }
11074 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11075 assert(p);
11076 if (!p->is_tier()) {
11077 ss << "pool '" << poolstr << "' is not a tier";
11078 err = -EINVAL;
11079 goto reply;
11080 }
11081 string modestr;
11082 cmd_getval(g_ceph_context, cmdmap, "mode", modestr);
11083 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
11084 if (mode < 0) {
11085 ss << "'" << modestr << "' is not a valid cache mode";
11086 err = -EINVAL;
11087 goto reply;
11088 }
11089
11090 string sure;
11091 cmd_getval(g_ceph_context, cmdmap, "sure", sure);
11092 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11093 mode != pg_pool_t::CACHEMODE_NONE &&
11094 mode != pg_pool_t::CACHEMODE_PROXY &&
11095 mode != pg_pool_t::CACHEMODE_READPROXY) &&
11096 sure != "--yes-i-really-mean-it") {
11097 ss << "'" << modestr << "' is not a well-supported cache mode and may "
11098 << "corrupt your data. pass --yes-i-really-mean-it to force.";
11099 err = -EPERM;
11100 goto reply;
11101 }
11102
11103 // pool already has this cache-mode set and there are no pending changes
11104 if (p->cache_mode == mode &&
11105 (pending_inc.new_pools.count(pool_id) == 0 ||
11106 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
11107 ss << "set cache-mode for pool '" << poolstr << "'"
11108 << " to " << pg_pool_t::get_cache_mode_name(mode);
11109 err = 0;
11110 goto reply;
11111 }
11112
11113 /* Mode description:
11114 *
11115 * none: No cache-mode defined
11116 * forward: Forward all reads and writes to base pool
11117 * writeback: Cache writes, promote reads from base pool
11118 * readonly: Forward writes to base pool
11119 * readforward: Writes are in writeback mode, Reads are in forward mode
11120 * proxy: Proxy all reads and writes to base pool
11121 * readproxy: Writes are in writeback mode, Reads are in proxy mode
11122 *
11123 * Hence, these are the allowed transitions:
11124 *
11125 * none -> any
11126 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
11127 * proxy -> forward || readforward || readproxy || writeback || any IF num_objects_dirty == 0
11128 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
11129 * readproxy -> forward || proxy || readforward || writeback || any IF num_objects_dirty == 0
11130 * writeback -> readforward || readproxy || forward || proxy
11131 * readonly -> any
11132 */
11133
11134 // We check if the transition is valid against the current pool mode, as
11135 // it is the only committed state thus far. We will blantly squash
11136 // whatever mode is on the pending state.
11137
11138 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
11139 (mode != pg_pool_t::CACHEMODE_FORWARD &&
11140 mode != pg_pool_t::CACHEMODE_PROXY &&
11141 mode != pg_pool_t::CACHEMODE_READFORWARD &&
11142 mode != pg_pool_t::CACHEMODE_READPROXY)) {
11143 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
11144 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
11145 << "' pool; only '"
11146 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_FORWARD)
11147 << "','"
11148 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY)
11149 << "','"
11150 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READFORWARD)
11151 << "','"
11152 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
11153 << "' allowed.";
11154 err = -EINVAL;
11155 goto reply;
11156 }
11157 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
11158 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11159 mode != pg_pool_t::CACHEMODE_FORWARD &&
11160 mode != pg_pool_t::CACHEMODE_PROXY &&
11161 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
11162
11163 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
11164 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11165 mode != pg_pool_t::CACHEMODE_FORWARD &&
11166 mode != pg_pool_t::CACHEMODE_READFORWARD &&
11167 mode != pg_pool_t::CACHEMODE_PROXY)) ||
11168
11169 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
11170 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11171 mode != pg_pool_t::CACHEMODE_FORWARD &&
11172 mode != pg_pool_t::CACHEMODE_READFORWARD &&
11173 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
11174
11175 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
11176 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11177 mode != pg_pool_t::CACHEMODE_READFORWARD &&
11178 mode != pg_pool_t::CACHEMODE_PROXY &&
11179 mode != pg_pool_t::CACHEMODE_READPROXY))) {
11180
31f18b77
FG
11181 const pool_stat_t* pstats =
11182 mon->pgservice->get_pool_stat(pool_id);
7c673cae 11183
31f18b77 11184 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
7c673cae
FG
11185 ss << "unable to set cache-mode '"
11186 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
11187 << "': dirty objects found";
11188 err = -EBUSY;
11189 goto reply;
11190 }
11191 }
11192 // go
11193 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
11194 np->cache_mode = mode;
11195 // set this both when moving to and from cache_mode NONE. this is to
11196 // capture legacy pools that were set up before this flag existed.
11197 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
11198 ss << "set cache-mode for pool '" << poolstr
11199 << "' to " << pg_pool_t::get_cache_mode_name(mode);
11200 if (mode == pg_pool_t::CACHEMODE_NONE) {
11201 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
11202 assert(base_pool);
11203 if (base_pool->read_tier == pool_id ||
11204 base_pool->write_tier == pool_id)
11205 ss <<" (WARNING: pool is still configured as read or write tier)";
11206 }
11207 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
11208 get_last_committed() + 1));
11209 return true;
11210 } else if (prefix == "osd tier add-cache") {
11211 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
11212 if (err == -EAGAIN)
11213 goto wait;
11214 if (err)
11215 goto reply;
11216 string poolstr;
11217 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
11218 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11219 if (pool_id < 0) {
11220 ss << "unrecognized pool '" << poolstr << "'";
11221 err = -ENOENT;
11222 goto reply;
11223 }
11224 string tierpoolstr;
11225 cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
11226 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
11227 if (tierpool_id < 0) {
11228 ss << "unrecognized pool '" << tierpoolstr << "'";
11229 err = -ENOENT;
11230 goto reply;
11231 }
11232 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11233 assert(p);
11234 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11235 assert(tp);
11236
11237 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
11238 goto reply;
11239 }
11240
11241 int64_t size = 0;
11242 if (!cmd_getval(g_ceph_context, cmdmap, "size", size)) {
11243 ss << "unable to parse 'size' value '"
11244 << cmd_vartype_stringify(cmdmap["size"]) << "'";
11245 err = -EINVAL;
11246 goto reply;
11247 }
11248 // make sure new tier is empty
31f18b77
FG
11249 const pool_stat_t *pstats =
11250 mon->pgservice->get_pool_stat(tierpool_id);
11251 if (pstats && pstats->stats.sum.num_objects != 0) {
7c673cae
FG
11252 ss << "tier pool '" << tierpoolstr << "' is not empty";
11253 err = -ENOTEMPTY;
11254 goto reply;
11255 }
11256 string modestr = g_conf->osd_tier_default_cache_mode;
11257 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
11258 if (mode < 0) {
11259 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
11260 err = -EINVAL;
11261 goto reply;
11262 }
11263 HitSet::Params hsp;
11264 if (g_conf->osd_tier_default_cache_hit_set_type == "bloom") {
11265 BloomHitSet::Params *bsp = new BloomHitSet::Params;
11266 bsp->set_fpp(g_conf->osd_pool_default_hit_set_bloom_fpp);
11267 hsp = HitSet::Params(bsp);
11268 } else if (g_conf->osd_tier_default_cache_hit_set_type == "explicit_hash") {
11269 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
11270 }
11271 else if (g_conf->osd_tier_default_cache_hit_set_type == "explicit_object") {
11272 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
11273 } else {
11274 ss << "osd tier cache default hit set type '" <<
11275 g_conf->osd_tier_default_cache_hit_set_type << "' is not a known type";
11276 err = -EINVAL;
11277 goto reply;
11278 }
11279 // go
11280 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
11281 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
11282 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
11283 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11284 return true;
11285 }
11286 np->tiers.insert(tierpool_id);
11287 np->read_tier = np->write_tier = tierpool_id;
11288 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
11289 np->set_last_force_op_resend(pending_inc.epoch);
11290 ntp->set_last_force_op_resend(pending_inc.epoch);
11291 ntp->tier_of = pool_id;
11292 ntp->cache_mode = mode;
11293 ntp->hit_set_count = g_conf->osd_tier_default_cache_hit_set_count;
11294 ntp->hit_set_period = g_conf->osd_tier_default_cache_hit_set_period;
11295 ntp->min_read_recency_for_promote = g_conf->osd_tier_default_cache_min_read_recency_for_promote;
11296 ntp->min_write_recency_for_promote = g_conf->osd_tier_default_cache_min_write_recency_for_promote;
11297 ntp->hit_set_grade_decay_rate = g_conf->osd_tier_default_cache_hit_set_grade_decay_rate;
11298 ntp->hit_set_search_last_n = g_conf->osd_tier_default_cache_hit_set_search_last_n;
11299 ntp->hit_set_params = hsp;
11300 ntp->target_max_bytes = size;
11301 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
11302 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
11303 get_last_committed() + 1));
11304 return true;
11305 } else if (prefix == "osd pool set-quota") {
11306 string poolstr;
11307 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
11308 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11309 if (pool_id < 0) {
11310 ss << "unrecognized pool '" << poolstr << "'";
11311 err = -ENOENT;
11312 goto reply;
11313 }
11314
11315 string field;
11316 cmd_getval(g_ceph_context, cmdmap, "field", field);
11317 if (field != "max_objects" && field != "max_bytes") {
11318 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
11319 err = -EINVAL;
11320 goto reply;
11321 }
11322
11323 // val could contain unit designations, so we treat as a string
11324 string val;
11325 cmd_getval(g_ceph_context, cmdmap, "val", val);
11326 stringstream tss;
11327 int64_t value = unit_to_bytesize(val, &tss);
11328 if (value < 0) {
11329 ss << "error parsing value '" << value << "': " << tss.str();
11330 err = value;
11331 goto reply;
11332 }
11333
11334 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
11335 if (field == "max_objects") {
11336 pi->quota_max_objects = value;
11337 } else if (field == "max_bytes") {
11338 pi->quota_max_bytes = value;
11339 } else {
11340 assert(0 == "unrecognized option");
11341 }
11342 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
11343 rs = ss.str();
11344 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11345 get_last_committed() + 1));
11346 return true;
c07f9fc5
FG
11347 } else if (prefix == "osd pool application enable" ||
11348 prefix == "osd pool application disable" ||
11349 prefix == "osd pool application set" ||
11350 prefix == "osd pool application rm") {
11351 err = prepare_command_pool_application(prefix, cmdmap, ss);
11352 if (err == -EAGAIN)
11353 goto wait;
11354 if (err < 0)
11355 goto reply;
7c673cae 11356
c07f9fc5
FG
11357 getline(ss, rs);
11358 wait_for_finished_proposal(
11359 op, new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
11360 return true;
7c673cae
FG
11361 } else if (prefix == "osd reweight-by-pg" ||
11362 prefix == "osd reweight-by-utilization" ||
11363 prefix == "osd test-reweight-by-pg" ||
11364 prefix == "osd test-reweight-by-utilization") {
11365 bool by_pg =
11366 prefix == "osd reweight-by-pg" || prefix == "osd test-reweight-by-pg";
11367 bool dry_run =
11368 prefix == "osd test-reweight-by-pg" ||
11369 prefix == "osd test-reweight-by-utilization";
11370 int64_t oload;
11371 cmd_getval(g_ceph_context, cmdmap, "oload", oload, int64_t(120));
11372 set<int64_t> pools;
11373 vector<string> poolnamevec;
11374 cmd_getval(g_ceph_context, cmdmap, "pools", poolnamevec);
11375 for (unsigned j = 0; j < poolnamevec.size(); j++) {
11376 int64_t pool = osdmap.lookup_pg_pool_name(poolnamevec[j]);
11377 if (pool < 0) {
11378 ss << "pool '" << poolnamevec[j] << "' does not exist";
11379 err = -ENOENT;
11380 goto reply;
11381 }
11382 pools.insert(pool);
11383 }
11384 double max_change = g_conf->mon_reweight_max_change;
11385 cmd_getval(g_ceph_context, cmdmap, "max_change", max_change);
11386 if (max_change <= 0.0) {
11387 ss << "max_change " << max_change << " must be positive";
11388 err = -EINVAL;
11389 goto reply;
11390 }
11391 int64_t max_osds = g_conf->mon_reweight_max_osds;
11392 cmd_getval(g_ceph_context, cmdmap, "max_osds", max_osds);
11393 if (max_osds <= 0) {
11394 ss << "max_osds " << max_osds << " must be positive";
11395 err = -EINVAL;
11396 goto reply;
11397 }
11398 string no_increasing;
11399 cmd_getval(g_ceph_context, cmdmap, "no_increasing", no_increasing);
11400 string out_str;
11401 mempool::osdmap::map<int32_t, uint32_t> new_weights;
31f18b77
FG
11402 err = mon->pgservice->reweight_by_utilization(osdmap,
11403 oload,
11404 max_change,
11405 max_osds,
11406 by_pg,
11407 pools.empty() ? NULL : &pools,
11408 no_increasing == "--no-increasing",
11409 &new_weights,
11410 &ss, &out_str, f.get());
7c673cae
FG
11411 if (err >= 0) {
11412 dout(10) << "reweight::by_utilization: finished with " << out_str << dendl;
11413 }
11414 if (f)
11415 f->flush(rdata);
11416 else
11417 rdata.append(out_str);
11418 if (err < 0) {
11419 ss << "FAILED reweight-by-pg";
11420 } else if (err == 0 || dry_run) {
11421 ss << "no change";
11422 } else {
11423 ss << "SUCCESSFUL reweight-by-pg";
11424 pending_inc.new_weight = std::move(new_weights);
11425 wait_for_finished_proposal(
11426 op,
11427 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
11428 return true;
11429 }
c07f9fc5
FG
11430 } else if (prefix == "osd force-create-pg") {
11431 pg_t pgid;
11432 string pgidstr;
11433 cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr);
11434 if (!pgid.parse(pgidstr.c_str())) {
11435 ss << "invalid pgid '" << pgidstr << "'";
11436 err = -EINVAL;
11437 goto reply;
11438 }
94b18763
FG
11439 if (!osdmap.pg_exists(pgid)) {
11440 ss << "pg " << pgid << " should not exist";
11441 err = -ENOENT;
11442 goto reply;
11443 }
c07f9fc5
FG
11444 bool creating_now;
11445 {
11446 std::lock_guard<std::mutex> l(creating_pgs_lock);
11447 auto emplaced = creating_pgs.pgs.emplace(pgid,
11448 make_pair(osdmap.get_epoch(),
11449 ceph_clock_now()));
11450 creating_now = emplaced.second;
11451 }
11452 if (creating_now) {
11453 ss << "pg " << pgidstr << " now creating, ok";
11454 err = 0;
11455 goto update;
11456 } else {
11457 ss << "pg " << pgid << " already creating";
11458 err = 0;
11459 goto reply;
11460 }
7c673cae
FG
11461 } else {
11462 err = -EINVAL;
11463 }
11464
11465 reply:
11466 getline(ss, rs);
11467 if (err < 0 && rs.length() == 0)
11468 rs = cpp_strerror(err);
11469 mon->reply_command(op, err, rs, rdata, get_last_committed());
11470 return ret;
11471
11472 update:
11473 getline(ss, rs);
11474 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11475 get_last_committed() + 1));
11476 return true;
11477
11478 wait:
11479 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11480 return true;
11481}
11482
11483bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
11484{
11485 op->mark_osdmon_event(__func__);
11486 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
11487
11488 if (m->fsid != mon->monmap->fsid) {
11489 dout(0) << __func__ << " drop message on fsid " << m->fsid
11490 << " != " << mon->monmap->fsid << " for " << *m << dendl;
11491 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
11492 return true;
11493 }
11494
11495 if (m->op == POOL_OP_CREATE)
11496 return preprocess_pool_op_create(op);
11497
11498 if (!osdmap.get_pg_pool(m->pool)) {
11499 dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
11500 _pool_op_reply(op, 0, osdmap.get_epoch());
11501 return true;
11502 }
11503
11504 // check if the snap and snapname exist
11505 bool snap_exists = false;
11506 const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
11507 if (p->snap_exists(m->name.c_str()))
11508 snap_exists = true;
11509
11510 switch (m->op) {
11511 case POOL_OP_CREATE_SNAP:
11512 if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
11513 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
11514 return true;
11515 }
11516 if (snap_exists) {
11517 _pool_op_reply(op, 0, osdmap.get_epoch());
11518 return true;
11519 }
11520 return false;
11521 case POOL_OP_CREATE_UNMANAGED_SNAP:
11522 if (p->is_pool_snaps_mode()) {
11523 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
11524 return true;
11525 }
11526 return false;
11527 case POOL_OP_DELETE_SNAP:
11528 if (p->is_unmanaged_snaps_mode()) {
11529 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
11530 return true;
11531 }
11532 if (!snap_exists) {
11533 _pool_op_reply(op, 0, osdmap.get_epoch());
11534 return true;
11535 }
11536 return false;
11537 case POOL_OP_DELETE_UNMANAGED_SNAP:
11538 if (p->is_pool_snaps_mode()) {
11539 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
11540 return true;
11541 }
11542 if (p->is_removed_snap(m->snapid)) {
11543 _pool_op_reply(op, 0, osdmap.get_epoch());
11544 return true;
11545 }
11546 return false;
11547 case POOL_OP_DELETE:
11548 if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
11549 _pool_op_reply(op, 0, osdmap.get_epoch());
11550 return true;
11551 }
11552 return false;
11553 case POOL_OP_AUID_CHANGE:
11554 return false;
11555 default:
11556 ceph_abort();
11557 break;
11558 }
11559
11560 return false;
11561}
11562
11563bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
11564{
11565 op->mark_osdmon_event(__func__);
11566 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
11567 MonSession *session = m->get_session();
11568 if (!session) {
11569 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
11570 return true;
11571 }
11572 if (!session->is_capable("osd", MON_CAP_W)) {
11573 dout(5) << "attempt to create new pool without sufficient auid privileges!"
11574 << "message: " << *m << std::endl
11575 << "caps: " << session->caps << dendl;
11576 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
11577 return true;
11578 }
11579
11580 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
11581 if (pool >= 0) {
11582 _pool_op_reply(op, 0, osdmap.get_epoch());
11583 return true;
11584 }
11585
11586 return false;
11587}
11588
// Apply a pool operation (snapshot create/delete, auid change) to the
// pending incremental map.  Whole-pool create/delete are dispatched to
// their dedicated handlers.  Returns true when a proposal was queued
// (reply sent on commit), false when the op was answered immediately.
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
  dout(10) << "prepare_pool_op " << *m << dendl;
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  // Validity / idempotency checks against the *committed* map.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // pool snapshots are not allowed on cache tiers
    if (pool->is_tier()) {
      ret = -EINVAL;
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;
    } // else, fall through
  case POOL_OP_DELETE_SNAP:
    if (!pool->is_unmanaged_snaps_mode()) {
      bool snap_exists = pool->snap_exists(m->name.c_str());
      // creating an existing snap or deleting a missing one is a no-op
      if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
	|| (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
	ret = 0;
      } else {
	break;
      }
    } else {
      // pool-snap ops are invalid once the pool uses unmanaged snaps
      ret = -EINVAL;
    }
    _pool_op_reply(op, ret, osdmap.get_epoch());
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // we won't allow removal of an unmanaged snapshot from a pool
    // not in unmanaged snaps mode.
    if (!pool->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
      return false;
    }
    /* fall-thru */
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // but we will allow creating an unmanaged snapshot on any pool
    // as long as it is not in 'pool' snaps mode.
    if (pool->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return false;
    }
  }

  // projected pool info: pending (uncommitted) changes win over the
  // committed map so back-to-back ops in one proposal compose correctly
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // pool snaps vs unmanaged snaps are mutually exclusive
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // perform the actual mutation on the projected pool state
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
	pp.remove_snap(s);
	changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      // add_unmanaged_snap() allocates the new snapid; return it to the
      // client in the reply payload
      uint64_t snapid;
      pp.add_unmanaged_snap(snapid);
      ::encode(snapid, reply_data);
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (!pp.is_removed_snap(m->snapid)) {
      pp.remove_unmanaged_snap(m->snapid);
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    if (pp.auid != m->auid) {
      pp.auid = m->auid;
      changed = true;
    }
    break;

  default:
    ceph_abort();
    break;
  }

  if (changed) {
    // bump the snap epoch and stage the projected pool into the pending map
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
11732
11733bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
11734{
11735 op->mark_osdmon_event(__func__);
11736 int err = prepare_new_pool(op);
11737 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
11738 return true;
11739}
11740
11741int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
11742 ostream *ss)
11743{
11744 const string& poolstr = osdmap.get_pool_name(pool_id);
11745
11746 // If the Pool is in use by CephFS, refuse to delete it
11747 FSMap const &pending_fsmap = mon->mdsmon()->get_pending();
11748 if (pending_fsmap.pool_in_use(pool_id)) {
11749 *ss << "pool '" << poolstr << "' is in use by CephFS";
11750 return -EBUSY;
11751 }
11752
11753 if (pool.tier_of >= 0) {
11754 *ss << "pool '" << poolstr << "' is a tier of '"
11755 << osdmap.get_pool_name(pool.tier_of) << "'";
11756 return -EBUSY;
11757 }
11758 if (!pool.tiers.empty()) {
11759 *ss << "pool '" << poolstr << "' has tiers";
11760 for(auto tier : pool.tiers) {
11761 *ss << " " << osdmap.get_pool_name(tier);
11762 }
11763 return -EBUSY;
11764 }
11765
11766 if (!g_conf->mon_allow_pool_delete) {
11767 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
11768 return -EPERM;
11769 }
11770
11771 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
11772 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
11773 return -EPERM;
11774 }
11775
11776 *ss << "pool '" << poolstr << "' removed";
11777 return 0;
11778}
11779
11780/**
11781 * Check if it is safe to add a tier to a base pool
11782 *
11783 * @return
11784 * True if the operation should proceed, false if we should abort here
11785 * (abort doesn't necessarily mean error, could be idempotency)
11786 */
11787bool OSDMonitor::_check_become_tier(
11788 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
11789 const int64_t base_pool_id, const pg_pool_t *base_pool,
11790 int *err,
11791 ostream *ss) const
11792{
11793 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
11794 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
11795
11796 const FSMap &pending_fsmap = mon->mdsmon()->get_pending();
11797 if (pending_fsmap.pool_in_use(tier_pool_id)) {
11798 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
11799 *err = -EBUSY;
11800 return false;
11801 }
11802
11803 if (base_pool->tiers.count(tier_pool_id)) {
11804 assert(tier_pool->tier_of == base_pool_id);
11805 *err = 0;
11806 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
11807 << base_pool_name << "'";
11808 return false;
11809 }
11810
11811 if (base_pool->is_tier()) {
11812 *ss << "pool '" << base_pool_name << "' is already a tier of '"
11813 << osdmap.get_pool_name(base_pool->tier_of) << "', "
11814 << "multiple tiers are not yet supported.";
11815 *err = -EINVAL;
11816 return false;
11817 }
11818
11819 if (tier_pool->has_tiers()) {
11820 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
11821 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
11822 it != tier_pool->tiers.end(); ++it)
11823 *ss << "'" << osdmap.get_pool_name(*it) << "',";
11824 *ss << " multiple tiers are not yet supported.";
11825 *err = -EINVAL;
11826 return false;
11827 }
11828
11829 if (tier_pool->is_tier()) {
11830 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
11831 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
11832 *err = -EINVAL;
11833 return false;
11834 }
11835
11836 *err = 0;
11837 return true;
11838}
11839
11840
11841/**
11842 * Check if it is safe to remove a tier from this base pool
11843 *
11844 * @return
11845 * True if the operation should proceed, false if we should abort here
11846 * (abort doesn't necessarily mean error, could be idempotency)
11847 */
11848bool OSDMonitor::_check_remove_tier(
11849 const int64_t base_pool_id, const pg_pool_t *base_pool,
11850 const pg_pool_t *tier_pool,
11851 int *err, ostream *ss) const
11852{
11853 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
11854
11855 // Apply CephFS-specific checks
11856 const FSMap &pending_fsmap = mon->mdsmon()->get_pending();
11857 if (pending_fsmap.pool_in_use(base_pool_id)) {
94b18763
FG
11858 if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
11859 // If the underlying pool is erasure coded and does not allow EC
11860 // overwrites, we can't permit the removal of the replicated tier that
11861 // CephFS relies on to access it
11862 *ss << "pool '" << base_pool_name <<
11863 "' does not allow EC overwrites and is in use by CephFS"
11864 " via its tier";
7c673cae
FG
11865 *err = -EBUSY;
11866 return false;
11867 }
11868
11869 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
11870 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
11871 "tier is still in use as a writeback cache. Change the cache "
11872 "mode and flush the cache before removing it";
11873 *err = -EBUSY;
11874 return false;
11875 }
11876 }
11877
11878 *err = 0;
11879 return true;
11880}
11881
// Stage removal of a pool in the pending incremental map, after running the
// safety checks in _check_remove_pool().  Also scrubs all per-pool PG state
// (pg_temp, primary_temp, pg_upmap, pg_upmap_items, crush choose_args) from
// both the committed map and the pending increment.
//
// @param pool     id of the pool to remove
// @param ss       receives a human-readable result/error message
// @param no_fake  bypass the mon_fake_pool_delete rename-instead-of-delete
// @return 0 on success (or already pending), -EAGAIN to retry after the
//         pending map commits, or a negative errno from the checks.
int OSDMonitor::_prepare_remove_pool(
  int64_t pool, ostream *ss, bool no_fake)
{
  dout(10) << __func__ << " " << pool << dendl;
  const pg_pool_t *p = osdmap.get_pg_pool(pool);
  int r = _check_remove_pool(pool, *p, ss);
  if (r < 0)
    return r;

  auto new_pool = pending_inc.new_pools.find(pool);
  if (new_pool != pending_inc.new_pools.end()) {
    // if there is a problem with the pending info, wait and retry
    // this op.
    const auto& p = new_pool->second;
    int r = _check_remove_pool(pool, p, ss);
    if (r < 0)
      return -EAGAIN;
  }

  if (pending_inc.old_pools.count(pool)) {
    // idempotent: removal already staged in this proposal
    dout(10) << __func__ << " " << pool << " already pending removal"
	     << dendl;
    return 0;
  }

  if (g_conf->mon_fake_pool_delete && !no_fake) {
    // safety net: rename the pool out of the way instead of deleting data
    string old_name = osdmap.get_pool_name(pool);
    string new_name = old_name + "." + stringify(pool) + ".DELETED";
    dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
	    << old_name << " -> " << new_name << dendl;
    pending_inc.new_pool_names[pool] = new_name;
    return 0;
  }

  // remove
  pending_inc.old_pools.insert(pool);

  // remove any pg_temp mappings for this pool
  for (auto p = osdmap.pg_temp->begin();
       p != osdmap.pg_temp->end();
       ++p) {
    if (p->first.pool() == (uint64_t)pool) {
      dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
	       << p->first << dendl;
      pending_inc.new_pg_temp[p->first].clear();
    }
  }
  // remove any primary_temp mappings for this pool
  for (auto p = osdmap.primary_temp->begin();
       p != osdmap.primary_temp->end();
       ++p) {
    if (p->first.pool() == (uint64_t)pool) {
      dout(10) << __func__ << " " << pool
	       << " removing obsolete primary_temp" << p->first << dendl;
      pending_inc.new_primary_temp[p->first] = -1;
    }
  }
  // remove any pg_upmap mappings for this pool
  for (auto& p : osdmap.pg_upmap) {
    if (p.first.pool() == (uint64_t)pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap "
               << p.first << dendl;
      pending_inc.old_pg_upmap.insert(p.first);
    }
  }
  // remove any pending pg_upmap mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap.begin();
    while (it != pending_inc.new_pg_upmap.end()) {
      if (it->first.pool() == (uint64_t)pool) {
        dout(10) << __func__ << " " << pool
                 << " removing pending pg_upmap "
                 << it->first << dendl;
        it = pending_inc.new_pg_upmap.erase(it);
      } else {
        it++;
      }
    }
  }
  // remove any pg_upmap_items mappings for this pool
  for (auto& p : osdmap.pg_upmap_items) {
    if (p.first.pool() == (uint64_t)pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap_items " << p.first
               << dendl;
      pending_inc.old_pg_upmap_items.insert(p.first);
    }
  }
  // remove any pending pg_upmap_items mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap_items.begin();
    while (it != pending_inc.new_pg_upmap_items.end()) {
      if (it->first.pool() == (uint64_t)pool) {
        dout(10) << __func__ << " " << pool
                 << " removing pending pg_upmap_items "
                 << it->first << dendl;
        it = pending_inc.new_pg_upmap_items.erase(it);
      } else {
        it++;
      }
    }
  }

  // remove any choose_args for this pool
  CrushWrapper newcrush;
  _get_pending_crush(newcrush);
  if (newcrush.have_choose_args(pool)) {
    dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
    newcrush.rm_choose_args(pool);
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
  }
  return 0;
}
11997
11998int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
11999{
12000 dout(10) << "_prepare_rename_pool " << pool << dendl;
12001 if (pending_inc.old_pools.count(pool)) {
12002 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
12003 return -ENOENT;
12004 }
12005 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
12006 p != pending_inc.new_pool_names.end();
12007 ++p) {
12008 if (p->second == newname && p->first != pool) {
12009 return -EEXIST;
12010 }
12011 }
12012
12013 pending_inc.new_pool_names[pool] = newname;
12014 return 0;
12015}
12016
12017bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
12018{
12019 op->mark_osdmon_event(__func__);
12020 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
12021 ostringstream ss;
12022 int ret = _prepare_remove_pool(m->pool, &ss, false);
12023 if (ret == -EAGAIN) {
12024 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12025 return true;
12026 }
12027 if (ret < 0)
12028 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
12029 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
12030 pending_inc.epoch));
12031 return true;
12032}
12033
12034void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
12035 int ret, epoch_t epoch, bufferlist *blp)
12036{
12037 op->mark_osdmon_event(__func__);
12038 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
12039 dout(20) << "_pool_op_reply " << ret << dendl;
12040 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
12041 ret, epoch, get_last_committed(), blp);
12042 mon->send_reply(op, reply);
12043}