]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/OSDMonitor.cc
update sources to v12.1.1
[ceph.git] / ceph / src / mon / OSDMonitor.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19#include <algorithm>
224ce89b
WB
20#include <boost/algorithm/string.hpp>
21#include <locale>
7c673cae
FG
22#include <sstream>
23
31f18b77
FG
24#include "mon/OSDMonitor.h"
25#include "mon/Monitor.h"
26#include "mon/MDSMonitor.h"
27#include "mon/PGMonitor.h"
28#include "mon/MgrStatMonitor.h"
29#include "mon/AuthMonitor.h"
30#include "mon/ConfigKeyService.h"
7c673cae 31
31f18b77
FG
32#include "mon/MonitorDBStore.h"
33#include "mon/Session.h"
7c673cae
FG
34
35#include "crush/CrushWrapper.h"
36#include "crush/CrushTester.h"
37#include "crush/CrushTreeDumper.h"
38
39#include "messages/MOSDBeacon.h"
40#include "messages/MOSDFailure.h"
41#include "messages/MOSDMarkMeDown.h"
42#include "messages/MOSDFull.h"
43#include "messages/MOSDMap.h"
44#include "messages/MMonGetOSDMap.h"
45#include "messages/MOSDBoot.h"
46#include "messages/MOSDAlive.h"
47#include "messages/MPoolOp.h"
48#include "messages/MPoolOpReply.h"
49#include "messages/MOSDPGCreate.h"
50#include "messages/MOSDPGCreated.h"
51#include "messages/MOSDPGTemp.h"
52#include "messages/MMonCommand.h"
53#include "messages/MRemoveSnaps.h"
54#include "messages/MOSDScrub.h"
55#include "messages/MRoute.h"
56
57#include "common/TextTable.h"
58#include "common/Timer.h"
59#include "common/ceph_argparse.h"
60#include "common/perf_counters.h"
61#include "common/strtol.h"
62
63#include "common/config.h"
64#include "common/errno.h"
65
66#include "erasure-code/ErasureCodePlugin.h"
67#include "compressor/Compressor.h"
68#include "common/Checksummer.h"
69
70#include "include/compat.h"
71#include "include/assert.h"
72#include "include/stringify.h"
73#include "include/util.h"
74#include "common/cmdparse.h"
75#include "include/str_list.h"
76#include "include/str_map.h"
224ce89b 77#include "include/scope_guard.h"
7c673cae
FG
78
79#include "json_spirit/json_spirit_reader.h"
80
81#define dout_subsys ceph_subsys_mon
82#define OSD_PG_CREATING_PREFIX "osd_pg_creating"
83
84void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean)
85{
86 if (epoch_by_pg.size() <= ps) {
87 epoch_by_pg.resize(ps + 1, 0);
88 }
89 const auto old_lec = epoch_by_pg[ps];
90 if (old_lec >= last_epoch_clean) {
91 // stale lec
92 return;
93 }
94 epoch_by_pg[ps] = last_epoch_clean;
95 if (last_epoch_clean < floor) {
96 floor = last_epoch_clean;
97 } else if (last_epoch_clean > floor) {
98 if (old_lec == floor) {
99 // probably should increase floor?
100 auto new_floor = std::min_element(std::begin(epoch_by_pg),
101 std::end(epoch_by_pg));
102 floor = *new_floor;
103 }
104 }
105 if (ps != next_missing) {
106 return;
107 }
108 for (; next_missing < epoch_by_pg.size(); next_missing++) {
109 if (epoch_by_pg[next_missing] == 0) {
110 break;
111 }
112 }
113}
114
115void LastEpochClean::remove_pool(uint64_t pool)
116{
117 report_by_pool.erase(pool);
118}
119
120void LastEpochClean::report(const pg_t& pg, epoch_t last_epoch_clean)
121{
122 auto& lec = report_by_pool[pg.pool()];
123 return lec.report(pg.ps(), last_epoch_clean);
124}
125
126epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
127{
128 auto floor = latest.get_epoch();
129 for (auto& pool : latest.get_pools()) {
130 auto reported = report_by_pool.find(pool.first);
131 if (reported == report_by_pool.end()) {
132 return 0;
133 }
134 if (reported->second.next_missing < pool.second.get_pg_num()) {
135 return 0;
136 }
137 if (reported->second.floor < floor) {
138 floor = reported->second.floor;
139 }
140 }
141 return floor;
142}
143
144
145struct C_UpdateCreatingPGs : public Context {
146 OSDMonitor *osdmon;
147 utime_t start;
148 epoch_t epoch;
149 C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
150 osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
151 void finish(int r) override {
152 if (r >= 0) {
153 utime_t end = ceph_clock_now();
154 dout(10) << "osdmap epoch " << epoch << " mapping took "
155 << (end - start) << " seconds" << dendl;
156 osdmon->update_creating_pgs();
157 osdmon->check_pg_creates_subs();
158 }
159 }
160};
161
162#undef dout_prefix
163#define dout_prefix _prefix(_dout, mon, osdmap)
164static ostream& _prefix(std::ostream *_dout, Monitor *mon, const OSDMap& osdmap) {
165 return *_dout << "mon." << mon->name << "@" << mon->rank
166 << "(" << mon->get_state_name()
167 << ").osd e" << osdmap.get_epoch() << " ";
168}
169
// OSDMonitor: the paxos service responsible for the OSD map.
// Note: member-initializer order must match declaration order in the class.
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor *mn,
  Paxos *p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf->mon_osd_cache_size),   // cache of incremental maps
   full_osd_cache(g_conf->mon_osd_cache_size),  // cache of full maps
   last_attempted_minwait_time(utime_t()),
   mapper(mn->cct, &mn->cpu_tp),                // pg->osd mapping workers
   op_tracker(cct, true, 1)
{}
183
184bool OSDMonitor::_have_pending_crush()
185{
186 return pending_inc.crush.length() > 0;
187}
188
// The crush map of the last committed osdmap (ignores any pending crush).
CrushWrapper &OSDMonitor::_get_stable_crush()
{
  return *osdmap.crush;
}
193
194void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush)
195{
196 bufferlist bl;
197 if (pending_inc.crush.length())
198 bl = pending_inc.crush;
199 else
200 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
201
202 bufferlist::iterator p = bl.begin();
203 newcrush.decode(p);
204}
205
// Build the very first osdmap (epoch 1) for a fresh cluster and stage it
// as the pending full map. Uses a map seeded at mkfs time if one was
// stored, otherwise builds a trivial empty map.
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon->monmap->fsid << dendl;

  OSDMap newmap;

  bufferlist bl;
  mon->store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    // a seed map was provided at mkfs; adopt it but force our fsid
    newmap.decode(bl);
    newmap.set_fsid(mon->monmap->fsid);
  } else {
    newmap.build_simple(g_ceph_context, 0, mon->monmap->fsid, 0);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  // new cluster should require latest by default
  if (g_conf->mon_debug_no_require_luminous) {
    // debug escape hatch: pretend to be a pre-luminous cluster
    newmap.require_osd_release = CEPH_RELEASE_KRAKEN;
    derr << __func__ << " mon_debug_no_require_luminous=true" << dendl;
  } else {
    newmap.require_osd_release = CEPH_RELEASE_LUMINOUS;
    // ratios configured > 1.0 are interpreted as percentages
    newmap.full_ratio = g_conf->mon_osd_full_ratio;
    if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
    newmap.backfillfull_ratio = g_conf->mon_osd_backfillfull_ratio;
    if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
    newmap.nearfull_ratio = g_conf->mon_osd_nearfull_ratio;
    if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;
    int r = ceph_release_from_name(
      g_conf->mon_osd_initial_require_min_compat_client.c_str());
    if (r <= 0) {
      // misconfigured release name is fatal for a brand new cluster
      assert(0 == "mon_osd_initial_require_min_compat_client is not valid");
    }
    newmap.require_min_compat_client = r;
  }

  // encode into pending incremental
  newmap.encode(pending_inc.fullmap,
		mon->get_quorum_con_features() | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
253
// Advertise the monitor-store prefixes this service owns (its own service
// prefix plus the creating-pgs key space) so sync/trim include them.
void OSDMonitor::get_store_prefixes(std::set<string>& s)
{
  s.insert(service_name);
  s.insert(OSD_PG_CREATING_PREFIX);
}
259
// Bring the in-memory osdmap up to date with what paxos has committed:
// load the newest stashed full map, then replay committed incrementals one
// epoch at a time, persisting full maps and bookkeeping along the way.
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;  // already caught up
  assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
	   << ", my e " << osdmap.epoch << dendl;

  // any in-flight mapping job was computed against a stale map; cancel it
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
	      << mapping_job.get() << " did not complete, "
	      << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  load_health();

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'.  This is only done when we are building the
   * full maps from the incremental versions.  But don't panic!  We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
	     << " [" << fc << ", " << lc << "]" << dendl;

    // scan backwards for the newest "full_<v>" key actually on disk
    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon->store->exists(get_service_name(), full_key)) {
	dout(10) << __func__ << " found latest full map v " << v << dendl;
	latest_full = v;
	break;
      }
    }

    assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon->store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
	     << latest_full << dendl;
  }

  // if the stashed full map is newer than what we hold, start from it
  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap.decode(latest_bl);
  }

  // luminous+ quorums persist the creating-pgs state in the mon store
  if (mon->monmap->get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    bufferlist bl;
    if (!mon->store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
      auto p = bl.begin();
      std::lock_guard<std::mutex> l(creating_pgs_lock);
      creating_pgs.decode(p);
      dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
	      << creating_pgs.last_scan_epoch
	      << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
    } else {
      dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
	      << dendl;
    }
  }

  // make sure we're using the right pg service.. remove me post-luminous!
  if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
    dout(10) << __func__ << " pgservice is mgrstat" << dendl;
    mon->pgservice = mon->mgrstatmon()->get_pg_stat_service();
  } else {
    dout(10) << __func__ << " pgservice is pg" << dendl;
    mon->pgservice = mon->pgmon()->get_pg_stat_service();
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;  // bytes staged; flushed when it grows too large
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    assert(err == 0);
    assert(inc_bl.length());

    dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
	    << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs.  Encode the full
    // map with the same features as the incremental.  If we don't
    // know, use the quorum features.  If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon->get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
	// This will happen if the mons were running mixed versions in
	// the past or some other circumstance made the full encoded
	// maps divergent.  Reloading here will bring us back into
	// sync with the primary for this and all future maps.  OSDs
	// will also be brought back into sync when they discover the
	// crc mismatch and request a full map from a mon.
	derr << __func__ << " full map CRC mismatch, resetting to canonical"
	     << dendl;
	osdmap = OSDMap();
	osdmap.decode(orig_full_bl);
      }
    } else {
      assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    // the mkfs seed map is only needed to build epoch 1
    if (osdmap.epoch == 1) {
      t->erase("mkfs", "osdmap");
    }

    // make sure we're using the right pg service.. remove me post-luminous!
    if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
      dout(10) << __func__ << " pgservice is mgrstat" << dendl;
      mon->pgservice = mon->mgrstatmon()->get_pg_stat_service();
    } else {
      dout(10) << __func__ << " pgservice is pg" << dendl;
      mon->pgservice = mon->pgmon()->get_pg_stat_service();
    }

    // flush periodically so a long catch-up does not build a huge txn
    if (tx_size > g_conf->mon_sync_max_payload_size*2) {
      mon->store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    if (mon->monmap->get_required_features().contains_all(
	  ceph::features::mon::FEATURE_LUMINOUS)) {
      for (const auto &osd_state : inc.new_state) {
	if (osd_state.second & CEPH_OSD_UP) {
	  // could be marked up *or* down, but we're too lazy to check which
	  last_osd_report.erase(osd_state.first);
	}
	if (osd_state.second & CEPH_OSD_EXISTS) {
	  // could be created *or* destroyed, but we can safely drop it
	  osd_epochs.erase(osd_state.first);
	}
      }
    }
  }

  if (t) {
    mon->store->apply_transaction(t);
  }

  // refresh the down -> pending-out tracking against the new map
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
	dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
	down_pending_out[o] = ceph_clock_now();
      }
    } else {
      if (found != down_pending_out.end()) {
	dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
	down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  if (mon->is_leader()) {
    // kick pgmon, make sure it's seen the latest map
    mon->pgmon()->check_osd_map(osdmap.epoch);
  }

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();

  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon->is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
}
493
494void OSDMonitor::start_mapping()
495{
496 // initiate mapping job
497 if (mapping_job) {
498 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
499 << dendl;
500 mapping_job->abort();
501 }
224ce89b
WB
502 if (!osdmap.get_pools().empty()) {
503 auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
504 mapping_job = mapping.start_update(osdmap, mapper,
505 g_conf->mon_osd_mapping_pgs_per_chunk);
506 dout(10) << __func__ << " started mapping job " << mapping_job.get()
507 << " at " << fin->start << dendl;
508 mapping_job->set_finish_event(fin);
509 } else {
510 dout(10) << __func__ << " no pools, no mapping job" << dendl;
511 mapping_job = nullptr;
512 }
7c673cae
FG
513}
514
515void OSDMonitor::update_msgr_features()
516{
517 set<int> types;
518 types.insert((int)entity_name_t::TYPE_OSD);
519 types.insert((int)entity_name_t::TYPE_CLIENT);
520 types.insert((int)entity_name_t::TYPE_MDS);
521 types.insert((int)entity_name_t::TYPE_MON);
522 for (set<int>::iterator q = types.begin(); q != types.end(); ++q) {
523 uint64_t mask;
524 uint64_t features = osdmap.get_features(*q, &mask);
525 if ((mon->messenger->get_policy(*q).features_required & mask) != features) {
526 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
527 Messenger::Policy p = mon->messenger->get_policy(*q);
528 p.features_required = (p.features_required & ~mask) | features;
529 mon->messenger->set_policy(*q, p);
530 }
531 }
532}
533
534void OSDMonitor::on_active()
535{
536 update_logger();
537
538 if (mon->is_leader()) {
224ce89b 539 mon->clog->debug() << "osdmap " << osdmap;
7c673cae
FG
540 } else {
541 list<MonOpRequestRef> ls;
542 take_all_failures(ls);
543 while (!ls.empty()) {
544 MonOpRequestRef op = ls.front();
545 op->mark_osdmon_event(__func__);
546 dispatch(op);
547 ls.pop_front();
548 }
549 }
550 start_mapping();
551}
552
553void OSDMonitor::on_restart()
554{
555 last_osd_report.clear();
31f18b77
FG
556
557 if (mon->is_leader()) {
558 // fix ruleset != ruleid
559 if (osdmap.crush->has_legacy_rulesets() &&
560 !osdmap.crush->has_multirule_rulesets()) {
561 CrushWrapper newcrush;
562 _get_pending_crush(newcrush);
563 int r = newcrush.renumber_rules_by_ruleset();
564 if (r >= 0) {
565 dout(1) << __func__ << " crush map has ruleset != rule id; fixing" << dendl;
566 pending_inc.crush.clear();
567 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
568 } else {
569 dout(10) << __func__ << " unable to renumber rules by ruleset" << dendl;
570 }
571 }
572 }
7c673cae
FG
573}
574
// Hook invoked on shutdown: cancel any in-flight mapping job and drop
// queued failure reports (the ops are simply released).
void OSDMonitor::on_shutdown()
{
  dout(10) << __func__ << dendl;
  if (mapping_job) {
    dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
	     << dendl;
    mapping_job->abort();
  }

  // discard failure info, waiters
  list<MonOpRequestRef> ls;
  take_all_failures(ls);
  ls.clear();
}
589
590void OSDMonitor::update_logger()
591{
592 dout(10) << "update_logger" << dendl;
593
594 mon->cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
595 mon->cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
596 mon->cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
597 mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
598}
599
7c673cae
FG
600void OSDMonitor::create_pending()
601{
602 pending_inc = OSDMap::Incremental(osdmap.epoch+1);
603 pending_inc.fsid = mon->monmap->fsid;
604
605 dout(10) << "create_pending e " << pending_inc.epoch << dendl;
606
607 // clean up pg_temp, primary_temp
608 OSDMap::clean_temps(g_ceph_context, osdmap, &pending_inc);
609 dout(10) << "create_pending did clean_temps" << dendl;
610
611 // On upgrade OSDMap has new field set by mon_osd_backfillfull_ratio config
612 // instead of osd_backfill_full_ratio config
613 if (osdmap.backfillfull_ratio <= 0) {
614 pending_inc.new_backfillfull_ratio = g_conf->mon_osd_backfillfull_ratio;
615 if (pending_inc.new_backfillfull_ratio > 1.0)
616 pending_inc.new_backfillfull_ratio /= 100;
617 dout(1) << __func__ << " setting backfillfull_ratio = "
618 << pending_inc.new_backfillfull_ratio << dendl;
619 }
31f18b77
FG
620 if (osdmap.get_epoch() > 0 &&
621 osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
7c673cae 622 // transition full ratios from PGMap to OSDMap (on upgrade)
31f18b77
FG
623 float full_ratio = mon->pgservice->get_full_ratio();
624 float nearfull_ratio = mon->pgservice->get_nearfull_ratio();
625 if (osdmap.full_ratio != full_ratio) {
7c673cae 626 dout(10) << __func__ << " full_ratio " << osdmap.full_ratio
31f18b77
FG
627 << " -> " << full_ratio << " (from pgmap)" << dendl;
628 pending_inc.new_full_ratio = full_ratio;
7c673cae 629 }
31f18b77 630 if (osdmap.nearfull_ratio != nearfull_ratio) {
7c673cae 631 dout(10) << __func__ << " nearfull_ratio " << osdmap.nearfull_ratio
31f18b77
FG
632 << " -> " << nearfull_ratio << " (from pgmap)" << dendl;
633 pending_inc.new_nearfull_ratio = nearfull_ratio;
7c673cae
FG
634 }
635 } else {
636 // safety check (this shouldn't really happen)
637 if (osdmap.full_ratio <= 0) {
638 pending_inc.new_full_ratio = g_conf->mon_osd_full_ratio;
639 if (pending_inc.new_full_ratio > 1.0)
640 pending_inc.new_full_ratio /= 100;
641 dout(1) << __func__ << " setting full_ratio = "
642 << pending_inc.new_full_ratio << dendl;
643 }
644 if (osdmap.nearfull_ratio <= 0) {
645 pending_inc.new_nearfull_ratio = g_conf->mon_osd_nearfull_ratio;
646 if (pending_inc.new_nearfull_ratio > 1.0)
647 pending_inc.new_nearfull_ratio /= 100;
648 dout(1) << __func__ << " setting nearfull_ratio = "
649 << pending_inc.new_nearfull_ratio << dendl;
650 }
651 }
652}
653
// Compute the next creating_pgs state for this incremental: fold in newly
// created/deleted pools, drop pgs that have finished creating, and move
// queued pgs into the active creating set (bounded by
// mon_osd_max_creating_pgs). Returns the updated snapshot; the caller is
// responsible for persisting/installing it.
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc)
{
  dout(10) << __func__ << dendl;
  // work on a copy so we only hold the lock for the snapshot
  creating_pgs_t pending_creatings;
  {
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    // pre-luminous: the pgmon may still know about creating pgs we don't
    if (osdmap.get_epoch() &&
	osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
      auto added =
	mon->pgservice->maybe_add_creating_pgs(creating_pgs.last_scan_epoch,
					       osdmap.get_pools(),
					       &pending_creatings);
      dout(7) << __func__ << " " << added << " pgs added from pgmap" << dendl;
    }
    // queue pgs of pools that are new (or grown) in this incremental
    unsigned queued = 0;
    queued += scan_for_creating_pgs(osdmap.get_pools(),
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    queued += scan_for_creating_pgs(inc.new_pools,
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    // purge state belonging to pools deleted by this incremental
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
	       << " pg removed because containing pool deleted: "
	       << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
	     << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // process queue: promote queued pg ranges into the creating set until
  // the cap is reached
  unsigned max = MAX(1, g_conf->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  while (pending_creatings.pgs.size() < max &&
	 !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
	     << " created " << p->second.created
	     << " modified " << p->second.modified
	     << " [" << p->second.start << "-" << p->second.end << ")"
	     << dendl;
    // take as many pgs from this pool's range as the cap allows
    int n = MIN(max - pending_creatings.pgs.size(),
		p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(pgid, make_pair(inc.epoch,
						    p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
	       << " now [" << p->second.start << "-" << p->second.end << ")"
	       << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
	   << " pools" << dendl;
  dout(10) << __func__ << " " << pending_creatings.pgs.size() - total
	   << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
746
// Pre-populate pg_temp entries for pgs whose mapping will change with the
// pending incremental, so clients keep addressing the old (still complete)
// acting set while peering catches up. Either primes everything (new crush
// map, new up osds, or a large estimated blast radius) via a parallel job,
// or walks only the pgs touched by "interesting" osds, bounded by
// mon_osd_prime_pg_temp_max_time.
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  if (pending_inc.crush.length()) {
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  // osds that are going down in this incremental (state flip while up)
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    if ((p->second & CEPH_OSD_UP) &&
	osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  // weight reductions are local; a weight increase can shift pgs anywhere
  for (map<int32_t,uint32_t>::iterator p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
	       << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;  // nothing interesting this round

  if (!all) {
    // rough cost estimate: pgs on one sample osd times the osd count;
    // fall back to priming everything if it exceeds the configured share
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
	g_conf->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds >= "
	       << g_conf->mon_osd_prime_pg_temp_max_estimate << " of total "
	       << mapping.get_num_pgs() << " pgs, all"
	       << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds" << dendl;
    }
  }

  // the map we will be transitioning to
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    // prime every pg via the parallel mapper, with a time budget
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf->mon_osd_mapping_pgs_per_chunk);
    if (job.wait_for(g_conf->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
	       << g_conf->mon_osd_prime_pg_temp_max_time
	       << ", stopping" << dendl;
      job.abort();
    }
  } else {
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf->mon_osd_prime_pg_temp_max_time;
    const int chunk = 1000;  // check the clock once per this many pgs
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
	if (!did_pgs.insert(pgid).second) {
	  continue;  // already handled via another interesting osd
	}
	prime_pg_temp(next, pgid);
	if (--n <= 0) {
	  n = chunk;
	  if (ceph_clock_now() > stop) {
	    dout(10) << __func__ << " consumed more than "
		     << g_conf->mon_osd_prime_pg_temp_max_time
		     << " seconds, stopping"
		     << dendl;
	    return;
	  }
	}
      }
    }
  }
}
849
// Stage a pg_temp entry pinning 'pgid' to its current acting set if the
// pending map would change that set. Skips pgs still being created, pgs
// that don't exist, and cases where keeping the old acting set cannot help
// (empty or below min_size).
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  if (mon->monmap->get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    // TODO: remove this creating_pgs direct access?
    if (creating_pgs.pgs.count(pgid)) {
      return;
    }
  } else {
    if (mon->pgservice->is_creating_pg(pgid)) {
      return;
    }
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  // current mapping (from the precomputed mapping of the stable osdmap)
  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  // mapping under the pending map
  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
			    &next_acting, &next_acting_primary);
  if (acting == next_acting)
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
	   << " -> " << next_up << "/" << next_acting
	   << ", priming " << acting
	   << dendl;
  {
    // may be called concurrently from the mapper's worker threads
    Mutex::Locker l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
897
898/**
899 * @note receiving a transaction in this function gives a fair amount of
900 * freedom to the service implementation if it does need it. It shouldn't.
901 */
// Finalize pending_inc and serialize it (plus the resulting full map,
// creating-pg state, and health checks) into the given transaction.
// Called by the Paxos machinery when this proposal is about to commit.
void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
{
  dout(10) << "encode_pending e " << pending_inc.epoch
	   << dendl;

  // finalize up pending_inc
  pending_inc.modified = ceph_clock_now();

  int r = pending_inc.propagate_snaps_to_tiers(g_ceph_context, osdmap);
  assert(r == 0);

  // Decide whether the background pg mapping job can be used to prime
  // pg_temp entries; if it never finished or is for an older epoch, skip.
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
	      << mapping_job.get() << " did not complete, "
	      << mapping_job->shards << " left" << dendl;
      mapping_job->abort();
    } else if (mapping.get_epoch() < osdmap.get_epoch()) {
      dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
	      << mapping_job.get() << " is prior epoch "
	      << mapping.get_epoch() << dendl;
    } else {
      if (g_conf->mon_osd_prime_pg_temp) {
	maybe_prime_pg_temp();
      }
    }
  } else if (g_conf->mon_osd_prime_pg_temp) {
    dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
	    << dendl;
  }
  mapping_job.reset();

  bufferlist bl;

  {
    // Apply pending_inc to a scratch copy of the map so we can inspect
    // the *post-commit* state and append follow-up changes to pending_inc.
    OSDMap tmp;
    tmp.deepish_copy_from(osdmap);
    tmp.apply_incremental(pending_inc);

    if (tmp.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
      // set or clear full/nearfull?
      int full, backfill, nearfull;
      tmp.count_full_nearfull_osds(&full, &backfill, &nearfull);
      if (full > 0) {
	if (!tmp.test_flag(CEPH_OSDMAP_FULL)) {
	  dout(10) << __func__ << " setting full flag" << dendl;
	  add_flag(CEPH_OSDMAP_FULL);
	  remove_flag(CEPH_OSDMAP_NEARFULL);
	}
      } else {
	if (tmp.test_flag(CEPH_OSDMAP_FULL)) {
	  dout(10) << __func__ << " clearing full flag" << dendl;
	  remove_flag(CEPH_OSDMAP_FULL);
	}
	if (nearfull > 0) {
	  if (!tmp.test_flag(CEPH_OSDMAP_NEARFULL)) {
	    dout(10) << __func__ << " setting nearfull flag" << dendl;
	    add_flag(CEPH_OSDMAP_NEARFULL);
	  }
	} else {
	  if (tmp.test_flag(CEPH_OSDMAP_NEARFULL)) {
	    dout(10) << __func__ << " clearing nearfull flag" << dendl;
	    remove_flag(CEPH_OSDMAP_NEARFULL);
	  }
	}
      }

      // min_compat_client?  0 means it has never been set; pin it to
      // whatever the current map contents actually require.
      if (tmp.require_min_compat_client == 0) {
	auto mv = tmp.get_min_compat_client();
	dout(1) << __func__ << " setting require_min_compat_client to currently "
		<< "required " << ceph_release_name(mv) << dendl;
	mon->clog->info() << "setting require_min_compat_client to currently "
			  << "required " << ceph_release_name(mv);
	pending_inc.new_require_min_compat_client = mv;
      }

      if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
	// convert ec profile ruleset-* -> crush-*
	// (pre-luminous profiles used the old "ruleset" terminology)
	for (auto& p : tmp.erasure_code_profiles) {
	  bool changed = false;
	  map<string,string> newprofile;
	  for (auto& q : p.second) {
	    if (q.first.find("ruleset-") == 0) {
	      string key = "crush-";
	      key += q.first.substr(8);
	      newprofile[key] = q.second;
	      changed = true;
	      dout(20) << " updating ec profile " << p.first
		       << " key " << q.first << " -> " << key << dendl;
	    } else {
	      newprofile[q.first] = q.second;
	    }
	  }
	  if (changed) {
	    dout(10) << " updated ec profile " << p.first << ": "
		     << newprofile << dendl;
	    pending_inc.new_erasure_code_profiles[p.first] = newprofile;
	  }
	}
      }
    }
  }

  // tell me about it
  // NOTE: new_state is an xor mask; a set CEPH_OSD_UP bit here means the
  // osd's up bit is being toggled off (i.e. it is going DOWN).
  for (auto i = pending_inc.new_state.begin();
       i != pending_inc.new_state.end();
       ++i) {
    int s = i->second ? i->second : CEPH_OSD_UP;
    if (s & CEPH_OSD_UP)
      dout(2) << " osd." << i->first << " DOWN" << dendl;
    if (s & CEPH_OSD_EXISTS)
      dout(2) << " osd." << i->first << " DNE" << dendl;
  }
  for (map<int32_t,entity_addr_t>::iterator i = pending_inc.new_up_client.begin();
       i != pending_inc.new_up_client.end();
       ++i) {
    //FIXME: insert cluster addresses too
    dout(2) << " osd." << i->first << " UP " << i->second << dendl;
  }
  for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
       i != pending_inc.new_weight.end();
       ++i) {
    if (i->second == CEPH_OSD_OUT) {
      dout(2) << " osd." << i->first << " OUT" << dendl;
    } else if (i->second == CEPH_OSD_IN) {
      dout(2) << " osd." << i->first << " IN" << dendl;
    } else {
      dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
    }
  }

  // features for osdmap and its incremental
  uint64_t features = mon->get_quorum_con_features();

  // encode full map and determine its crc
  OSDMap tmp;
  {
    tmp.deepish_copy_from(osdmap);
    tmp.apply_incremental(pending_inc);

    // determine appropriate features: strip feature bits that the
    // cluster's require_osd_release has not yet reached, so older
    // daemons can still decode the result
    if (tmp.require_osd_release < CEPH_RELEASE_LUMINOUS) {
      dout(10) << __func__ << " encoding without feature SERVER_LUMINOUS"
	       << dendl;
      features &= ~CEPH_FEATURE_SERVER_LUMINOUS;
    }
    if (tmp.require_osd_release < CEPH_RELEASE_KRAKEN) {
      dout(10) << __func__ << " encoding without feature SERVER_KRAKEN | "
	       << "MSG_ADDR2" << dendl;
      features &= ~(CEPH_FEATURE_SERVER_KRAKEN |
		    CEPH_FEATURE_MSG_ADDR2);
    }
    if (tmp.require_osd_release < CEPH_RELEASE_JEWEL) {
      dout(10) << __func__ << " encoding without feature SERVER_JEWEL" << dendl;
      features &= ~CEPH_FEATURE_SERVER_JEWEL;
    }
    dout(10) << __func__ << " encoding full map with " << features << dendl;

    bufferlist fullbl;
    ::encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
    pending_inc.full_crc = tmp.get_crc();

    // include full map in the txn. note that old monitors will
    // overwrite this. new ones will now skip the local full map
    // encode and reload from this.
    put_version_full(t, pending_inc.epoch, fullbl);
  }

  // encode
  assert(get_last_committed() + 1 == pending_inc.epoch);
  ::encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);

  dout(20) << " full_crc " << tmp.get_crc()
	   << " inc_crc " << pending_inc.inc_crc << dendl;

  /* put everything in the transaction */
  put_version(t, pending_inc.epoch, bl);
  put_last_committed(t, pending_inc.epoch);

  // metadata, too!
  for (map<int,bufferlist>::iterator p = pending_metadata.begin();
       p != pending_metadata.end();
       ++p)
    t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
  for (set<int>::iterator p = pending_metadata_rm.begin();
       p != pending_metadata_rm.end();
       ++p)
    t->erase(OSD_METADATA_PREFIX, stringify(*p));
  pending_metadata.clear();
  pending_metadata_rm.clear();

  // and pg creating, also!
  if (mon->monmap->get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    auto pending_creatings = update_pending_pgs(pending_inc);
    if (osdmap.get_epoch() &&
	osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
      dout(7) << __func__ << " in the middle of upgrading, "
	      << " trimming pending creating_pgs using pgmap" << dendl;
      mon->pgservice->maybe_trim_creating_pgs(&pending_creatings);
    }
    bufferlist creatings_bl;
    ::encode(pending_creatings, creatings_bl);
    t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
  }

  // health
  health_check_map_t next;
  tmp.check_health(&next);
  encode_health(next, t);
}
1114
1115void OSDMonitor::trim_creating_pgs(creating_pgs_t* creating_pgs,
31f18b77 1116 const ceph::unordered_map<pg_t,pg_stat_t>& pg_stat)
7c673cae
FG
1117{
1118 auto p = creating_pgs->pgs.begin();
1119 while (p != creating_pgs->pgs.end()) {
31f18b77
FG
1120 auto q = pg_stat.find(p->first);
1121 if (q != pg_stat.end() &&
7c673cae
FG
1122 !(q->second.state & PG_STATE_CREATING)) {
1123 dout(20) << __func__ << " pgmap shows " << p->first << " is created"
1124 << dendl;
1125 p = creating_pgs->pgs.erase(p);
7c673cae
FG
1126 } else {
1127 ++p;
1128 }
1129 }
1130}
1131
1132int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
1133{
1134 bufferlist bl;
1135 int r = mon->store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
1136 if (r < 0)
1137 return r;
1138 try {
1139 bufferlist::iterator p = bl.begin();
1140 ::decode(m, p);
1141 }
1142 catch (buffer::error& e) {
1143 if (err)
1144 *err << "osd." << osd << " metadata is corrupt";
1145 return -EIO;
1146 }
1147 return 0;
1148}
1149
31f18b77
FG
1150void OSDMonitor::count_metadata(const string& field, Formatter *f)
1151{
1152 map<string,int> by_val;
1153 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
1154 if (osdmap.is_up(osd)) {
1155 map<string,string> meta;
1156 load_metadata(osd, meta, nullptr);
1157 auto p = meta.find(field);
1158 if (p == meta.end()) {
1159 by_val["unknown"]++;
1160 } else {
1161 by_val[p->second]++;
1162 }
1163 }
1164 }
1165 f->open_object_section(field.c_str());
1166 for (auto& p : by_val) {
1167 f->dump_int(p.first.c_str(), p.second);
1168 }
1169 f->close_section();
1170}
1171
7c673cae
FG
1172int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
1173{
1174 map<string, string> metadata;
1175 int r = load_metadata(osd, metadata, nullptr);
1176 if (r < 0)
1177 return r;
1178
1179 auto it = metadata.find("osd_objectstore");
1180 if (it == metadata.end())
1181 return -ENOENT;
1182 *type = it->second;
1183 return 0;
1184}
1185
1186bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
1187 const pg_pool_t &pool,
1188 ostream *err)
1189{
1190 // just check a few pgs for efficiency - this can't give a guarantee anyway,
1191 // since filestore osds could always join the pool later
1192 set<int> checked_osds;
1193 for (unsigned ps = 0; ps < MIN(8, pool.get_pg_num()); ++ps) {
1194 vector<int> up, acting;
1195 pg_t pgid(ps, pool_id, -1);
1196 osdmap.pg_to_up_acting_osds(pgid, up, acting);
1197 for (int osd : up) {
1198 if (checked_osds.find(osd) != checked_osds.end())
1199 continue;
1200 string objectstore_type;
1201 int r = get_osd_objectstore_type(osd, &objectstore_type);
1202 // allow with missing metadata, e.g. due to an osd never booting yet
1203 if (r < 0 || objectstore_type == "bluestore") {
1204 checked_osds.insert(osd);
1205 continue;
1206 }
1207 *err << "osd." << osd << " uses " << objectstore_type;
1208 return false;
1209 }
1210 }
1211 return true;
1212}
1213
1214int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
1215{
1216 map<string,string> m;
1217 if (int r = load_metadata(osd, m, err))
1218 return r;
1219 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
1220 f->dump_string(p->first.c_str(), p->second);
1221 return 0;
1222}
1223
1224void OSDMonitor::print_nodes(Formatter *f)
1225{
1226 // group OSDs by their hosts
1227 map<string, list<int> > osds; // hostname => osd
1228 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
1229 map<string, string> m;
1230 if (load_metadata(osd, m, NULL)) {
1231 continue;
1232 }
1233 map<string, string>::iterator hostname = m.find("hostname");
1234 if (hostname == m.end()) {
1235 // not likely though
1236 continue;
1237 }
1238 osds[hostname->second].push_back(osd);
1239 }
1240
1241 dump_services(f, osds, "osd");
1242}
1243
1244void OSDMonitor::share_map_with_random_osd()
1245{
1246 if (osdmap.get_num_up_osds() == 0) {
1247 dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
1248 return;
1249 }
1250
1251 MonSession *s = mon->session_map.get_random_osd_session(&osdmap);
1252 if (!s) {
1253 dout(10) << __func__ << " no up osd on our session map" << dendl;
1254 return;
1255 }
1256
1257 dout(10) << "committed, telling random " << s->inst << " all about it" << dendl;
1258 // whatev, they'll request more if they need it
1259 MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch());
1260 s->con->send_message(m);
1261 // NOTE: do *not* record osd has up to this epoch (as we do
1262 // elsewhere) as they may still need to request older values.
1263}
1264
1265version_t OSDMonitor::get_trim_to()
1266{
31f18b77
FG
1267 if (mon->get_quorum().empty()) {
1268 dout(10) << __func__ << ": quorum not formed" << dendl;
1269 return 0;
1270 }
7c673cae 1271
31f18b77 1272 epoch_t floor;
7c673cae
FG
1273 if (mon->monmap->get_required_features().contains_all(
1274 ceph::features::mon::FEATURE_LUMINOUS)) {
1275 {
31f18b77 1276 // TODO: Get this hidden in PGStatService
7c673cae
FG
1277 std::lock_guard<std::mutex> l(creating_pgs_lock);
1278 if (!creating_pgs.pgs.empty()) {
1279 return 0;
1280 }
1281 }
1282 floor = get_min_last_epoch_clean();
1283 } else {
31f18b77 1284 if (!mon->pgservice->is_readable())
7c673cae 1285 return 0;
31f18b77 1286 if (mon->pgservice->have_creating_pgs()) {
7c673cae
FG
1287 return 0;
1288 }
31f18b77 1289 floor = mon->pgservice->get_min_last_epoch_clean();
7c673cae
FG
1290 }
1291 {
1292 dout(10) << " min_last_epoch_clean " << floor << dendl;
1293 if (g_conf->mon_osd_force_trim_to > 0 &&
1294 g_conf->mon_osd_force_trim_to < (int)get_last_committed()) {
1295 floor = g_conf->mon_osd_force_trim_to;
1296 dout(10) << " explicit mon_osd_force_trim_to = " << floor << dendl;
1297 }
1298 unsigned min = g_conf->mon_min_osdmap_epochs;
1299 if (floor + min > get_last_committed()) {
1300 if (min < get_last_committed())
1301 floor = get_last_committed() - min;
1302 else
1303 floor = 0;
1304 }
1305 if (floor > get_first_committed())
1306 return floor;
1307 }
1308 return 0;
1309}
1310
1311epoch_t OSDMonitor::get_min_last_epoch_clean() const
1312{
1313 auto floor = last_epoch_clean.get_lower_bound(osdmap);
1314 // also scan osd epochs
1315 // don't trim past the oldest reported osd epoch
1316 for (auto& osd_epoch : osd_epochs) {
1317 if (osd_epoch.second < floor) {
1318 floor = osd_epoch.second;
1319 }
1320 }
1321 return floor;
1322}
1323
1324void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
1325 version_t first)
1326{
1327 dout(10) << __func__ << " including full map for e " << first << dendl;
1328 bufferlist bl;
1329 get_version_full(first, bl);
1330 put_version_full(tx, first, bl);
1331}
1332
1333// -------------
1334
1335bool OSDMonitor::preprocess_query(MonOpRequestRef op)
1336{
1337 op->mark_osdmon_event(__func__);
1338 Message *m = op->get_req();
1339 dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
1340
1341 switch (m->get_type()) {
1342 // READs
1343 case MSG_MON_COMMAND:
1344 return preprocess_command(op);
1345 case CEPH_MSG_MON_GET_OSDMAP:
1346 return preprocess_get_osdmap(op);
1347
1348 // damp updates
1349 case MSG_OSD_MARK_ME_DOWN:
1350 return preprocess_mark_me_down(op);
1351 case MSG_OSD_FULL:
1352 return preprocess_full(op);
1353 case MSG_OSD_FAILURE:
1354 return preprocess_failure(op);
1355 case MSG_OSD_BOOT:
1356 return preprocess_boot(op);
1357 case MSG_OSD_ALIVE:
1358 return preprocess_alive(op);
1359 case MSG_OSD_PG_CREATED:
1360 return preprocess_pg_created(op);
1361 case MSG_OSD_PGTEMP:
1362 return preprocess_pgtemp(op);
1363 case MSG_OSD_BEACON:
1364 return preprocess_beacon(op);
1365
1366 case CEPH_MSG_POOLOP:
1367 return preprocess_pool_op(op);
1368
1369 case MSG_REMOVE_SNAPS:
1370 return preprocess_remove_snaps(op);
1371
1372 default:
1373 ceph_abort();
1374 return true;
1375 }
1376}
1377
1378bool OSDMonitor::prepare_update(MonOpRequestRef op)
1379{
1380 op->mark_osdmon_event(__func__);
1381 Message *m = op->get_req();
1382 dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;
1383
1384 switch (m->get_type()) {
1385 // damp updates
1386 case MSG_OSD_MARK_ME_DOWN:
1387 return prepare_mark_me_down(op);
1388 case MSG_OSD_FULL:
1389 return prepare_full(op);
1390 case MSG_OSD_FAILURE:
1391 return prepare_failure(op);
1392 case MSG_OSD_BOOT:
1393 return prepare_boot(op);
1394 case MSG_OSD_ALIVE:
1395 return prepare_alive(op);
1396 case MSG_OSD_PG_CREATED:
1397 return prepare_pg_created(op);
1398 case MSG_OSD_PGTEMP:
1399 return prepare_pgtemp(op);
1400 case MSG_OSD_BEACON:
1401 return prepare_beacon(op);
1402
1403 case MSG_MON_COMMAND:
1404 return prepare_command(op);
1405
1406 case CEPH_MSG_POOLOP:
1407 return prepare_pool_op(op);
1408
1409 case MSG_REMOVE_SNAPS:
1410 return prepare_remove_snaps(op);
1411
1412
1413 default:
1414 ceph_abort();
1415 }
1416
1417 return false;
1418}
1419
1420bool OSDMonitor::should_propose(double& delay)
1421{
1422 dout(10) << "should_propose" << dendl;
1423
1424 // if full map, propose immediately! any subsequent changes will be clobbered.
1425 if (pending_inc.fullmap.length())
1426 return true;
1427
1428 // adjust osd weights?
1429 if (!osd_weight.empty() &&
1430 osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
1431 dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
1432 osdmap.adjust_osd_weights(osd_weight, pending_inc);
1433 delay = 0.0;
1434 osd_weight.clear();
1435 return true;
1436 }
1437
1438 // propose as fast as possible if updating up_thru or pg_temp
1439 // want to merge OSDMap changes as much as possible
1440 if ((pending_inc.new_primary_temp.size() == 1
1441 || pending_inc.new_up_thru.size() == 1)
1442 && pending_inc.new_state.size() < 2) {
1443 dout(15) << " propose as fast as possible for up_thru/pg_temp" << dendl;
1444
1445 utime_t now = ceph_clock_now();
1446 if (now - last_attempted_minwait_time > g_conf->paxos_propose_interval
1447 && now - paxos->get_last_commit_time() > g_conf->paxos_min_wait) {
1448 delay = g_conf->paxos_min_wait;
1449 last_attempted_minwait_time = now;
1450 return true;
1451 }
1452 }
1453
1454 return PaxosService::should_propose(delay);
1455}
1456
1457
1458
1459// ---------------------------
1460// READs
1461
1462bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
1463{
1464 op->mark_osdmon_event(__func__);
1465 MMonGetOSDMap *m = static_cast<MMonGetOSDMap*>(op->get_req());
1466 dout(10) << __func__ << " " << *m << dendl;
1467 MOSDMap *reply = new MOSDMap(mon->monmap->fsid);
1468 epoch_t first = get_first_committed();
1469 epoch_t last = osdmap.get_epoch();
1470 int max = g_conf->osd_map_message_max;
1471 for (epoch_t e = MAX(first, m->get_full_first());
1472 e <= MIN(last, m->get_full_last()) && max > 0;
1473 ++e, --max) {
1474 int r = get_version_full(e, reply->maps[e]);
1475 assert(r >= 0);
1476 }
1477 for (epoch_t e = MAX(first, m->get_inc_first());
1478 e <= MIN(last, m->get_inc_last()) && max > 0;
1479 ++e, --max) {
1480 int r = get_version(e, reply->incremental_maps[e]);
1481 assert(r >= 0);
1482 }
1483 reply->oldest_map = first;
1484 reply->newest_map = last;
1485 mon->send_reply(op, reply);
1486 return true;
1487}
1488
1489
1490// ---------------------------
1491// UPDATEs
1492
1493// failure --
1494
1495bool OSDMonitor::check_source(PaxosServiceMessage *m, uuid_d fsid) {
1496 // check permissions
1497 MonSession *session = m->get_session();
1498 if (!session)
1499 return true;
1500 if (!session->is_capable("osd", MON_CAP_X)) {
1501 dout(0) << "got MOSDFailure from entity with insufficient caps "
1502 << session->caps << dendl;
1503 return true;
1504 }
1505 if (fsid != mon->monmap->fsid) {
1506 dout(0) << "check_source: on fsid " << fsid
1507 << " != " << mon->monmap->fsid << dendl;
1508 return true;
1509 }
1510 return false;
1511}
1512
1513
// Fast-path filter for MOSDFailure reports.  Returns true when the report
// can be answered from the current map (stale, duplicate, disallowed, or
// from an invalid reporter); false when prepare_failure() should run.
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
  // who is target_osd
  int badboy = m->get_target().name.num();

  // check permissions
  if (check_source(m, m->fsid))
    goto didit;

  // first, verify the reporting host is valid: it must exist in the map,
  // its address must match, and (for failure reports) it must itself be up
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
	osdmap.get_addr(from) != m->get_orig_source_inst().addr ||
	(osdmap.is_down(from) && m->if_osd_failed())) {
      dout(5) << "preprocess_failure from dead osd." << from << ", ignoring" << dendl;
      // bring the (possibly restarted) reporter up to date
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?  target already down in the current map
  if (osdmap.is_down(badboy)) {
    dout(5) << "preprocess_failure dne(/dup?): " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  // target's identity in the report doesn't match the map (stale report)
  if (osdmap.get_inst(badboy) != m->get_target()) {
    dout(5) << "preprocess_failure wrong osd: report " << m->get_target() << " != map's " << osdmap.get_inst(badboy)
	    << ", from " << m->get_orig_source_inst() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?  (osd came back up after the reported epoch)
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    dout(5) << "preprocess_failure dup/old: " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // nodown flags / min up ratio may forbid marking this osd down
  if (!can_mark_down(badboy)) {
    dout(5) << "preprocess_failure ignoring report of " << m->get_target() << " from " << m->get_orig_source_inst() << dendl;
    goto didit;
  }

  dout(10) << "preprocess_failure new: " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
  return false;

 didit:
  return true;
}
1573
1574class C_AckMarkedDown : public C_MonOp {
1575 OSDMonitor *osdmon;
1576public:
1577 C_AckMarkedDown(
1578 OSDMonitor *osdmon,
1579 MonOpRequestRef op)
1580 : C_MonOp(op), osdmon(osdmon) {}
1581
1582 void _finish(int) override {
1583 MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
1584 osdmon->mon->send_reply(
1585 op,
1586 new MOSDMarkMeDown(
1587 m->fsid,
1588 m->get_target(),
1589 m->get_epoch(),
1590 false)); // ACK itself does not request an ack
1591 }
1592 ~C_AckMarkedDown() override {
1593 }
1594};
1595
1596bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
1597{
1598 op->mark_osdmon_event(__func__);
1599 MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
1600 int requesting_down = m->get_target().name.num();
1601 int from = m->get_orig_source().num();
1602
1603 // check permissions
1604 if (check_source(m, m->fsid))
1605 goto reply;
1606
1607 // first, verify the reporting host is valid
1608 if (!m->get_orig_source().is_osd())
1609 goto reply;
1610
1611 if (!osdmap.exists(from) ||
1612 osdmap.is_down(from) ||
1613 osdmap.get_addr(from) != m->get_target().addr) {
1614 dout(5) << "preprocess_mark_me_down from dead osd."
1615 << from << ", ignoring" << dendl;
1616 send_incremental(op, m->get_epoch()+1);
1617 goto reply;
1618 }
1619
1620 // no down might be set
1621 if (!can_mark_down(requesting_down))
1622 goto reply;
1623
1624 dout(10) << "MOSDMarkMeDown for: " << m->get_target() << dendl;
1625 return false;
1626
1627 reply:
1628 if (m->request_ack) {
1629 Context *c(new C_AckMarkedDown(this, op));
1630 c->complete(0);
1631 }
1632 return true;
1633}
1634
1635bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
1636{
1637 op->mark_osdmon_event(__func__);
1638 MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
1639 int target_osd = m->get_target().name.num();
1640
1641 assert(osdmap.is_up(target_osd));
1642 assert(osdmap.get_addr(target_osd) == m->get_target().addr);
1643
1644 mon->clog->info() << "osd." << target_osd << " marked itself down";
1645 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
1646 if (m->request_ack)
1647 wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
1648 return true;
1649}
1650
1651bool OSDMonitor::can_mark_down(int i)
1652{
1653 if (osdmap.test_flag(CEPH_OSDMAP_NODOWN)) {
31f18b77
FG
1654 dout(5) << __func__ << " NODOWN flag set, will not mark osd." << i
1655 << " down" << dendl;
1656 return false;
1657 }
1658
1659 if (osdmap.is_nodown(i)) {
1660 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
1661 << "will not mark it down" << dendl;
7c673cae
FG
1662 return false;
1663 }
31f18b77 1664
7c673cae
FG
1665 int num_osds = osdmap.get_num_osds();
1666 if (num_osds == 0) {
31f18b77 1667 dout(5) << __func__ << " no osds" << dendl;
7c673cae
FG
1668 return false;
1669 }
1670 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
1671 float up_ratio = (float)up / (float)num_osds;
1672 if (up_ratio < g_conf->mon_osd_min_up_ratio) {
31f18b77 1673 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
7c673cae
FG
1674 << g_conf->mon_osd_min_up_ratio
1675 << ", will not mark osd." << i << " down" << dendl;
1676 return false;
1677 }
1678 return true;
1679}
1680
1681bool OSDMonitor::can_mark_up(int i)
1682{
1683 if (osdmap.test_flag(CEPH_OSDMAP_NOUP)) {
31f18b77
FG
1684 dout(5) << __func__ << " NOUP flag set, will not mark osd." << i
1685 << " up" << dendl;
1686 return false;
1687 }
1688
1689 if (osdmap.is_noup(i)) {
1690 dout(5) << __func__ << " osd." << i << " is marked as noup, "
1691 << "will not mark it up" << dendl;
7c673cae
FG
1692 return false;
1693 }
31f18b77 1694
7c673cae
FG
1695 return true;
1696}
1697
1698/**
1699 * @note the parameter @p i apparently only exists here so we can output the
1700 * osd's id on messages.
1701 */
1702bool OSDMonitor::can_mark_out(int i)
1703{
1704 if (osdmap.test_flag(CEPH_OSDMAP_NOOUT)) {
1705 dout(5) << __func__ << " NOOUT flag set, will not mark osds out" << dendl;
1706 return false;
1707 }
31f18b77
FG
1708
1709 if (osdmap.is_noout(i)) {
1710 dout(5) << __func__ << " osd." << i << " is marked as noout, "
1711 << "will not mark it out" << dendl;
1712 return false;
1713 }
1714
7c673cae
FG
1715 int num_osds = osdmap.get_num_osds();
1716 if (num_osds == 0) {
1717 dout(5) << __func__ << " no osds" << dendl;
1718 return false;
1719 }
1720 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
1721 float in_ratio = (float)in / (float)num_osds;
1722 if (in_ratio < g_conf->mon_osd_min_in_ratio) {
1723 if (i >= 0)
1724 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
1725 << g_conf->mon_osd_min_in_ratio
1726 << ", will not mark osd." << i << " out" << dendl;
1727 else
1728 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
1729 << g_conf->mon_osd_min_in_ratio
1730 << ", will not mark osds out" << dendl;
1731 return false;
1732 }
1733
1734 return true;
1735}
1736
1737bool OSDMonitor::can_mark_in(int i)
1738{
1739 if (osdmap.test_flag(CEPH_OSDMAP_NOIN)) {
31f18b77
FG
1740 dout(5) << __func__ << " NOIN flag set, will not mark osd." << i
1741 << " in" << dendl;
1742 return false;
1743 }
1744
1745 if (osdmap.is_noin(i)) {
1746 dout(5) << __func__ << " osd." << i << " is marked as noin, "
1747 << "will not mark it in" << dendl;
7c673cae
FG
1748 return false;
1749 }
31f18b77 1750
7c673cae
FG
1751 return true;
1752}
1753
1754bool OSDMonitor::check_failures(utime_t now)
1755{
1756 bool found_failure = false;
1757 for (map<int,failure_info_t>::iterator p = failure_info.begin();
1758 p != failure_info.end();
1759 ++p) {
1760 if (can_mark_down(p->first)) {
1761 found_failure |= check_failure(now, p->first, p->second);
1762 }
1763 }
1764 return found_failure;
1765}
1766
// Evaluate whether osd.target_osd has enough failure evidence to be
// marked down: the failure must have lasted longer than an (adaptively
// stretched) grace period and be reported from enough distinct failure
// domains.  Returns true if the osd is (or was already) pending down.
bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
{
  // already pending failure?
  if (pending_inc.new_state.count(target_osd) &&
      pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
    dout(10) << " already pending failure" << dendl;
    return true;
  }

  set<string> reporters_by_subtree;
  string reporter_subtree_level = g_conf->mon_osd_reporter_subtree_level;
  utime_t orig_grace(g_conf->osd_heartbeat_grace, 0);
  utime_t max_failed_since = fi.get_failed_since();
  utime_t failed_for = now - max_failed_since;

  utime_t grace = orig_grace;
  double my_grace = 0, peer_grace = 0;
  double decay_k = 0;
  if (g_conf->mon_osd_adjust_heartbeat_grace) {
    // exponential decay constant: weight halves every laggy_halflife secs
    double halflife = (double)g_conf->mon_osd_laggy_halflife;
    decay_k = ::log(.5) / halflife;

    // scale grace period based on historical probability of 'lagginess'
    // (false positive failures due to slowness).
    const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
    double decay = exp((double)failed_for * decay_k);
    dout(20) << " halflife " << halflife << " decay_k " << decay_k
	     << " failed_for " << failed_for << " decay " << decay << dendl;
    my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
    grace += my_grace;
  }

  // consider the peers reporting a failure a proxy for a potential
  // 'subcluster' over the overall cluster that is similarly
  // laggy. this is clearly not true in all cases, but will sometimes
  // help us localize the grace correction to a subset of the system
  // (say, a rack with a bad switch) that is unhappy.
  assert(fi.reporters.size());
  for (map<int,failure_reporter_t>::iterator p = fi.reporters.begin();
       p != fi.reporters.end();
       ++p) {
    // get the parent bucket whose type matches with "reporter_subtree_level".
    // fall back to OSD if the level doesn't exist.
    map<string, string> reporter_loc = osdmap.crush->get_full_location(p->first);
    map<string, string>::iterator iter = reporter_loc.find(reporter_subtree_level);
    if (iter == reporter_loc.end()) {
      reporters_by_subtree.insert("osd." + to_string(p->first));
    } else {
      reporters_by_subtree.insert(iter->second);
    }
    if (g_conf->mon_osd_adjust_heartbeat_grace) {
      // accumulate each reporter's laggy history, decayed by how long
      // ago that reporter was last down
      const osd_xinfo_t& xi = osdmap.get_xinfo(p->first);
      utime_t elapsed = now - xi.down_stamp;
      double decay = exp((double)elapsed * decay_k);
      peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
    }
  }

  if (g_conf->mon_osd_adjust_heartbeat_grace) {
    // average the peer contribution over all reporters
    peer_grace /= (double)fi.reporters.size();
    grace += peer_grace;
  }

  dout(10) << " osd." << target_osd << " has "
	   << fi.reporters.size() << " reporters, "
	   << grace << " grace (" << orig_grace << " + " << my_grace
	   << " + " << peer_grace << "), max_failed_since " << max_failed_since
	   << dendl;

  if (failed_for >= grace &&
      (int)reporters_by_subtree.size() >= g_conf->mon_osd_min_down_reporters) {
    dout(1) << " we have enough reporters to mark osd." << target_osd
	    << " down" << dendl;
    // new_state is a toggle mask: setting UP flips the osd to down
    pending_inc.new_state[target_osd] = CEPH_OSD_UP;

    mon->clog->info() << "osd." << target_osd << " failed ("
		      << osdmap.crush->get_full_location_ordered_string(
			target_osd)
		      << ") ("
		      << (int)reporters_by_subtree.size()
		      << " reporters from different "
		      << reporter_subtree_level << " after "
		      << failed_for << " >= grace " << grace << ")";
    return true;
  }
  return false;
}
1854
224ce89b 1855void OSDMonitor::force_failure(int target_osd, int by)
7c673cae
FG
1856{
1857 // already pending failure?
1858 if (pending_inc.new_state.count(target_osd) &&
1859 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
1860 dout(10) << " already pending failure" << dendl;
1861 return;
1862 }
1863
1864 dout(1) << " we're forcing failure of osd." << target_osd << dendl;
1865 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
1866
31f18b77
FG
1867 mon->clog->info() << "osd." << target_osd << " failed ("
1868 << osdmap.crush->get_full_location_ordered_string(target_osd)
1869 << ") (connection refused reported by osd." << by << ")";
7c673cae
FG
1870 return;
1871}
1872
// Record or cancel a failure report from an osd.  A positive report is
// added to failure_info (possibly tripping the mark-down threshold via
// check_failure); a negative report withdraws the reporter's earlier
// claim.  Returns true when pending_inc was changed.
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
  dout(1) << "prepare_failure " << m->get_target()
	  << " from " << m->get_orig_source_inst()
	  << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target().name.num();
  int reporter = m->get_orig_source().num();
  // preprocess_failure() already filtered stale/invalid reports
  assert(osdmap.is_up(target_osd));
  assert(osdmap.get_addr(target_osd) == m->get_target().addr);

  if (m->if_osd_failed()) {
    // calculate failure time: receipt time minus how long the reporter
    // says the osd has been unresponsive
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      mon->clog->debug() << m->get_target() << " reported immediately failed by "
			 << m->get_orig_source_inst();
      force_failure(target_osd, reporter);
      return true;
    }
    mon->clog->debug() << m->get_target() << " reported failed by "
		       << m->get_orig_source_inst();

    failure_info_t& fi = failure_info[target_osd];
    // a newer report from the same reporter supersedes the old one;
    // drop the superseded op without replying
    MonOpRequestRef old_op = fi.add_report(reporter, failed_since, op);
    if (old_op) {
      mon->no_reply(old_op);
    }

    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon->clog->debug() << m->get_target() << " failure report canceled by "
		       << m->get_orig_source_inst();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      MonOpRequestRef report_op = fi.cancel_report(reporter);
      if (report_op) {
        mon->no_reply(report_op);
      }
      if (fi.reporters.empty()) {
	dout(10) << " removing last failure_info for osd." << target_osd
		 << dendl;
	failure_info.erase(target_osd);
      } else {
	dout(10) << " failure_info for osd." << target_osd << " now "
		 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
    mon->no_reply(op);
  }

  return false;
}
1935
1936void OSDMonitor::process_failures()
1937{
1938 map<int,failure_info_t>::iterator p = failure_info.begin();
1939 while (p != failure_info.end()) {
1940 if (osdmap.is_up(p->first)) {
1941 ++p;
1942 } else {
1943 dout(10) << "process_failures osd." << p->first << dendl;
1944 list<MonOpRequestRef> ls;
1945 p->second.take_report_messages(ls);
1946 failure_info.erase(p++);
1947
1948 while (!ls.empty()) {
1949 MonOpRequestRef o = ls.front();
1950 if (o) {
1951 o->mark_event(__func__);
1952 MOSDFailure *m = o->get_req<MOSDFailure>();
1953 send_latest(o, m->get_epoch());
1954 }
1955 ls.pop_front();
1956 }
1957 }
1958 }
1959}
1960
1961void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
1962{
1963 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
1964
1965 for (map<int,failure_info_t>::iterator p = failure_info.begin();
1966 p != failure_info.end();
1967 ++p) {
1968 p->second.take_report_messages(ls);
1969 }
1970 failure_info.clear();
1971}
1972
1973
1974// boot --
1975
// Fast-path checks for an MOSDBoot message.  Returns true if the message
// was fully handled here (answered as a duplicate, or silently dropped);
// returns false to fall through to prepare_boot() for a map update.
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
	    << session->caps << dendl;
    goto ignore;
  }

  // booting osd must belong to this cluster
  if (m->sb.cluster_fsid != mon->monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
	    << " != " << mon->monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  assert(m->get_orig_source_inst().name.is_osd());

  // check if osd has required features to boot
  if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
       CEPH_FEATURE_OSD_ERASURE_CODES) &&
      !(m->get_connection()->get_features() & CEPH_FEATURE_OSD_ERASURE_CODES)) {
    dout(0) << __func__ << " osdmap requires erasure code but osd at "
            << m->get_orig_source_inst()
            << " doesn't announce support -- ignore" << dendl;
    goto ignore;
  }

  if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
       CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2) &&
      !(m->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2)) {
    dout(0) << __func__ << " osdmap requires erasure code plugins v2 but osd at "
            << m->get_orig_source_inst()
            << " doesn't announce support -- ignore" << dendl;
    goto ignore;
  }

  if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
       CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3) &&
      !(m->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3)) {
    dout(0) << __func__ << " osdmap requires erasure code plugins v3 but osd at "
            << m->get_orig_source_inst()
            << " doesn't announce support -- ignore" << dendl;
    goto ignore;
  }

  // release gates: refuse osds that lack the feature bit matching the
  // map's require_osd_release setting
  if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
      !HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because the osdmap requires"
		      << " CEPH_FEATURE_SERVER_LUMINOUS"
		      << " but the osd lacks CEPH_FEATURE_SERVER_LUMINOUS";
    goto ignore;
  }

  if (osdmap.require_osd_release >= CEPH_RELEASE_JEWEL &&
      !(m->osd_features & CEPH_FEATURE_SERVER_JEWEL)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because the osdmap requires"
		      << " CEPH_FEATURE_SERVER_JEWEL"
		      << " but the osd lacks CEPH_FEATURE_SERVER_JEWEL";
    goto ignore;
  }

  if (osdmap.require_osd_release >= CEPH_RELEASE_KRAKEN &&
      !HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because the osdmap requires"
		      << " CEPH_FEATURE_SERVER_KRAKEN"
		      << " but the osd lacks CEPH_FEATURE_SERVER_KRAKEN";
    goto ignore;
  }

  if (osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE) &&
      !(m->osd_features & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because 'sortbitwise' osdmap flag is set and OSD lacks the OSD_BITWISE_HOBJ_SORT feature";
    goto ignore;
  }

  // if any pool uses GMT hitsets, the booting osd must support them too
  if (any_of(osdmap.get_pools().begin(),
	     osdmap.get_pools().end(),
	     [](const std::pair<int64_t,pg_pool_t>& pool)
	     { return pool.second.use_gmt_hitset; })) {
    assert(osdmap.get_num_up_osds() == 0 ||
	   osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT);
    if (!(m->osd_features & CEPH_FEATURE_OSD_HITSET_GMT)) {
      dout(0) << __func__ << " one or more pools uses GMT hitsets but osd at "
	      << m->get_orig_source_inst()
	      << " doesn't announce support -- ignore" << dendl;
      goto ignore;
    }
  }

  // make sure upgrades stop at luminous
  if (HAVE_FEATURE(m->osd_features, SERVER_M) &&
      osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
    mon->clog->info() << "disallowing boot of post-luminous OSD "
		      << m->get_orig_source_inst()
		      << " because require_osd_release < luminous";
    goto ignore;
  }

  // make sure upgrades stop at jewel
  if (HAVE_FEATURE(m->osd_features, SERVER_KRAKEN) &&
      osdmap.require_osd_release < CEPH_RELEASE_JEWEL) {
    mon->clog->info() << "disallowing boot of post-jewel OSD "
		      << m->get_orig_source_inst()
		      << " because require_osd_release < jewel";
    goto ignore;
  }

  // make sure upgrades stop at hammer
  //  * HAMMER_0_94_4 is the required hammer feature
  //  * MON_METADATA is the first post-hammer feature
  if (osdmap.get_num_up_osds() > 0) {
    if ((m->osd_features & CEPH_FEATURE_MON_METADATA) &&
	!(osdmap.get_up_osd_features() & CEPH_FEATURE_HAMMER_0_94_4)) {
      mon->clog->info() << "disallowing boot of post-hammer OSD "
			<< m->get_orig_source_inst()
			<< " because one or more up OSDs is pre-hammer v0.94.4";
      goto ignore;
    }
    if (!(m->osd_features & CEPH_FEATURE_HAMMER_0_94_4) &&
	(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_METADATA)) {
      mon->clog->info() << "disallowing boot of pre-hammer v0.94.4 OSD "
			<< m->get_orig_source_inst()
			<< " because all up OSDs are post-hammer";
      goto ignore;
    }
  }

  // already booted?  same instance and cluster addr means this is a dup;
  // just re-ack without proposing anything.
  if (osdmap.is_up(from) &&
      osdmap.get_inst(from) == m->get_orig_source_inst() &&
      osdmap.get_cluster_addr(from) == m->cluster_addr) {
    // yup.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source_inst()
	    << " == " << osdmap.get_inst(from) << dendl;
    _booted(op, false);
    return true;
  }

  // an osd id may not be reused by a daemon with a different fsid
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
            << " clashes with existing osd: different fsid"
            << " (ours: " << osdmap.get_uuid(from)
            << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // stale boot message from before the osd's last up_from?  just send maps.
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_inst(from) == m->get_orig_source_inst()) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
2165
// Commit an osd boot into pending_inc: mark a stale prior instance down
// first if needed, otherwise record the osd's addresses, metadata,
// last-clean interval and laggy statistics, optionally mark it in, and
// defer the reply until the proposal commits.
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
  dout(7) << __func__ << " from " << m->get_orig_source_inst() << " sb " << m->sb
	  << " cluster_addr " << m->cluster_addr
	  << " hb_back_addr " << m->hb_back_addr
	  << " hb_front_addr " << m->hb_front_addr
	  << dendl;

  assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
	    << osdmap.get_max_osd() << dendl;
    return false;
  }

  // effective state = committed state xor any pending state toggles
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up?  mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down "
	    << osdmap.get_inst(from) << dendl;
    // preprocess should have caught these;  if not, assert.
    assert(osdmap.get_inst(from) != m->get_orig_source_inst() ||
	   osdmap.get_cluster_addr(from) != m->cluster_addr);
    assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
	(pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down (new_state is an XOR toggle, so setting
      // CEPH_OSD_UP here flips the up bit off)
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
	    << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addr();
    if (!m->cluster_addr.is_blank_ip())
      pending_inc.new_up_cluster[from] = m->cluster_addr;
    pending_inc.new_hb_back_up[from] = m->hb_back_addr;
    if (!m->hb_front_addr.is_blank_ip())
      pending_inc.new_hb_front_up[from] = m->hb_front_addr;

    down_pending_out.erase(from);  // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
	     << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this;  if not, assert.
      assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd?  (newest_map == 0 means it has never seen a map)
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
	dout(10) << " fresh osd; marking lost_at too" << dendl;
	pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata reported by the osd, stored alongside the map
    bufferlist osd_metadata;
    ::encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
	(m->sb.mounted == info.last_clean_begin &&
	 m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
	       << "[" << info.last_clean_begin << "," << info.last_clean_end
	       << ") -> [" << begin << "-" << end << ")"
	       << dendl;
      pending_inc.new_last_clean_interval[from] =
	pair<epoch_t,epoch_t>(begin, end);
    }

    // update laggy estimates: decay if this is a clean boot (boot_epoch==0),
    // otherwise blend the observed down interval into the moving averages
    osd_xinfo_t xi = osdmap.get_xinfo(from);
    if (m->boot_epoch == 0) {
      xi.laggy_probability *= (1.0 - g_conf->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      if (xi.down_stamp.sec()) {
        int interval = ceph_clock_now().sec() -
	  xi.down_stamp.sec();
        if (g_conf->mon_osd_laggy_max_interval &&
	    (interval > g_conf->mon_osd_laggy_max_interval)) {
          interval =  g_conf->mon_osd_laggy_max_interval;
        }
        xi.laggy_interval =
	  interval * g_conf->mon_osd_laggy_weight +
	  xi.laggy_interval * (1.0 - g_conf->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
	g_conf->mon_osd_laggy_weight +
	xi.laggy_probability * (1.0 - g_conf->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?  (auto-in policies are gated by can_mark_in / NOIN)
    if ((g_conf->mon_osd_auto_mark_auto_out_in &&
	 (oldstate & CEPH_OSD_AUTOOUT)) ||
	(g_conf->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
	(g_conf->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
	if (osdmap.osd_xinfo[from].old_weight > 0) {
	  // restore the weight the osd had before it was auto-marked out
	  pending_inc.new_weight[from] = osdmap.osd_xinfo[from].old_weight;
	  xi.old_weight = 0;
	} else {
	  pending_inc.new_weight[from] = CEPH_OSD_IN;
	}
      } else {
	dout(7) << __func__ << " NOIN set, will not mark in "
		<< m->get_orig_source_addr() << dendl;
      }
    }

    pending_inc.new_xinfo[from] = xi;

    // wait
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
2319
2320void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
2321{
2322 op->mark_osdmon_event(__func__);
2323 MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
2324 dout(7) << "_booted " << m->get_orig_source_inst()
2325 << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
2326
2327 if (logit) {
2328 mon->clog->info() << m->get_orig_source_inst() << " boot";
2329 }
2330
2331 send_latest(op, m->sb.current_epoch+1);
2332}
2333
2334
2335// -------------
2336// full
2337
// Fast-path checks for an MOSDFull message (an osd reporting its
// full/backfillfull/nearfull state).  Returns true if the message was
// handled here (dup state, bogus sender, or no caps); false to fall
// through to prepare_full() for a map change.
bool OSDMonitor::preprocess_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
  int from = m->get_orig_source().num();
  set<string> state;
  // only these state bits may be toggled via MOSDFull
  unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;

  // check permissions, ignore if failed
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "MOSDFull from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  // ignore a full message from the osd instance that already went down
  if (!osdmap.exists(from)) {
    dout(7) << __func__ << " ignoring full message from nonexistent "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }
  if ((!osdmap.is_up(from) &&
       osdmap.get_most_recent_inst(from) == m->get_orig_source_inst()) ||
      (osdmap.is_up(from) &&
       osdmap.get_inst(from) != m->get_orig_source_inst())) {
    dout(7) << __func__ << " ignoring full message from down "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  OSDMap::calc_state_set(osdmap.get_state(from), state);

  // committed state already matches what the osd wants: just ack with maps
  if ((osdmap.get_state(from) & mask) == m->state) {
    dout(7) << __func__ << " state already " << state << " for osd." << from
	    << " " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    goto ignore;
  }

  dout(10) << __func__ << " want state " << state << " for osd." << from
	   << " " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
2387
2388bool OSDMonitor::prepare_full(MonOpRequestRef op)
2389{
2390 op->mark_osdmon_event(__func__);
2391 const MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
2392 const int from = m->get_orig_source().num();
2393
2394 const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
2395 const unsigned want_state = m->state & mask; // safety first
2396
2397 unsigned cur_state = osdmap.get_state(from);
2398 auto p = pending_inc.new_state.find(from);
2399 if (p != pending_inc.new_state.end()) {
2400 cur_state ^= p->second;
2401 }
2402 cur_state &= mask;
2403
2404 set<string> want_state_set, cur_state_set;
2405 OSDMap::calc_state_set(want_state, want_state_set);
2406 OSDMap::calc_state_set(cur_state, cur_state_set);
2407
2408 if (cur_state != want_state) {
2409 if (p != pending_inc.new_state.end()) {
2410 p->second &= ~mask;
2411 } else {
2412 pending_inc.new_state[from] = 0;
2413 }
2414 pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
2415 dout(7) << __func__ << " osd." << from << " " << cur_state_set
2416 << " -> " << want_state_set << dendl;
2417 } else {
2418 dout(7) << __func__ << " osd." << from << " " << cur_state_set
2419 << " = wanted " << want_state_set << ", just waiting" << dendl;
2420 }
2421
2422 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
2423 return true;
2424}
2425
2426// -------------
2427// alive
2428
2429bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
2430{
2431 op->mark_osdmon_event(__func__);
2432 MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
2433 int from = m->get_orig_source().num();
2434
2435 // check permissions, ignore if failed
2436 MonSession *session = m->get_session();
2437 if (!session)
2438 goto ignore;
2439 if (!session->is_capable("osd", MON_CAP_X)) {
2440 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
2441 << session->caps << dendl;
2442 goto ignore;
2443 }
2444
2445 if (!osdmap.is_up(from) ||
2446 osdmap.get_inst(from) != m->get_orig_source_inst()) {
2447 dout(7) << "preprocess_alive ignoring alive message from down " << m->get_orig_source_inst() << dendl;
2448 goto ignore;
2449 }
2450
2451 if (osdmap.get_up_thru(from) >= m->want) {
2452 // yup.
2453 dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
2454 _reply_map(op, m->version);
2455 return true;
2456 }
2457
2458 dout(10) << "preprocess_alive want up_thru " << m->want
2459 << " from " << m->get_orig_source_inst() << dendl;
2460 return false;
2461
2462 ignore:
2463 return true;
2464}
2465
2466bool OSDMonitor::prepare_alive(MonOpRequestRef op)
2467{
2468 op->mark_osdmon_event(__func__);
2469 MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
2470 int from = m->get_orig_source().num();
2471
2472 if (0) { // we probably don't care much about these
2473 mon->clog->debug() << m->get_orig_source_inst() << " alive";
2474 }
2475
2476 dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
2477 << " from " << m->get_orig_source_inst() << dendl;
2478
2479 update_up_thru(from, m->version); // set to the latest map the OSD has
2480 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
2481 return true;
2482}
2483
2484void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
2485{
2486 op->mark_osdmon_event(__func__);
2487 dout(7) << "_reply_map " << e
2488 << " from " << op->get_req()->get_orig_source_inst()
2489 << dendl;
2490 send_latest(op, e);
2491}
2492
2493// pg_created
2494bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
2495{
2496 op->mark_osdmon_event(__func__);
2497 auto m = static_cast<MOSDPGCreated*>(op->get_req());
2498 dout(10) << __func__ << " " << *m << dendl;
2499 auto session = m->get_session();
2500 if (!session) {
2501 dout(10) << __func__ << ": no monitor session!" << dendl;
2502 return true;
2503 }
2504 if (!session->is_capable("osd", MON_CAP_X)) {
2505 derr << __func__ << " received from entity "
2506 << "with insufficient privileges " << session->caps << dendl;
2507 return true;
2508 }
2509 // always forward the "created!" to the leader
2510 return false;
2511}
2512
2513bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
2514{
2515 op->mark_osdmon_event(__func__);
2516 auto m = static_cast<MOSDPGCreated*>(op->get_req());
2517 dout(10) << __func__ << " " << *m << dendl;
2518 auto src = m->get_orig_source();
2519 auto from = src.num();
2520 if (!src.is_osd() ||
2521 !mon->osdmon()->osdmap.is_up(from) ||
2522 m->get_orig_source_inst() != mon->osdmon()->osdmap.get_inst(from)) {
2523 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
2524 return false;
2525 }
2526 pending_created_pgs.push_back(m->pgid);
2527 return true;
2528}
2529
2530// -------------
2531// pg_temp changes
2532
// Fast-path checks for an MOSDPGTemp message.  Returns true if every entry
// can be answered or dropped without a map change; returns false as soon as
// any entry actually requires an update (handled by prepare_pgtemp()).
bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
{
  MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
  dout(10) << "preprocess_pgtemp " << *m << dendl;
  mempool::osdmap::vector<int> empty;
  int from = m->get_orig_source().num();
  size_t ignore_cnt = 0;  // entries skipped (pool gone / stale primary)

  // check caps
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  // only accept pg_temp from the currently-up instance of this osd
  if (!osdmap.is_up(from) ||
      osdmap.get_inst(from) != m->get_orig_source_inst()) {
    dout(7) << "ignoring pgtemp message from down " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    dout(20) << " " << p->first
	     << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
	     << " -> " << p->second << dendl;

    // does the pool exist?
    if (!osdmap.have_pg_pool(p->first.pool())) {
      /*
       * 1. If the osdmap does not have the pool, it means the pool has been
       *    removed in-between the osd sending this message and us handling it.
       * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
       *    not exist in the pending either, as the osds would not send a
       *    message about a pool they know nothing about (yet).
       * 3. However, if the pool does exist in the pending, then it must be a
       *    new pool, and not relevant to this message (see 1).
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool has been removed" << dendl;
      ignore_cnt++;
      continue;
    }

    int acting_primary = -1;
    osdmap.pg_to_up_acting_osds(
      p->first, nullptr, nullptr, nullptr, &acting_primary);
    if (acting_primary != from) {
      /* If the source isn't the primary based on the current osdmap, we know
       * that the interval changed and that we can discard this message.
       * Indeed, we must do so to avoid 16127 since we can't otherwise determine
       * which of two pg temp mappings on the same pg is more recent.
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": primary has changed" << dendl;
      ignore_cnt++;
      continue;
    }

    // removal?  (empty mapping while a temp mapping exists => needs update)
    if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
			      osdmap.primary_temp->count(p->first)))
      return false;
    // change?
    // NOTE: we assume that this will clear pg_primary, so consider
    //       an existing pg_primary field to imply a change
    if (p->second.size() &&
	(osdmap.pg_temp->count(p->first) == 0 ||
	 !vectors_equal(osdmap.pg_temp->get(p->first), p->second) ||
	 osdmap.primary_temp->count(p->first)))
      return false;
  }

  // should we ignore all the pgs?
  if (ignore_cnt == m->pg_temp.size())
    goto ignore;

  // nothing to change: every surviving entry already matches; ack with maps
  dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
  _reply_map(op, m->map_epoch);
  return true;

 ignore:
  return true;
}
2619
2620void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
2621{
2622 epoch_t old_up_thru = osdmap.get_up_thru(from);
2623 auto ut = pending_inc.new_up_thru.find(from);
2624 if (ut != pending_inc.new_up_thru.end()) {
2625 old_up_thru = ut->second;
2626 }
2627 if (up_thru > old_up_thru) {
2628 // set up_thru too, so the osd doesn't have to ask again
2629 pending_inc.new_up_thru[from] = up_thru;
2630 }
2631}
2632
2633bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
2634{
2635 op->mark_osdmon_event(__func__);
2636 MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
2637 int from = m->get_orig_source().num();
2638 dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
2639 for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
2640 uint64_t pool = p->first.pool();
2641 if (pending_inc.old_pools.count(pool)) {
2642 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
2643 << ": pool pending removal" << dendl;
2644 continue;
2645 }
2646 if (!osdmap.have_pg_pool(pool)) {
2647 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
2648 << ": pool has been removed" << dendl;
2649 continue;
2650 }
2651 pending_inc.new_pg_temp[p->first] =
2652 mempool::osdmap::vector<int>(p->second.begin(), p->second.end());
2653
2654 // unconditionally clear pg_primary (until this message can encode
2655 // a change for that, too.. at which point we need to also fix
2656 // preprocess_pg_temp)
2657 if (osdmap.primary_temp->count(p->first) ||
2658 pending_inc.new_primary_temp.count(p->first))
2659 pending_inc.new_primary_temp[p->first] = -1;
2660 }
2661
2662 // set up_thru too, so the osd doesn't have to ask again
2663 update_up_thru(from, m->map_epoch);
2664
2665 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
2666 return true;
2667}
2668
2669
2670// ---
2671
2672bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
2673{
2674 op->mark_osdmon_event(__func__);
2675 MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
2676 dout(7) << "preprocess_remove_snaps " << *m << dendl;
2677
2678 // check privilege, ignore if failed
2679 MonSession *session = m->get_session();
2680 if (!session)
2681 goto ignore;
2682 if (!session->caps.is_capable(
2683 g_ceph_context,
2684 CEPH_ENTITY_TYPE_MON,
2685 session->entity_name,
2686 "osd", "osd pool rmsnap", {}, true, true, false)) {
2687 dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
2688 << session->caps << dendl;
2689 goto ignore;
2690 }
2691
2692 for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
2693 q != m->snaps.end();
2694 ++q) {
2695 if (!osdmap.have_pg_pool(q->first)) {
2696 dout(10) << " ignoring removed_snaps " << q->second << " on non-existent pool " << q->first << dendl;
2697 continue;
2698 }
2699 const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
2700 for (vector<snapid_t>::iterator p = q->second.begin();
2701 p != q->second.end();
2702 ++p) {
2703 if (*p > pi->get_snap_seq() ||
2704 !pi->removed_snaps.contains(*p))
2705 return false;
2706 }
2707 }
2708
2709 ignore:
2710 return true;
2711}
2712
2713bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
2714{
2715 op->mark_osdmon_event(__func__);
2716 MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
2717 dout(7) << "prepare_remove_snaps " << *m << dendl;
2718
2719 for (map<int, vector<snapid_t> >::iterator p = m->snaps.begin();
2720 p != m->snaps.end();
2721 ++p) {
2722
2723 if (!osdmap.have_pg_pool(p->first)) {
2724 dout(10) << " ignoring removed_snaps " << p->second << " on non-existent pool " << p->first << dendl;
2725 continue;
2726 }
2727
2728 pg_pool_t& pi = osdmap.pools[p->first];
2729 for (vector<snapid_t>::iterator q = p->second.begin();
2730 q != p->second.end();
2731 ++q) {
2732 if (!pi.removed_snaps.contains(*q) &&
2733 (!pending_inc.new_pools.count(p->first) ||
2734 !pending_inc.new_pools[p->first].removed_snaps.contains(*q))) {
2735 pg_pool_t *newpi = pending_inc.get_new_pool(p->first, &pi);
2736 newpi->removed_snaps.insert(*q);
2737 dout(10) << " pool " << p->first << " removed_snaps added " << *q
2738 << " (now " << newpi->removed_snaps << ")" << dendl;
2739 if (*q > newpi->get_snap_seq()) {
2740 dout(10) << " pool " << p->first << " snap_seq " << newpi->get_snap_seq() << " -> " << *q << dendl;
2741 newpi->set_snap_seq(*q);
2742 }
2743 newpi->set_snap_epoch(pending_inc.epoch);
2744 }
2745 }
2746 }
2747 return true;
2748}
2749
2750// osd beacon
2751bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
2752{
2753 op->mark_osdmon_event(__func__);
2754 auto beacon = static_cast<MOSDBeacon*>(op->get_req());
2755 // check caps
2756 auto session = beacon->get_session();
2757 if (!session) {
2758 dout(10) << __func__ << " no monitor session!" << dendl;
2759 return true;
2760 }
2761 if (!session->is_capable("osd", MON_CAP_X)) {
2762 derr << __func__ << " received from entity "
2763 << "with insufficient privileges " << session->caps << dendl;
2764 return true;
2765 }
2766 // Always forward the beacon to the leader, even if they are the same as
2767 // the old one. The leader will mark as down osds that haven't sent
2768 // beacon for a few minutes.
2769 return false;
2770}
2771
2772bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
2773{
2774 op->mark_osdmon_event(__func__);
2775 const auto beacon = static_cast<MOSDBeacon*>(op->get_req());
2776 const auto src = beacon->get_orig_source();
2777 dout(10) << __func__ << " " << *beacon
2778 << " from " << src << dendl;
2779 int from = src.num();
2780
2781 if (!src.is_osd() ||
2782 !osdmap.is_up(from) ||
2783 beacon->get_orig_source_inst() != osdmap.get_inst(from)) {
2784 dout(1) << " ignoring beacon from non-active osd." << dendl;
2785 return false;
2786 }
2787
2788 last_osd_report[from] = ceph_clock_now();
2789 osd_epochs[from] = beacon->version;
2790
2791 for (const auto& pg : beacon->pgs) {
2792 last_epoch_clean.report(pg, beacon->min_last_epoch_clean);
2793 }
2794 return false;
2795}
2796
2797// ---------------
2798// map helpers
2799
2800void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
2801{
2802 op->mark_osdmon_event(__func__);
2803 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
2804 << " start " << start << dendl;
2805 if (start == 0)
2806 send_full(op);
2807 else
2808 send_incremental(op, start);
2809}
2810
2811
2812MOSDMap *OSDMonitor::build_latest_full()
2813{
2814 MOSDMap *r = new MOSDMap(mon->monmap->fsid);
2815 get_version_full(osdmap.get_epoch(), r->maps[osdmap.get_epoch()]);
2816 r->oldest_map = get_first_committed();
2817 r->newest_map = osdmap.get_epoch();
2818 return r;
2819}
2820
2821MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to)
2822{
2823 dout(10) << "build_incremental [" << from << ".." << to << "]" << dendl;
2824 MOSDMap *m = new MOSDMap(mon->monmap->fsid);
2825 m->oldest_map = get_first_committed();
2826 m->newest_map = osdmap.get_epoch();
2827
2828 for (epoch_t e = to; e >= from && e > 0; e--) {
2829 bufferlist bl;
2830 int err = get_version(e, bl);
2831 if (err == 0) {
2832 assert(bl.length());
2833 // if (get_version(e, bl) > 0) {
2834 dout(20) << "build_incremental inc " << e << " "
2835 << bl.length() << " bytes" << dendl;
2836 m->incremental_maps[e] = bl;
2837 } else {
2838 assert(err == -ENOENT);
2839 assert(!bl.length());
2840 get_version_full(e, bl);
2841 if (bl.length() > 0) {
2842 //else if (get_version("full", e, bl) > 0) {
2843 dout(20) << "build_incremental full " << e << " "
2844 << bl.length() << " bytes" << dendl;
2845 m->maps[e] = bl;
2846 } else {
2847 ceph_abort(); // we should have all maps.
2848 }
2849 }
2850 }
2851 return m;
2852}
2853
// Reply to the request with a single full map at the current epoch.
void OSDMonitor::send_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
  mon->send_reply(op, build_latest_full());
}
2860
2861void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
2862{
2863 op->mark_osdmon_event(__func__);
2864
2865 MonSession *s = op->get_session();
2866 assert(s);
2867
2868 if (s->proxy_con &&
2869 s->proxy_con->has_feature(CEPH_FEATURE_MON_ROUTE_OSDMAP)) {
2870 // oh, we can tell the other mon to do it
2871 dout(10) << __func__ << " asking proxying mon to send_incremental from "
2872 << first << dendl;
2873 MRoute *r = new MRoute(s->proxy_tid, NULL);
2874 r->send_osdmap_first = first;
2875 s->proxy_con->send_message(r);
2876 op->mark_event("reply: send routed send_osdmap_first reply");
2877 } else {
2878 // do it ourselves
2879 send_incremental(first, s, false, op);
2880 }
2881}
2882
// Send the [first..current] range of osdmaps to `session`, updating
// session->osd_epoch as maps go out.
//
// Three delivery modes, distinguished by the statement order below:
//  - req set: exactly one reply message is sent (either the base full map
//    or one batch of incrementals), then we return/break; the client is
//    expected to come back for more.
//  - onetime: one batch of incrementals, then stop.
//  - otherwise: loop, sending batches of at most osd_map_message_max maps
//    until the session is caught up.
void OSDMonitor::send_incremental(epoch_t first,
				  MonSession *session,
				  bool onetime,
				  MonOpRequestRef req)
{
  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
	  << " to " << session->inst << dendl;

  // Never re-send epochs the session is already known to have.
  if (first <= session->osd_epoch) {
    dout(10) << __func__ << " " << session->inst << " should already have epoch "
	     << session->osd_epoch << dendl;
    first = session->osd_epoch + 1;
  }

  if (first < get_first_committed()) {
    // The requested start has been trimmed; begin from the oldest committed
    // epoch and ship its full map as a base the client can apply
    // incrementals on top of.
    first = get_first_committed();
    bufferlist bl;
    int err = get_version_full(first, bl);
    assert(err == 0);
    assert(bl.length());

    dout(20) << "send_incremental starting with base full "
	     << first << " " << bl.length() << " bytes" << dendl;

    MOSDMap *m = new MOSDMap(osdmap.get_fsid());
    m->oldest_map = get_first_committed();
    m->newest_map = osdmap.get_epoch();
    m->maps[first] = bl;

    if (req) {
      // One reply per request: the base full map is that reply.
      mon->send_reply(req, m);
      session->osd_epoch = first;
      return;
    } else {
      session->con->send_message(m);
      session->osd_epoch = first;
    }
    first++;
  }

  while (first <= osdmap.get_epoch()) {
    // Batch size is capped by osd_map_message_max epochs per message.
    epoch_t last = MIN(first + g_conf->osd_map_message_max - 1,
		       osdmap.get_epoch());
    MOSDMap *m = build_incremental(first, last);

    if (req) {
      // send some maps.  it may not be all of them, but it will get them
      // started.
      mon->send_reply(req, m);
    } else {
      session->con->send_message(m);
      first = last + 1;
    }
    session->osd_epoch = last;
    if (onetime || req)
      break;
  }
}
2941
2942int OSDMonitor::get_version(version_t ver, bufferlist& bl)
2943{
2944 if (inc_osd_cache.lookup(ver, &bl)) {
2945 return 0;
2946 }
2947 int ret = PaxosService::get_version(ver, bl);
2948 if (!ret) {
2949 inc_osd_cache.add(ver, bl);
2950 }
2951 return ret;
2952}
2953
2954int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
2955{
2956 if (full_osd_cache.lookup(ver, &bl)) {
2957 return 0;
2958 }
2959 int ret = PaxosService::get_version_full(ver, bl);
2960 if (!ret) {
2961 full_osd_cache.add(ver, bl);
2962 }
2963 return ret;
2964}
2965
2966epoch_t OSDMonitor::blacklist(const entity_addr_t& a, utime_t until)
2967{
2968 dout(10) << "blacklist " << a << " until " << until << dendl;
2969 pending_inc.new_blacklist[a] = until;
2970 return pending_inc.epoch;
2971}
2972
2973
2974void OSDMonitor::check_osdmap_subs()
2975{
2976 dout(10) << __func__ << dendl;
2977 if (!osdmap.get_epoch()) {
2978 return;
2979 }
2980 auto osdmap_subs = mon->session_map.subs.find("osdmap");
2981 if (osdmap_subs == mon->session_map.subs.end()) {
2982 return;
2983 }
2984 auto p = osdmap_subs->second->begin();
2985 while (!p.end()) {
2986 auto sub = *p;
2987 ++p;
2988 check_osdmap_sub(sub);
2989 }
2990}
2991
2992void OSDMonitor::check_osdmap_sub(Subscription *sub)
2993{
2994 dout(10) << __func__ << " " << sub << " next " << sub->next
2995 << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
2996 if (sub->next <= osdmap.get_epoch()) {
2997 if (sub->next >= 1)
2998 send_incremental(sub->next, sub->session, sub->incremental_onetime);
2999 else
3000 sub->session->con->send_message(build_latest_full());
3001 if (sub->onetime)
3002 mon->session_map.remove_sub(sub);
3003 else
3004 sub->next = osdmap.get_epoch() + 1;
3005 }
3006}
3007
3008void OSDMonitor::check_pg_creates_subs()
3009{
3010 if (!mon->monmap->get_required_features().contains_all(
3011 ceph::features::mon::FEATURE_LUMINOUS)) {
3012 // PGMonitor takes care of this in pre-luminous era.
3013 return;
3014 }
3015 if (!osdmap.get_num_up_osds()) {
3016 return;
3017 }
3018 assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
3019 mon->with_session_map([this](const MonSessionMap& session_map) {
3020 auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
3021 if (pg_creates_subs == session_map.subs.end()) {
3022 return;
3023 }
3024 for (auto sub : *pg_creates_subs->second) {
3025 check_pg_creates_sub(sub);
3026 }
3027 });
3028}
3029
3030void OSDMonitor::check_pg_creates_sub(Subscription *sub)
3031{
3032 dout(20) << __func__ << " .. " << sub->session->inst << dendl;
3033 assert(sub->type == "osd_pg_creates");
3034 // only send these if the OSD is up. we will check_subs() when they do
3035 // come up so they will get the creates then.
3036 if (sub->session->inst.name.is_osd() &&
3037 mon->osdmon()->osdmap.is_up(sub->session->inst.name.num())) {
3038 sub->next = send_pg_creates(sub->session->inst.name.num(),
3039 sub->session->con.get(),
3040 sub->next);
3041 }
3042}
3043
// Scan `pools` for pools whose pgs still need to be created and queue them
// in `creating_pgs`.  Pools unchanged since the last scan epoch, pools being
// removed, and pools whose crush rule does not resolve are skipped.
// Returns the number of pools newly queued for creation.
unsigned OSDMonitor::scan_for_creating_pgs(
  const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
  const mempool::osdmap::set<int64_t>& removed_pools,
  utime_t modified,
  creating_pgs_t* creating_pgs) const
{
  unsigned queued = 0;
  for (auto& p : pools) {
    int64_t poolid = p.first;
    const pg_pool_t& pool = p.second;
    // Skip pools whose crush rule cannot be resolved for this type/size;
    // their pgs cannot be mapped anyway.
    int ruleno = osdmap.crush->find_rule(pool.get_crush_rule(),
					 pool.get_type(), pool.get_size());
    if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
      continue;

    // Only consider pools changed since the last scan.
    const auto last_scan_epoch = creating_pgs->last_scan_epoch;
    const auto created = pool.get_last_change();
    if (last_scan_epoch && created <= last_scan_epoch) {
      dout(10) << __func__ << " no change in pool " << poolid
	       << " " << pool << dendl;
      continue;
    }
    if (removed_pools.count(poolid)) {
      dout(10) << __func__ << " pool is being removed: " << poolid
	       << " " << pool << dendl;
      continue;
    }
    dout(10) << __func__ << " queueing pool create for " << poolid
	     << " " << pool << dendl;
    // create_pool() reports whether the pool was newly queued.
    if (creating_pgs->create_pool(poolid, pool.get_pg_num(),
				  created, modified)) {
      queued++;
    }
  }
  return queued;
}
3080
// Rebuild creating_pgs_by_osd_epoch (osd -> epoch -> pgs) from
// creating_pgs, so each still-creating pg is attributed to its current
// acting primary.  If the primary changed since the previous pass, the pg's
// epoch is bumped to the current mapping epoch so a fresh create message
// goes to the new target.
void OSDMonitor::update_creating_pgs()
{
  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
	   << creating_pgs.queue.size() << " pools in queue" << dendl;
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  // creating_pgs_by_osd_epoch is also read by send_pg_creates(); guard it.
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    auto mapped = pg.second.first;
    dout(20) << __func__ << " looking up " << pgid << dendl;
    mapping.get(pgid, nullptr, nullptr, nullptr, &acting_primary);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
	if (pgs.second.count(pgid)) {
	  if (last_acting_primary == acting_primary) {
	    // Same target as before: keep the epoch it was first queued at.
	    mapped = pgs.first;
	  } else {
	    dout(20) << __func__ << " " << pgid << " "
		     << " acting_primary:" << last_acting_primary
		     << " -> " << acting_primary << dendl;
	    // note epoch if the target of the create message changed.
	    mapped = mapping.get_epoch();
	  }
	  break;
	} else {
	  // newly creating
	  mapped = mapping.get_epoch();
	}
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
	     << " to create " << pgid << dendl;
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(pgid);
  }
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
3122
// Send pg-create messages for all pgs queued for `osd` at epochs >= `next`,
// batched into a single MOSDPGCreate.  Returns the epoch the subscription
// is now current through (last + 1), or `next` unchanged when there was
// nothing to send.
epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next)
{
  dout(30) << __func__ << " osd." << osd << " next=" << next
	   << " " << creating_pgs_by_osd_epoch << dendl;
  // Guards creating_pgs_by_osd_epoch against concurrent rebuilds by
  // update_creating_pgs().
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
  if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
    return next;
  assert(!creating_pgs_by_epoch->second.empty());

  MOSDPGCreate *m = nullptr;  // allocated lazily on the first pg found
  epoch_t last = 0;
  for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
       epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
    auto epoch = epoch_pgs->first;
    auto& pgs = epoch_pgs->second;
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
    last = epoch;
    for (auto& pg : pgs) {
      if (!m)
	m = new MOSDPGCreate(creating_pgs_epoch);
      // Need the create time from the monitor using its clock to set
      // last_scrub_stamp upon pg creation.
      const auto& creation = creating_pgs.pgs[pg];
      m->mkpg.emplace(pg, pg_create_t{creation.first, pg, 0});
      m->ctimes.emplace(pg, creation.second);
      dout(20) << __func__ << " will create " << pg
	       << " at " << creation.first << dendl;
    }
  }
  if (!m) {
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " has nothing to send" << dendl;
    return next;
  }
  con->send_message(m);
  // sub is current through last + 1
  return last + 1;
}
3163
3164// TICK
3165
3166
// Periodic leader-side housekeeping: time out silent OSDs, mark failed OSDs
// down, auto-mark long-down OSDs out, expire blacklist entries, maintain the
// legacy (pre-luminous) FULL/NEARFULL map flags, and propose a new map epoch
// if anything changed.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // Everything below mutates pending_inc; only the leader may do that.
  if (!mon->is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  // Beacon-based OSD timeouts only apply once the whole cluster is luminous.
  if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
      mon->monmap->get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    if (handle_osd_timeouts(now, last_osd_report)) {
      do_propose = true;
    }
  }

  // mark osds down?
  if (check_failures(now))
    do_propose = true;

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      utime_t down = now;
      down -= i->second;  // how long this osd has been down
      ++i;  // advance before any erase of `o` below

      if (osdmap.is_down(o) &&
	  osdmap.is_in(o) &&
	  can_mark_out(o)) {
	utime_t orig_grace(g_conf->mon_osd_down_out_interval, 0);
	utime_t grace = orig_grace;
	double my_grace = 0.0;

	if (g_conf->mon_osd_adjust_down_out_interval) {
	  // scale grace period the same way we do the heartbeat grace.
	  const osd_xinfo_t& xi = osdmap.get_xinfo(o);
	  double halflife = (double)g_conf->mon_osd_laggy_halflife;
	  double decay_k = ::log(.5) / halflife;
	  double decay = exp((double)down * decay_k);
	  dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
		   << " down for " << down << " decay " << decay << dendl;
	  my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
	  grace += my_grace;
	}

	// is this an entire large subtree down?
	if (g_conf->mon_osd_down_out_subtree_limit.length()) {
	  int type = osdmap.crush->get_type_id(g_conf->mon_osd_down_out_subtree_limit);
	  if (type > 0) {
	    if (osdmap.containing_subtree_is_down(g_ceph_context, o, type, &down_cache)) {
	      dout(10) << "tick entire containing " << g_conf->mon_osd_down_out_subtree_limit
		       << " subtree for osd." << o << " is down; resetting timer" << dendl;
	      // reset timer, too.
	      down_pending_out[o] = now;
	      continue;
	    }
	  }
	}

	if (g_conf->mon_osd_down_out_interval > 0 &&
	    down.sec() >= grace) {
	  dout(10) << "tick marking osd." << o << " OUT after " << down
		   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
	  pending_inc.new_weight[o] = CEPH_OSD_OUT;

	  // set the AUTOOUT bit.
	  if (pending_inc.new_state.count(o) == 0)
	    pending_inc.new_state[o] = 0;
	  pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

	  // remember previous weight
	  if (pending_inc.new_xinfo.count(o) == 0)
	    pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

	  do_propose = true;

	  mon->clog->info() << "Marking osd." << o << " out (has been down for "
			    << int(down.sec()) << " seconds)";
	} else
	  continue;  // still within grace; keep it on the pending list
      }

      // osd was marked out (or is no longer down+in); stop tracking it.
      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blacklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
       p != osdmap.blacklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blacklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blacklist.push_back(p->first);
      do_propose = true;
    }
  }

  // if map full setting has changed, get that info out there!
  if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS &&
      mon->pgservice->is_readable()) {
    // for pre-luminous compat only!
    if (mon->pgservice->have_full_osds()) {
      dout(5) << "There are full osds, setting full flag" << dendl;
      add_flag(CEPH_OSDMAP_FULL);
    } else if (osdmap.test_flag(CEPH_OSDMAP_FULL)){
      dout(10) << "No full osds, removing full flag" << dendl;
      remove_flag(CEPH_OSDMAP_FULL);
    }

    if (mon->pgservice->have_nearfull_osds()) {
      dout(5) << "There are near full osds, setting nearfull flag" << dendl;
      add_flag(CEPH_OSDMAP_NEARFULL);
    } else if (osdmap.test_flag(CEPH_OSDMAP_NEARFULL)){
      dout(10) << "No near full osds, removing nearfull flag" << dendl;
      remove_flag(CEPH_OSDMAP_NEARFULL);
    }
    // Propose only when FULL/NEARFULL actually flipped relative to the map.
    if (pending_inc.new_flags != -1 &&
	(pending_inc.new_flags ^ osdmap.flags) & (CEPH_OSDMAP_FULL | CEPH_OSDMAP_NEARFULL)) {
      dout(1) << "New setting for" <<
	      (pending_inc.new_flags & CEPH_OSDMAP_FULL ? " CEPH_OSDMAP_FULL" : "") <<
	      (pending_inc.new_flags & CEPH_OSDMAP_NEARFULL ? " CEPH_OSDMAP_NEARFULL" : "")
	      << " -- doing propose" << dendl;
      do_propose = true;
    }
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty())  // also propose if we adjusted pg_temp
    propose_pending();
}
3317
// Mark osds down in pending_inc when they have not reported (beaconed)
// within mon_osd_report_timeout.  `last_osd_report` maps osd id -> time of
// last report and is updated in place.  Returns true if any osd was newly
// marked down.
bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
				     std::map<int,utime_t> &last_osd_report)
{
  utime_t timeo(g_conf->mon_osd_report_timeout, 0);
  if (now - mon->get_leader_since() < timeo) {
    // We haven't been the leader for long enough to consider OSD timeouts
    return false;
  }

  int max_osd = osdmap.get_max_osd();
  bool new_down = false;

  for (int i=0; i < max_osd; ++i) {
    dout(30) << __func__ << ": checking up on osd " << i << dendl;
    if (!osdmap.is_up(i))
      continue;
    const std::map<int,utime_t>::const_iterator t = last_osd_report.find(i);
    if (t == last_osd_report.end()) {
      // it wasn't in the map; start the timer.
      last_osd_report[i] = now;
    } else if (can_mark_down(i)) {
      utime_t diff = now - t->second;
      if (diff > timeo) {
	mon->clog->info() << "osd." << i << " marked down after no beacon for "
			  << diff << " seconds";
	derr << "no beacon from osd." << i << " since " << t->second
	     << ", " << diff << " seconds ago. marking down" << dendl;
	// NOTE(review): assigning CEPH_OSD_UP here appears to rely on
	// new_state being applied as an xor mask (toggling the up bit off)
	// — confirm against OSDMap::Incremental semantics.
	pending_inc.new_state[i] = CEPH_OSD_UP;
	new_down = true;
      }
    }
  }
  return new_down;
}
3352
// Populate `summary` (and optionally `detail`) with osdmap-related health
// checks: down osds and down crush subtrees, stray crush entries, full-ratio
// ordering and full/backfillfull/nearfull osds (luminous+), noup/nodown/
// noin/noout lists, warning flags, legacy crush tunables, cache pools
// without hit_sets, multirule rulesets, sortbitwise, a zero
// mon_osd_down_out_interval, missing require_osd_release upgrades, and full
// pools.
void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
			    list<pair<health_status_t,string> > *detail,
			    CephContext *cct) const
{
  int num_osds = osdmap.get_num_osds();

  if (num_osds == 0) {
    summary.push_back(make_pair(HEALTH_ERR, "no osds"));
  } else {
    int num_in_osds = 0;
    int num_down_in_osds = 0;
    set<int> osds;              // in crush but not in the osdmap
    set<int> down_in_osds;
    set<int> up_in_osds;
    set<int> subtree_up;        // crush buckets known to contain an up osd
    unordered_map<int, set<int> > subtree_type_down;  // crush type -> down buckets
    unordered_map<int, int> num_osds_subtree;         // bucket -> osd count
    int max_type = osdmap.crush->get_max_type_id();

    // Classify each osd and walk up the crush tree from each down+in osd,
    // recording the largest fully-down subtrees.
    for (int i = 0; i < osdmap.get_max_osd(); i++) {
      if (!osdmap.exists(i)) {
	if (osdmap.crush->item_exists(i)) {
	  osds.insert(i);
	}
	continue;
      }
      if (osdmap.is_out(i))
	continue;
      ++num_in_osds;
      if (down_in_osds.count(i) || up_in_osds.count(i))
	continue;
      if (!osdmap.is_up(i)) {
	down_in_osds.insert(i);
	int parent_id = 0;
	int current = i;
	for (int type = 0; type <= max_type; type++) {
	  if (!osdmap.crush->get_type_name(type))
	    continue;
	  int r = osdmap.crush->get_immediate_parent_id(current, &parent_id);
	  if (r == -ENOENT)
	    break;
	  // break early if this parent is already marked as up
	  if (subtree_up.count(parent_id))
	    break;
	  type = osdmap.crush->get_bucket_type(parent_id);
	  if (!osdmap.subtree_type_is_down(
		g_ceph_context, parent_id, type,
		&down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
	    break;
	  current = parent_id;
	}
      }
    }

    // calculate the number of down osds in each down subtree and
    // store it in num_osds_subtree
    for (int type = 1; type <= max_type; type++) {
      if (!osdmap.crush->get_type_name(type))
	continue;
      for (auto j = subtree_type_down[type].begin();
	   j != subtree_type_down[type].end();
	   ++j) {
	if (type == 1) {
	  // type 1 buckets hold osds directly; child count is the osd count.
	  list<int> children;
	  int num = osdmap.crush->get_children(*j, &children);
	  num_osds_subtree[*j] = num;
	} else {
	  // Higher-level buckets: sum the already-computed child counts
	  // (lower types are processed first by the outer loop).
	  list<int> children;
	  int num = 0;
	  int num_children = osdmap.crush->get_children(*j, &children);
	  if (num_children == 0)
	    continue;
	  for (auto l = children.begin(); l != children.end(); ++l) {
	    if (num_osds_subtree[*l] > 0) {
	      num = num + num_osds_subtree[*l];
	    }
	  }
	  num_osds_subtree[*j] = num;
	}
      }
    }
    num_down_in_osds = down_in_osds.size();
    assert(num_down_in_osds <= num_in_osds);
    if (num_down_in_osds > 0) {
      // summary of down subtree types and osds
      for (int type = max_type; type > 0; type--) {
	if (!osdmap.crush->get_type_name(type))
	  continue;
	if (subtree_type_down[type].size() > 0) {
	  ostringstream ss;
	  ss << subtree_type_down[type].size() << " "
	     << osdmap.crush->get_type_name(type);
	  if (subtree_type_down[type].size() > 1) {
	    ss << "s";
	  }
	  int sum_down_osds = 0;
	  for (auto j = subtree_type_down[type].begin();
	       j != subtree_type_down[type].end();
	       ++j) {
	    sum_down_osds = sum_down_osds + num_osds_subtree[*j];
	  }
	  ss << " (" << sum_down_osds << " osds) down";
	  summary.push_back(make_pair(HEALTH_WARN, ss.str()));
	}
      }
      ostringstream ss;
      ss << down_in_osds.size() << " osds down";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));

      if (detail) {
	// details of down subtree types
	for (int type = max_type; type > 0; type--) {
	  if (!osdmap.crush->get_type_name(type))
	    continue;
	  for (auto j = subtree_type_down[type].rbegin();
	       j != subtree_type_down[type].rend();
	       ++j) {
	    ostringstream ss;
	    ss << osdmap.crush->get_type_name(type);
	    ss << " ";
	    ss << osdmap.crush->get_item_name(*j);
	    // at the top level, do not print location
	    if (type != max_type) {
	      ss << " (";
	      ss << osdmap.crush->get_full_location_ordered_string(*j);
	      ss << ")";
	    }
	    int num = num_osds_subtree[*j];
	    ss << " (" << num << " osds)";
	    ss << " is down";
	    detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	  }
	}
	// details of down osds
	for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
	  ostringstream ss;
	  ss << "osd." << *it << " (";
	  ss << osdmap.crush->get_full_location_ordered_string(*it);
	  ss << ") is down";
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	}
      }
    }

    if (!osds.empty()) {
      ostringstream ss;
      ss << osds.size() << " osds exist in the crush map but not in the osdmap";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      if (detail) {
	ss << " (osds: " << osds << ")";
	detail->push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }

    if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
      // An osd could configure failsafe ratio, to something different
      // but for now assume it is the same here.
      float fsr = g_conf->osd_failsafe_full_ratio;
      if (fsr > 1.0) fsr /= 100;  // accept percentage-style config values
      float fr = osdmap.get_full_ratio();
      float br = osdmap.get_backfillfull_ratio();
      float nr = osdmap.get_nearfull_ratio();

      bool out_of_order = false;
      // These checks correspond to how OSDService::check_full_status() in an OSD
      // handles the improper setting of these values.
      if (br < nr) {
	out_of_order = true;
	if (detail) {
	  ostringstream ss;
	  ss << "backfillfull_ratio (" << br << ") < nearfull_ratio (" << nr << "), increased";
	  detail->push_back(make_pair(HEALTH_ERR, ss.str()));
	}
	br = nr;
      }
      if (fr < br) {
	out_of_order = true;
	if (detail) {
	  ostringstream ss;
	  ss << "full_ratio (" << fr << ") < backfillfull_ratio (" << br << "), increased";
	  detail->push_back(make_pair(HEALTH_ERR, ss.str()));
	}
	fr = br;
      }
      if (fsr < fr) {
	out_of_order = true;
	if (detail) {
	  ostringstream ss;
	  ss << "osd_failsafe_full_ratio (" << fsr << ") < full_ratio (" << fr << "), increased";
	  detail->push_back(make_pair(HEALTH_ERR, ss.str()));
	}
      }
      if (out_of_order) {
	ostringstream ss;
	ss << "Full ratio(s) out of order";
	summary.push_back(make_pair(HEALTH_ERR, ss.str()));
      }

      set<int> full, backfillfull, nearfull;
      osdmap.get_full_osd_counts(&full, &backfillfull, &nearfull);
      if (full.size()) {
	ostringstream ss;
	ss << full.size() << " full osd(s)";
	summary.push_back(make_pair(HEALTH_ERR, ss.str()));
      }
      if (backfillfull.size()) {
	ostringstream ss;
	ss << backfillfull.size() << " backfillfull osd(s)";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      }
      if (nearfull.size()) {
	ostringstream ss;
	ss << nearfull.size() << " nearfull osd(s)";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      }
      if (detail) {
	for (auto& i: full) {
	  ostringstream ss;
	  ss << "osd." << i << " is full";
	  detail->push_back(make_pair(HEALTH_ERR, ss.str()));
	}
	for (auto& i: backfillfull) {
	  ostringstream ss;
	  ss << "osd." << i << " is backfill full";
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	}
	for (auto& i: nearfull) {
	  ostringstream ss;
	  ss << "osd." << i << " is near full";
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	}
      }

      // warn if there is any noup osds.
      vector<int> noup_osds;
      osdmap.get_noup_osds(&noup_osds);
      if (noup_osds.size()) {
	ostringstream ss;
	ss << noup_osds.size() << " noup osd(s)";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
	if (detail) {
	  ss << ": " << noup_osds;
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	}
      }

      // warn if there is any nodown osds.
      vector<int> nodown_osds;
      osdmap.get_nodown_osds(&nodown_osds);
      if (nodown_osds.size()) {
	ostringstream ss;
	ss << nodown_osds.size() << " nodown osd(s)";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
	if (detail) {
	  ss << ": " << nodown_osds;
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	}
      }

      // warn if there is any noin osds.
      vector<int> noin_osds;
      osdmap.get_noin_osds(&noin_osds);
      if (noin_osds.size()) {
	ostringstream ss;
	ss << noin_osds.size() << " noin osd(s)";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
	if (detail) {
	  ss << ": " << noin_osds;
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	}
      }

      // warn if there is any noout osds.
      vector<int> noout_osds;
      osdmap.get_noout_osds(&noout_osds);
      if (noout_osds.size()) {
	ostringstream ss;
	ss << noout_osds.size() << " noout osd(s)";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
	if (detail) {
	  ss << ": " << noout_osds;
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	}
      }
    }
    // note: we leave it to ceph-mgr to generate details health warnings
    // with actual osd utilizations

    // warn about flags
    uint64_t warn_flags =
      CEPH_OSDMAP_FULL |
      CEPH_OSDMAP_PAUSERD |
      CEPH_OSDMAP_PAUSEWR |
      CEPH_OSDMAP_PAUSEREC |
      CEPH_OSDMAP_NOUP |
      CEPH_OSDMAP_NODOWN |
      CEPH_OSDMAP_NOIN |
      CEPH_OSDMAP_NOOUT |
      CEPH_OSDMAP_NOBACKFILL |
      CEPH_OSDMAP_NORECOVER |
      CEPH_OSDMAP_NOSCRUB |
      CEPH_OSDMAP_NODEEP_SCRUB |
      CEPH_OSDMAP_NOTIERAGENT |
      CEPH_OSDMAP_NOREBALANCE;
    if (osdmap.test_flag(warn_flags)) {
      ostringstream ss;
      ss << osdmap.get_flag_string(osdmap.get_flags() & warn_flags)
	 << " flag(s) set";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      if (detail)
	detail->push_back(make_pair(HEALTH_WARN, ss.str()));
    }

    // old crush tunables?
    if (g_conf->mon_warn_on_legacy_crush_tunables) {
      string min = osdmap.crush->get_min_required_version();
      if (min < g_conf->mon_crush_min_required_version) {
	ostringstream ss;
	ss << "crush map has legacy tunables (require " << min
	   << ", min is " << g_conf->mon_crush_min_required_version << ")";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
	if (detail) {
	  ss << "; see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables";
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	}
      }
    }
    if (g_conf->mon_warn_on_crush_straw_calc_version_zero) {
      if (osdmap.crush->get_straw_calc_version() == 0) {
	ostringstream ss;
	ss << "crush map has straw_calc_version=0";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
	if (detail) {
	  ss << "; see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables";
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	}
      }
    }

    // hit_set-less cache_mode?
    if (g_conf->mon_warn_on_cache_pools_without_hit_sets) {
      int problem_cache_pools = 0;
      for (map<int64_t, pg_pool_t>::const_iterator p = osdmap.pools.begin();
	   p != osdmap.pools.end();
	   ++p) {
	const pg_pool_t& info = p->second;
	if (info.cache_mode_requires_hit_set() &&
	    info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
	  ++problem_cache_pools;
	  if (detail) {
	    ostringstream ss;
	    ss << "pool '" << osdmap.get_pool_name(p->first)
	       << "' with cache_mode " << info.get_cache_mode_name()
	       << " needs hit_set_type to be set but it is not";
	    detail->push_back(make_pair(HEALTH_WARN, ss.str()));
	  }
	}
      }
      if (problem_cache_pools) {
	ostringstream ss;
	ss << problem_cache_pools << " cache pools are missing hit_sets";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }

    if (osdmap.crush->has_multirule_rulesets()) {
      ostringstream ss;
      ss << "CRUSH map contains multirule rulesets";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      if (detail) {
	ss << "; please manually fix the map";
	detail->push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }

    // Not using 'sortbitwise' and should be?
    if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE) &&
	(osdmap.get_up_osd_features() &
	 CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)) {
      ostringstream ss;
      ss << "no legacy OSD present but 'sortbitwise' flag is not set";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
    }

    // Warn if 'mon_osd_down_out_interval' is set to zero.
    // Having this option set to zero on the leader acts much like the
    // 'noout' flag.  It's hard to figure out what's going wrong with clusters
    // without the 'noout' flag set but acting like that just the same, so
    // we report a HEALTH_WARN in case this option is set to zero.
    // This is an ugly hack to get the warning out, but until we find a way
    // to spread global options throughout the mon cluster and have all mons
    // using a base set of the same options, we need to work around this sort
    // of things.
    // There's also the obvious drawback that if this is set on a single
    // monitor on a 3-monitor cluster, this warning will only be shown every
    // third monitor connection.
    if (g_conf->mon_warn_on_osd_down_out_interval_zero &&
	g_conf->mon_osd_down_out_interval == 0) {
      ostringstream ss;
      ss << "mon." << mon->name << " has mon_osd_down_out_interval set to 0";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      if (detail) {
	ss << "; this has the same effect as the 'noout' flag";
	detail->push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }

    // warn about upgrade flags that can be set but are not.
    if (g_conf->mon_debug_no_require_luminous) {
      // ignore these checks
    } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_LUMINOUS) &&
	       osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
      string msg = "all OSDs are running luminous or later but"
	" require_osd_release < luminous";
      summary.push_back(make_pair(HEALTH_WARN, msg));
      if (detail) {
	detail->push_back(make_pair(HEALTH_WARN, msg));
      }
    } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_KRAKEN) &&
	       osdmap.require_osd_release < CEPH_RELEASE_KRAKEN) {
      string msg = "all OSDs are running kraken or later but"
	" require_osd_release < kraken";
      summary.push_back(make_pair(HEALTH_WARN, msg));
      if (detail) {
	detail->push_back(make_pair(HEALTH_WARN, msg));
      }
    } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_JEWEL) &&
	       osdmap.require_osd_release < CEPH_RELEASE_JEWEL) {
      string msg = "all OSDs are running jewel or later but"
	" require_osd_release < jewel";
      summary.push_back(make_pair(HEALTH_WARN, msg));
      if (detail) {
	detail->push_back(make_pair(HEALTH_WARN, msg));
      }
    }

    // NOTE(review): this range-for copies each pool entry; `const auto&`
    // would avoid the copy.
    for (auto it : osdmap.get_pools()) {
      const pg_pool_t &pool = it.second;
      if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
	const string& pool_name = osdmap.get_pool_name(it.first);
	stringstream ss;
	ss << "pool '" << pool_name << "' is full";
	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
	if (detail)
	  detail->push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }
  }
}
3802
3803void OSDMonitor::dump_info(Formatter *f)
3804{
3805 f->open_object_section("osdmap");
3806 osdmap.dump(f);
3807 f->close_section();
3808
3809 f->open_array_section("osd_metadata");
3810 for (int i=0; i<osdmap.get_max_osd(); ++i) {
3811 if (osdmap.exists(i)) {
3812 f->open_object_section("osd");
3813 f->dump_unsigned("id", i);
3814 dump_osd_metadata(i, f, NULL);
3815 f->close_section();
3816 }
3817 }
3818 f->close_section();
3819
3820 f->dump_unsigned("osdmap_first_committed", get_first_committed());
3821 f->dump_unsigned("osdmap_last_committed", get_last_committed());
3822
3823 f->open_object_section("crushmap");
3824 osdmap.crush->dump(f);
3825 f->close_section();
3826}
3827
namespace {
  // Every per-pool property that "osd pool get" can report.  The
  // enumerator order (and therefore each enumerator's value) is part of
  // the contract with the command-handling code below and must not change.
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE, CRASH_REPLAY_INTERVAL,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, AUID, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK };

  // Return the set difference first \ second, i.e. every choice present
  // in 'first' that does not also appear in 'second'.  Neither input is
  // modified.
  std::set<osd_pool_get_choices>
  subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			     const std::set<osd_pool_get_choices>& second)
  {
    std::set<osd_pool_get_choices> remaining;
    for (const auto& choice : first) {
      if (second.count(choice) == 0)
	remaining.insert(choice);
    }
    return remaining;
  }
}
3859
3860
3861bool OSDMonitor::preprocess_command(MonOpRequestRef op)
3862{
3863 op->mark_osdmon_event(__func__);
3864 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
3865 int r = 0;
3866 bufferlist rdata;
3867 stringstream ss, ds;
3868
3869 map<string, cmd_vartype> cmdmap;
3870 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
3871 string rs = ss.str();
3872 mon->reply_command(op, -EINVAL, rs, get_last_committed());
3873 return true;
3874 }
3875
3876 MonSession *session = m->get_session();
3877 if (!session) {
3878 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
3879 return true;
3880 }
3881
3882 string prefix;
3883 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
3884
3885 string format;
3886 cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
3887 boost::scoped_ptr<Formatter> f(Formatter::create(format));
3888
3889 if (prefix == "osd stat") {
224ce89b 3890 osdmap.print_summary(f.get(), ds, "");
7c673cae
FG
3891 if (f)
3892 f->flush(rdata);
3893 else
3894 rdata.append(ds);
3895 }
3896 else if (prefix == "osd perf" ||
3897 prefix == "osd blocked-by") {
31f18b77
FG
3898 r = mon->pgservice->process_pg_command(prefix, cmdmap,
3899 osdmap, f.get(), &ss, &rdata);
7c673cae
FG
3900 }
3901 else if (prefix == "osd dump" ||
3902 prefix == "osd tree" ||
3903 prefix == "osd ls" ||
3904 prefix == "osd getmap" ||
31f18b77
FG
3905 prefix == "osd getcrushmap" ||
3906 prefix == "osd ls-tree") {
7c673cae
FG
3907 string val;
3908
3909 epoch_t epoch = 0;
3910 int64_t epochnum;
3911 cmd_getval(g_ceph_context, cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
3912 epoch = epochnum;
3913
3914 bufferlist osdmap_bl;
3915 int err = get_version_full(epoch, osdmap_bl);
3916 if (err == -ENOENT) {
3917 r = -ENOENT;
3918 ss << "there is no map for epoch " << epoch;
3919 goto reply;
3920 }
3921 assert(err == 0);
3922 assert(osdmap_bl.length());
3923
3924 OSDMap *p;
3925 if (epoch == osdmap.get_epoch()) {
3926 p = &osdmap;
3927 } else {
3928 p = new OSDMap;
3929 p->decode(osdmap_bl);
3930 }
3931
224ce89b
WB
3932 auto sg = make_scope_guard([&] {
3933 if (p != &osdmap) {
3934 delete p;
3935 }
3936 });
3937
7c673cae
FG
3938 if (prefix == "osd dump") {
3939 stringstream ds;
3940 if (f) {
3941 f->open_object_section("osdmap");
3942 p->dump(f.get());
3943 f->close_section();
3944 f->flush(ds);
3945 } else {
3946 p->print(ds);
3947 }
3948 rdata.append(ds);
3949 if (!f)
3950 ds << " ";
3951 } else if (prefix == "osd ls") {
3952 if (f) {
3953 f->open_array_section("osds");
3954 for (int i = 0; i < osdmap.get_max_osd(); i++) {
3955 if (osdmap.exists(i)) {
3956 f->dump_int("osd", i);
3957 }
3958 }
3959 f->close_section();
3960 f->flush(ds);
3961 } else {
3962 bool first = true;
3963 for (int i = 0; i < osdmap.get_max_osd(); i++) {
3964 if (osdmap.exists(i)) {
3965 if (!first)
3966 ds << "\n";
3967 first = false;
3968 ds << i;
3969 }
3970 }
3971 }
3972 rdata.append(ds);
3973 } else if (prefix == "osd tree") {
31f18b77
FG
3974 vector<string> states;
3975 cmd_getval(g_ceph_context, cmdmap, "states", states);
3976 unsigned filter = 0;
3977 for (auto& s : states) {
3978 if (s == "up") {
3979 filter |= OSDMap::DUMP_UP;
3980 } else if (s == "down") {
3981 filter |= OSDMap::DUMP_DOWN;
3982 } else if (s == "in") {
3983 filter |= OSDMap::DUMP_IN;
3984 } else if (s == "out") {
3985 filter |= OSDMap::DUMP_OUT;
3986 } else {
3987 ss << "unrecognized state '" << s << "'";
3988 r = -EINVAL;
3989 goto reply;
3990 }
3991 }
3992 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
3993 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT) ||
3994 (filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
3995 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) {
3996 ss << "cannot specify both up and down or both in and out";
3997 r = -EINVAL;
3998 goto reply;
3999 }
7c673cae
FG
4000 if (f) {
4001 f->open_object_section("tree");
31f18b77 4002 p->print_tree(f.get(), NULL, filter);
7c673cae
FG
4003 f->close_section();
4004 f->flush(ds);
4005 } else {
31f18b77 4006 p->print_tree(NULL, &ds, filter);
7c673cae
FG
4007 }
4008 rdata.append(ds);
4009 } else if (prefix == "osd getmap") {
4010 rdata.append(osdmap_bl);
4011 ss << "got osdmap epoch " << p->get_epoch();
4012 } else if (prefix == "osd getcrushmap") {
4013 p->crush->encode(rdata, mon->get_quorum_con_features());
31f18b77
FG
4014 ss << p->get_crush_version();
4015 } else if (prefix == "osd ls-tree") {
4016 string bucket_name;
4017 cmd_getval(g_ceph_context, cmdmap, "name", bucket_name);
4018 set<int> osds;
4019 r = p->get_osds_by_bucket_name(bucket_name, &osds);
4020 if (r == -ENOENT) {
4021 ss << "\"" << bucket_name << "\" does not exist";
4022 goto reply;
4023 } else if (r < 0) {
4024 ss << "can not parse bucket name:\"" << bucket_name << "\"";
4025 goto reply;
4026 }
4027
4028 if (f) {
4029 f->open_array_section("osds");
4030 for (auto &i : osds) {
4031 if (osdmap.exists(i)) {
4032 f->dump_int("osd", i);
4033 }
4034 }
4035 f->close_section();
4036 f->flush(ds);
4037 } else {
4038 bool first = true;
4039 for (auto &i : osds) {
4040 if (osdmap.exists(i)) {
4041 if (!first)
4042 ds << "\n";
4043 first = false;
4044 ds << i;
4045 }
4046 }
4047 }
4048
4049 rdata.append(ds);
7c673cae 4050 }
7c673cae
FG
4051 } else if (prefix == "osd df") {
4052 string method;
4053 cmd_getval(g_ceph_context, cmdmap, "output_method", method);
31f18b77
FG
4054 print_osd_utilization(osdmap, mon->pgservice, ds,
4055 f.get(), method == "tree");
7c673cae
FG
4056 rdata.append(ds);
4057 } else if (prefix == "osd getmaxosd") {
4058 if (f) {
4059 f->open_object_section("getmaxosd");
4060 f->dump_unsigned("epoch", osdmap.get_epoch());
4061 f->dump_int("max_osd", osdmap.get_max_osd());
4062 f->close_section();
4063 f->flush(rdata);
4064 } else {
4065 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
4066 rdata.append(ds);
4067 }
4068 } else if (prefix == "osd utilization") {
4069 string out;
4070 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
4071 if (f)
4072 f->flush(rdata);
4073 else
4074 rdata.append(out);
4075 r = 0;
4076 goto reply;
4077 } else if (prefix == "osd find") {
4078 int64_t osd;
4079 if (!cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
4080 ss << "unable to parse osd id value '"
4081 << cmd_vartype_stringify(cmdmap["id"]) << "'";
4082 r = -EINVAL;
4083 goto reply;
4084 }
4085 if (!osdmap.exists(osd)) {
4086 ss << "osd." << osd << " does not exist";
4087 r = -ENOENT;
4088 goto reply;
4089 }
4090 string format;
4091 cmd_getval(g_ceph_context, cmdmap, "format", format);
4092 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4093 f->open_object_section("osd_location");
4094 f->dump_int("osd", osd);
4095 f->dump_stream("ip") << osdmap.get_addr(osd);
4096 f->open_object_section("crush_location");
4097 map<string,string> loc = osdmap.crush->get_full_location(osd);
4098 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
4099 f->dump_string(p->first.c_str(), p->second);
4100 f->close_section();
4101 f->close_section();
4102 f->flush(rdata);
4103 } else if (prefix == "osd metadata") {
4104 int64_t osd = -1;
4105 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
4106 !cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
4107 ss << "unable to parse osd id value '"
4108 << cmd_vartype_stringify(cmdmap["id"]) << "'";
4109 r = -EINVAL;
4110 goto reply;
4111 }
4112 if (osd >= 0 && !osdmap.exists(osd)) {
4113 ss << "osd." << osd << " does not exist";
4114 r = -ENOENT;
4115 goto reply;
4116 }
4117 string format;
4118 cmd_getval(g_ceph_context, cmdmap, "format", format);
4119 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4120 if (osd >= 0) {
4121 f->open_object_section("osd_metadata");
4122 f->dump_unsigned("id", osd);
4123 r = dump_osd_metadata(osd, f.get(), &ss);
4124 if (r < 0)
4125 goto reply;
4126 f->close_section();
4127 } else {
4128 r = 0;
4129 f->open_array_section("osd_metadata");
4130 for (int i=0; i<osdmap.get_max_osd(); ++i) {
4131 if (osdmap.exists(i)) {
4132 f->open_object_section("osd");
4133 f->dump_unsigned("id", i);
4134 r = dump_osd_metadata(i, f.get(), NULL);
4135 if (r == -EINVAL || r == -ENOENT) {
4136 // Drop error, continue to get other daemons' metadata
4137 dout(4) << "No metadata for osd." << i << dendl;
4138 r = 0;
4139 } else if (r < 0) {
4140 // Unexpected error
4141 goto reply;
4142 }
4143 f->close_section();
4144 }
4145 }
4146 f->close_section();
4147 }
4148 f->flush(rdata);
31f18b77
FG
4149 } else if (prefix == "osd versions") {
4150 if (!f)
4151 f.reset(Formatter::create("json-pretty"));
4152 count_metadata("ceph_version", f.get());
4153 f->flush(rdata);
4154 r = 0;
4155 } else if (prefix == "osd count-metadata") {
4156 if (!f)
4157 f.reset(Formatter::create("json-pretty"));
4158 string field;
4159 cmd_getval(g_ceph_context, cmdmap, "property", field);
4160 count_metadata(field, f.get());
4161 f->flush(rdata);
4162 r = 0;
7c673cae
FG
4163 } else if (prefix == "osd map") {
4164 string poolstr, objstr, namespacestr;
4165 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
4166 cmd_getval(g_ceph_context, cmdmap, "object", objstr);
4167 cmd_getval(g_ceph_context, cmdmap, "nspace", namespacestr);
4168
4169 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
4170 if (pool < 0) {
4171 ss << "pool " << poolstr << " does not exist";
4172 r = -ENOENT;
4173 goto reply;
4174 }
4175 object_locator_t oloc(pool, namespacestr);
4176 object_t oid(objstr);
4177 pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
4178 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
4179 vector<int> up, acting;
4180 int up_p, acting_p;
4181 osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
4182
4183 string fullobjname;
4184 if (!namespacestr.empty())
4185 fullobjname = namespacestr + string("/") + oid.name;
4186 else
4187 fullobjname = oid.name;
4188 if (f) {
4189 f->open_object_section("osd_map");
4190 f->dump_unsigned("epoch", osdmap.get_epoch());
4191 f->dump_string("pool", poolstr);
4192 f->dump_int("pool_id", pool);
4193 f->dump_stream("objname") << fullobjname;
4194 f->dump_stream("raw_pgid") << pgid;
4195 f->dump_stream("pgid") << mpgid;
4196 f->open_array_section("up");
4197 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
4198 f->dump_int("osd", *p);
4199 f->close_section();
4200 f->dump_int("up_primary", up_p);
4201 f->open_array_section("acting");
4202 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
4203 f->dump_int("osd", *p);
4204 f->close_section();
4205 f->dump_int("acting_primary", acting_p);
4206 f->close_section(); // osd_map
4207 f->flush(rdata);
4208 } else {
4209 ds << "osdmap e" << osdmap.get_epoch()
4210 << " pool '" << poolstr << "' (" << pool << ")"
4211 << " object '" << fullobjname << "' ->"
4212 << " pg " << pgid << " (" << mpgid << ")"
4213 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
4214 << pg_vector_string(acting) << ", p" << acting_p << ")";
4215 rdata.append(ds);
4216 }
4217
4218 } else if (prefix == "pg map") {
4219 pg_t pgid;
4220 string pgidstr;
4221 cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr);
4222 if (!pgid.parse(pgidstr.c_str())) {
4223 ss << "invalid pgid '" << pgidstr << "'";
4224 r = -EINVAL;
4225 goto reply;
4226 }
4227 vector<int> up, acting;
4228 if (!osdmap.have_pg_pool(pgid.pool())) {
4229 ss << "pg '" << pgidstr << "' does not exist";
4230 r = -ENOENT;
4231 goto reply;
4232 }
4233 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
4234 osdmap.pg_to_up_acting_osds(pgid, up, acting);
4235 if (f) {
4236 f->open_object_section("pg_map");
4237 f->dump_unsigned("epoch", osdmap.get_epoch());
4238 f->dump_stream("raw_pgid") << pgid;
4239 f->dump_stream("pgid") << mpgid;
4240 f->open_array_section("up");
4241 for (auto osd : up) {
4242 f->dump_int("up_osd", osd);
4243 }
4244 f->close_section();
4245 f->open_array_section("acting");
4246 for (auto osd : acting) {
4247 f->dump_int("acting_osd", osd);
4248 }
4249 f->close_section();
4250 f->close_section();
4251 f->flush(rdata);
4252 } else {
4253 ds << "osdmap e" << osdmap.get_epoch()
4254 << " pg " << pgid << " (" << mpgid << ")"
4255 << " -> up " << up << " acting " << acting;
4256 rdata.append(ds);
4257 }
4258 goto reply;
4259
224ce89b
WB
4260 } else if (prefix == "osd scrub" ||
4261 prefix == "osd deep-scrub" ||
4262 prefix == "osd repair") {
7c673cae
FG
4263 string whostr;
4264 cmd_getval(g_ceph_context, cmdmap, "who", whostr);
4265 vector<string> pvec;
4266 get_str_vec(prefix, pvec);
4267
224ce89b 4268 if (whostr == "*" || whostr == "all" || whostr == "any") {
7c673cae
FG
4269 ss << "osds ";
4270 int c = 0;
4271 for (int i = 0; i < osdmap.get_max_osd(); i++)
4272 if (osdmap.is_up(i)) {
4273 ss << (c++ ? "," : "") << i;
4274 mon->try_send_message(new MOSDScrub(osdmap.get_fsid(),
4275 pvec.back() == "repair",
4276 pvec.back() == "deep-scrub"),
4277 osdmap.get_inst(i));
4278 }
4279 r = 0;
4280 ss << " instructed to " << pvec.back();
4281 } else {
4282 long osd = parse_osd_id(whostr.c_str(), &ss);
4283 if (osd < 0) {
4284 r = -EINVAL;
4285 } else if (osdmap.is_up(osd)) {
4286 mon->try_send_message(new MOSDScrub(osdmap.get_fsid(),
4287 pvec.back() == "repair",
4288 pvec.back() == "deep-scrub"),
4289 osdmap.get_inst(osd));
4290 ss << "osd." << osd << " instructed to " << pvec.back();
4291 } else {
4292 ss << "osd." << osd << " is not up";
4293 r = -EAGAIN;
4294 }
4295 }
4296 } else if (prefix == "osd lspools") {
4297 int64_t auid;
4298 cmd_getval(g_ceph_context, cmdmap, "auid", auid, int64_t(0));
4299 if (f)
4300 f->open_array_section("pools");
4301 for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
4302 p != osdmap.pools.end();
4303 ++p) {
4304 if (!auid || p->second.auid == (uint64_t)auid) {
4305 if (f) {
4306 f->open_object_section("pool");
4307 f->dump_int("poolnum", p->first);
4308 f->dump_string("poolname", osdmap.pool_name[p->first]);
4309 f->close_section();
4310 } else {
4311 ds << p->first << ' ' << osdmap.pool_name[p->first] << ',';
4312 }
4313 }
4314 }
4315 if (f) {
4316 f->close_section();
4317 f->flush(ds);
4318 }
4319 rdata.append(ds);
4320 } else if (prefix == "osd blacklist ls") {
4321 if (f)
4322 f->open_array_section("blacklist");
4323
4324 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
4325 p != osdmap.blacklist.end();
4326 ++p) {
4327 if (f) {
4328 f->open_object_section("entry");
4329 f->dump_stream("addr") << p->first;
4330 f->dump_stream("until") << p->second;
4331 f->close_section();
4332 } else {
4333 stringstream ss;
4334 string s;
4335 ss << p->first << " " << p->second;
4336 getline(ss, s);
4337 s += "\n";
4338 rdata.append(s);
4339 }
4340 }
4341 if (f) {
4342 f->close_section();
4343 f->flush(rdata);
4344 }
4345 ss << "listed " << osdmap.blacklist.size() << " entries";
4346
4347 } else if (prefix == "osd pool ls") {
4348 string detail;
4349 cmd_getval(g_ceph_context, cmdmap, "detail", detail);
4350 if (!f && detail == "detail") {
4351 ostringstream ss;
4352 osdmap.print_pools(ss);
4353 rdata.append(ss.str());
4354 } else {
4355 if (f)
4356 f->open_array_section("pools");
4357 for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
4358 it != osdmap.get_pools().end();
4359 ++it) {
4360 if (f) {
4361 if (detail == "detail") {
4362 f->open_object_section("pool");
4363 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
4364 it->second.dump(f.get());
4365 f->close_section();
4366 } else {
4367 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
4368 }
4369 } else {
4370 rdata.append(osdmap.get_pool_name(it->first) + "\n");
4371 }
4372 }
4373 if (f) {
4374 f->close_section();
4375 f->flush(rdata);
4376 }
4377 }
4378
4379 } else if (prefix == "osd crush get-tunable") {
4380 string tunable;
4381 cmd_getval(g_ceph_context, cmdmap, "tunable", tunable);
4382 ostringstream rss;
4383 if (f)
4384 f->open_object_section("tunable");
4385 if (tunable == "straw_calc_version") {
4386 if (f)
4387 f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
4388 else
4389 rss << osdmap.crush->get_straw_calc_version() << "\n";
4390 } else {
4391 r = -EINVAL;
4392 goto reply;
4393 }
4394 if (f) {
4395 f->close_section();
4396 f->flush(rdata);
4397 } else {
4398 rdata.append(rss.str());
4399 }
4400 r = 0;
4401
4402 } else if (prefix == "osd pool get") {
4403 string poolstr;
4404 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
4405 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
4406 if (pool < 0) {
4407 ss << "unrecognized pool '" << poolstr << "'";
4408 r = -ENOENT;
4409 goto reply;
4410 }
4411
4412 const pg_pool_t *p = osdmap.get_pg_pool(pool);
4413 string var;
4414 cmd_getval(g_ceph_context, cmdmap, "var", var);
4415
4416 typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
4417 const choices_map_t ALL_CHOICES = {
4418 {"size", SIZE},
4419 {"min_size", MIN_SIZE},
4420 {"crash_replay_interval", CRASH_REPLAY_INTERVAL},
4421 {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
4422 {"crush_rule", CRUSH_RULE},
7c673cae
FG
4423 {"hashpspool", HASHPSPOOL}, {"nodelete", NODELETE},
4424 {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
4425 {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
4426 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
4427 {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
4428 {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
4429 {"use_gmt_hitset", USE_GMT_HITSET},
4430 {"auid", AUID}, {"target_max_objects", TARGET_MAX_OBJECTS},
4431 {"target_max_bytes", TARGET_MAX_BYTES},
4432 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
4433 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
4434 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
4435 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
4436 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
4437 {"erasure_code_profile", ERASURE_CODE_PROFILE},
4438 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
4439 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
4440 {"fast_read", FAST_READ},
4441 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
4442 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
4443 {"scrub_min_interval", SCRUB_MIN_INTERVAL},
4444 {"scrub_max_interval", SCRUB_MAX_INTERVAL},
4445 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
4446 {"recovery_priority", RECOVERY_PRIORITY},
4447 {"recovery_op_priority", RECOVERY_OP_PRIORITY},
4448 {"scrub_priority", SCRUB_PRIORITY},
4449 {"compression_mode", COMPRESSION_MODE},
4450 {"compression_algorithm", COMPRESSION_ALGORITHM},
4451 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
4452 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
4453 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
4454 {"csum_type", CSUM_TYPE},
4455 {"csum_max_block", CSUM_MAX_BLOCK},
4456 {"csum_min_block", CSUM_MIN_BLOCK},
4457 };
4458
4459 typedef std::set<osd_pool_get_choices> choices_set_t;
4460
4461 const choices_set_t ONLY_TIER_CHOICES = {
4462 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
4463 TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
4464 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
4465 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
4466 MIN_READ_RECENCY_FOR_PROMOTE,
4467 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
4468 };
4469 const choices_set_t ONLY_ERASURE_CHOICES = {
4470 ERASURE_CODE_PROFILE
4471 };
4472
4473 choices_set_t selected_choices;
4474 if (var == "all") {
4475 for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
4476 it != ALL_CHOICES.end(); ++it) {
4477 selected_choices.insert(it->second);
4478 }
4479
4480 if(!p->is_tier()) {
4481 selected_choices = subtract_second_from_first(selected_choices,
4482 ONLY_TIER_CHOICES);
4483 }
4484
4485 if(!p->is_erasure()) {
4486 selected_choices = subtract_second_from_first(selected_choices,
4487 ONLY_ERASURE_CHOICES);
4488 }
4489 } else /* var != "all" */ {
4490 choices_map_t::const_iterator found = ALL_CHOICES.find(var);
4491 osd_pool_get_choices selected = found->second;
4492
4493 if (!p->is_tier() &&
4494 ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
4495 ss << "pool '" << poolstr
4496 << "' is not a tier pool: variable not applicable";
4497 r = -EACCES;
4498 goto reply;
4499 }
4500
4501 if (!p->is_erasure() &&
4502 ONLY_ERASURE_CHOICES.find(selected)
4503 != ONLY_ERASURE_CHOICES.end()) {
4504 ss << "pool '" << poolstr
4505 << "' is not a erasure pool: variable not applicable";
4506 r = -EACCES;
4507 goto reply;
4508 }
4509
4510 selected_choices.insert(selected);
4511 }
4512
4513 if (f) {
4514 for(choices_set_t::const_iterator it = selected_choices.begin();
4515 it != selected_choices.end(); ++it) {
4516 choices_map_t::const_iterator i;
4517 f->open_object_section("pool");
4518 f->dump_string("pool", poolstr);
4519 f->dump_int("pool_id", pool);
4520 switch(*it) {
4521 case PG_NUM:
4522 f->dump_int("pg_num", p->get_pg_num());
4523 break;
4524 case PGP_NUM:
4525 f->dump_int("pgp_num", p->get_pgp_num());
4526 break;
4527 case AUID:
4528 f->dump_int("auid", p->get_auid());
4529 break;
4530 case SIZE:
4531 f->dump_int("size", p->get_size());
4532 break;
4533 case MIN_SIZE:
4534 f->dump_int("min_size", p->get_min_size());
4535 break;
4536 case CRASH_REPLAY_INTERVAL:
4537 f->dump_int("crash_replay_interval",
4538 p->get_crash_replay_interval());
4539 break;
4540 case CRUSH_RULE:
31f18b77 4541 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
7c673cae 4542 f->dump_string("crush_rule", osdmap.crush->get_rule_name(
31f18b77 4543 p->get_crush_rule()));
7c673cae 4544 } else {
31f18b77 4545 f->dump_string("crush_rule", stringify(p->get_crush_rule()));
7c673cae
FG
4546 }
4547 break;
7c673cae
FG
4548 case HASHPSPOOL:
4549 case NODELETE:
4550 case NOPGCHANGE:
4551 case NOSIZECHANGE:
4552 case WRITE_FADVISE_DONTNEED:
4553 case NOSCRUB:
4554 case NODEEP_SCRUB:
4555 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
4556 if (i->second == *it)
4557 break;
4558 }
4559 assert(i != ALL_CHOICES.end());
4560 f->dump_string(i->first.c_str(),
4561 p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
4562 "true" : "false");
4563 break;
4564 case HIT_SET_PERIOD:
4565 f->dump_int("hit_set_period", p->hit_set_period);
4566 break;
4567 case HIT_SET_COUNT:
4568 f->dump_int("hit_set_count", p->hit_set_count);
4569 break;
4570 case HIT_SET_TYPE:
4571 f->dump_string("hit_set_type",
4572 HitSet::get_type_name(p->hit_set_params.get_type()));
4573 break;
4574 case HIT_SET_FPP:
4575 {
4576 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
4577 BloomHitSet::Params *bloomp =
4578 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
4579 f->dump_float("hit_set_fpp", bloomp->get_fpp());
4580 } else if(var != "all") {
4581 f->close_section();
4582 ss << "hit set is not of type Bloom; " <<
4583 "invalid to get a false positive rate!";
4584 r = -EINVAL;
4585 goto reply;
4586 }
4587 }
4588 break;
4589 case USE_GMT_HITSET:
4590 f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
4591 break;
4592 case TARGET_MAX_OBJECTS:
4593 f->dump_unsigned("target_max_objects", p->target_max_objects);
4594 break;
4595 case TARGET_MAX_BYTES:
4596 f->dump_unsigned("target_max_bytes", p->target_max_bytes);
4597 break;
4598 case CACHE_TARGET_DIRTY_RATIO:
4599 f->dump_unsigned("cache_target_dirty_ratio_micro",
4600 p->cache_target_dirty_ratio_micro);
4601 f->dump_float("cache_target_dirty_ratio",
4602 ((float)p->cache_target_dirty_ratio_micro/1000000));
4603 break;
4604 case CACHE_TARGET_DIRTY_HIGH_RATIO:
4605 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
4606 p->cache_target_dirty_high_ratio_micro);
4607 f->dump_float("cache_target_dirty_high_ratio",
4608 ((float)p->cache_target_dirty_high_ratio_micro/1000000));
4609 break;
4610 case CACHE_TARGET_FULL_RATIO:
4611 f->dump_unsigned("cache_target_full_ratio_micro",
4612 p->cache_target_full_ratio_micro);
4613 f->dump_float("cache_target_full_ratio",
4614 ((float)p->cache_target_full_ratio_micro/1000000));
4615 break;
4616 case CACHE_MIN_FLUSH_AGE:
4617 f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
4618 break;
4619 case CACHE_MIN_EVICT_AGE:
4620 f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
4621 break;
4622 case ERASURE_CODE_PROFILE:
4623 f->dump_string("erasure_code_profile", p->erasure_code_profile);
4624 break;
4625 case MIN_READ_RECENCY_FOR_PROMOTE:
4626 f->dump_int("min_read_recency_for_promote",
4627 p->min_read_recency_for_promote);
4628 break;
4629 case MIN_WRITE_RECENCY_FOR_PROMOTE:
4630 f->dump_int("min_write_recency_for_promote",
4631 p->min_write_recency_for_promote);
4632 break;
4633 case FAST_READ:
4634 f->dump_int("fast_read", p->fast_read);
4635 break;
4636 case HIT_SET_GRADE_DECAY_RATE:
4637 f->dump_int("hit_set_grade_decay_rate",
4638 p->hit_set_grade_decay_rate);
4639 break;
4640 case HIT_SET_SEARCH_LAST_N:
4641 f->dump_int("hit_set_search_last_n",
4642 p->hit_set_search_last_n);
4643 break;
4644 case SCRUB_MIN_INTERVAL:
4645 case SCRUB_MAX_INTERVAL:
4646 case DEEP_SCRUB_INTERVAL:
4647 case RECOVERY_PRIORITY:
4648 case RECOVERY_OP_PRIORITY:
4649 case SCRUB_PRIORITY:
4650 case COMPRESSION_MODE:
4651 case COMPRESSION_ALGORITHM:
4652 case COMPRESSION_REQUIRED_RATIO:
4653 case COMPRESSION_MAX_BLOB_SIZE:
4654 case COMPRESSION_MIN_BLOB_SIZE:
4655 case CSUM_TYPE:
4656 case CSUM_MAX_BLOCK:
4657 case CSUM_MIN_BLOCK:
4658 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
4659 if (i->second == *it)
4660 break;
4661 }
4662 assert(i != ALL_CHOICES.end());
4663 if(*it == CSUM_TYPE) {
4664 int val;
4665 p->opts.get(pool_opts_t::CSUM_TYPE, &val);
4666 f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
4667 }
4668 else {
4669 p->opts.dump(i->first, f.get());
4670 }
4671 break;
4672 }
4673 f->close_section();
4674 f->flush(rdata);
4675 }
4676
4677 } else /* !f */ {
4678 for(choices_set_t::const_iterator it = selected_choices.begin();
4679 it != selected_choices.end(); ++it) {
4680 choices_map_t::const_iterator i;
4681 switch(*it) {
4682 case PG_NUM:
4683 ss << "pg_num: " << p->get_pg_num() << "\n";
4684 break;
4685 case PGP_NUM:
4686 ss << "pgp_num: " << p->get_pgp_num() << "\n";
4687 break;
4688 case AUID:
4689 ss << "auid: " << p->get_auid() << "\n";
4690 break;
4691 case SIZE:
4692 ss << "size: " << p->get_size() << "\n";
4693 break;
4694 case MIN_SIZE:
4695 ss << "min_size: " << p->get_min_size() << "\n";
4696 break;
4697 case CRASH_REPLAY_INTERVAL:
4698 ss << "crash_replay_interval: " <<
4699 p->get_crash_replay_interval() << "\n";
4700 break;
4701 case CRUSH_RULE:
31f18b77 4702 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
7c673cae 4703 ss << "crush_rule: " << osdmap.crush->get_rule_name(
31f18b77 4704 p->get_crush_rule()) << "\n";
7c673cae 4705 } else {
31f18b77 4706 ss << "crush_rule: " << p->get_crush_rule() << "\n";
7c673cae
FG
4707 }
4708 break;
7c673cae
FG
4709 case HIT_SET_PERIOD:
4710 ss << "hit_set_period: " << p->hit_set_period << "\n";
4711 break;
4712 case HIT_SET_COUNT:
4713 ss << "hit_set_count: " << p->hit_set_count << "\n";
4714 break;
4715 case HIT_SET_TYPE:
4716 ss << "hit_set_type: " <<
4717 HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
4718 break;
4719 case HIT_SET_FPP:
4720 {
4721 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
4722 BloomHitSet::Params *bloomp =
4723 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
4724 ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
4725 } else if(var != "all") {
4726 ss << "hit set is not of type Bloom; " <<
4727 "invalid to get a false positive rate!";
4728 r = -EINVAL;
4729 goto reply;
4730 }
4731 }
4732 break;
4733 case USE_GMT_HITSET:
4734 ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
4735 break;
4736 case TARGET_MAX_OBJECTS:
4737 ss << "target_max_objects: " << p->target_max_objects << "\n";
4738 break;
4739 case TARGET_MAX_BYTES:
4740 ss << "target_max_bytes: " << p->target_max_bytes << "\n";
4741 break;
4742 case CACHE_TARGET_DIRTY_RATIO:
4743 ss << "cache_target_dirty_ratio: "
4744 << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
4745 break;
4746 case CACHE_TARGET_DIRTY_HIGH_RATIO:
4747 ss << "cache_target_dirty_high_ratio: "
4748 << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
4749 break;
4750 case CACHE_TARGET_FULL_RATIO:
4751 ss << "cache_target_full_ratio: "
4752 << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
4753 break;
4754 case CACHE_MIN_FLUSH_AGE:
4755 ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
4756 break;
4757 case CACHE_MIN_EVICT_AGE:
4758 ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
4759 break;
4760 case ERASURE_CODE_PROFILE:
4761 ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
4762 break;
4763 case MIN_READ_RECENCY_FOR_PROMOTE:
4764 ss << "min_read_recency_for_promote: " <<
4765 p->min_read_recency_for_promote << "\n";
4766 break;
4767 case HIT_SET_GRADE_DECAY_RATE:
4768 ss << "hit_set_grade_decay_rate: " <<
4769 p->hit_set_grade_decay_rate << "\n";
4770 break;
4771 case HIT_SET_SEARCH_LAST_N:
4772 ss << "hit_set_search_last_n: " <<
4773 p->hit_set_search_last_n << "\n";
4774 break;
4775 case HASHPSPOOL:
4776 case NODELETE:
4777 case NOPGCHANGE:
4778 case NOSIZECHANGE:
4779 case WRITE_FADVISE_DONTNEED:
4780 case NOSCRUB:
4781 case NODEEP_SCRUB:
4782 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
4783 if (i->second == *it)
4784 break;
4785 }
4786 assert(i != ALL_CHOICES.end());
4787 ss << i->first << ": " <<
4788 (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
4789 "true" : "false") << "\n";
4790 break;
4791 case MIN_WRITE_RECENCY_FOR_PROMOTE:
4792 ss << "min_write_recency_for_promote: " <<
4793 p->min_write_recency_for_promote << "\n";
4794 break;
4795 case FAST_READ:
4796 ss << "fast_read: " << p->fast_read << "\n";
4797 break;
4798 case SCRUB_MIN_INTERVAL:
4799 case SCRUB_MAX_INTERVAL:
4800 case DEEP_SCRUB_INTERVAL:
4801 case RECOVERY_PRIORITY:
4802 case RECOVERY_OP_PRIORITY:
4803 case SCRUB_PRIORITY:
4804 case COMPRESSION_MODE:
4805 case COMPRESSION_ALGORITHM:
4806 case COMPRESSION_REQUIRED_RATIO:
4807 case COMPRESSION_MAX_BLOB_SIZE:
4808 case COMPRESSION_MIN_BLOB_SIZE:
4809 case CSUM_TYPE:
4810 case CSUM_MAX_BLOCK:
4811 case CSUM_MIN_BLOCK:
4812 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
4813 if (i->second == *it)
4814 break;
4815 }
4816 assert(i != ALL_CHOICES.end());
4817 {
4818 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
4819 if (p->opts.is_set(key)) {
4820 if(key == pool_opts_t::CSUM_TYPE) {
4821 int val;
4822 p->opts.get(key, &val);
4823 ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
4824 } else {
4825 ss << i->first << ": " << p->opts.get(key) << "\n";
4826 }
4827 }
4828 }
4829 break;
4830 }
4831 rdata.append(ss.str());
4832 ss.str("");
4833 }
4834 }
4835 r = 0;
4836 } else if (prefix == "osd pool stats") {
31f18b77
FG
4837 r = mon->pgservice->process_pg_command(prefix, cmdmap,
4838 osdmap, f.get(), &ss, &rdata);
7c673cae
FG
4839 } else if (prefix == "osd pool get-quota") {
4840 string pool_name;
4841 cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
4842
4843 int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
4844 if (poolid < 0) {
4845 assert(poolid == -ENOENT);
4846 ss << "unrecognized pool '" << pool_name << "'";
4847 r = -ENOENT;
4848 goto reply;
4849 }
4850 const pg_pool_t *p = osdmap.get_pg_pool(poolid);
4851
4852 if (f) {
4853 f->open_object_section("pool_quotas");
4854 f->dump_string("pool_name", pool_name);
4855 f->dump_unsigned("pool_id", poolid);
4856 f->dump_unsigned("quota_max_objects", p->quota_max_objects);
4857 f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
4858 f->close_section();
4859 f->flush(rdata);
4860 } else {
4861 stringstream rs;
4862 rs << "quotas for pool '" << pool_name << "':\n"
4863 << " max objects: ";
4864 if (p->quota_max_objects == 0)
4865 rs << "N/A";
4866 else
4867 rs << si_t(p->quota_max_objects) << " objects";
4868 rs << "\n"
4869 << " max bytes : ";
4870 if (p->quota_max_bytes == 0)
4871 rs << "N/A";
4872 else
4873 rs << si_t(p->quota_max_bytes) << "B";
4874 rdata.append(rs.str());
4875 }
4876 rdata.append("\n");
4877 r = 0;
4878 } else if (prefix == "osd crush rule list" ||
4879 prefix == "osd crush rule ls") {
4880 string format;
4881 cmd_getval(g_ceph_context, cmdmap, "format", format);
4882 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4883 f->open_array_section("rules");
4884 osdmap.crush->list_rules(f.get());
4885 f->close_section();
4886 ostringstream rs;
4887 f->flush(rs);
4888 rs << "\n";
4889 rdata.append(rs.str());
4890 } else if (prefix == "osd crush rule dump") {
4891 string name;
4892 cmd_getval(g_ceph_context, cmdmap, "name", name);
4893 string format;
4894 cmd_getval(g_ceph_context, cmdmap, "format", format);
4895 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4896 if (name == "") {
4897 f->open_array_section("rules");
4898 osdmap.crush->dump_rules(f.get());
4899 f->close_section();
4900 } else {
4901 int ruleno = osdmap.crush->get_rule_id(name);
4902 if (ruleno < 0) {
31f18b77 4903 ss << "unknown crush rule '" << name << "'";
7c673cae
FG
4904 r = ruleno;
4905 goto reply;
4906 }
4907 osdmap.crush->dump_rule(ruleno, f.get());
4908 }
4909 ostringstream rs;
4910 f->flush(rs);
4911 rs << "\n";
4912 rdata.append(rs.str());
4913 } else if (prefix == "osd crush dump") {
4914 string format;
4915 cmd_getval(g_ceph_context, cmdmap, "format", format);
4916 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4917 f->open_object_section("crush_map");
4918 osdmap.crush->dump(f.get());
4919 f->close_section();
4920 ostringstream rs;
4921 f->flush(rs);
4922 rs << "\n";
4923 rdata.append(rs.str());
4924 } else if (prefix == "osd crush show-tunables") {
4925 string format;
4926 cmd_getval(g_ceph_context, cmdmap, "format", format);
4927 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4928 f->open_object_section("crush_map_tunables");
4929 osdmap.crush->dump_tunables(f.get());
4930 f->close_section();
4931 ostringstream rs;
4932 f->flush(rs);
4933 rs << "\n";
4934 rdata.append(rs.str());
4935 } else if (prefix == "osd crush tree") {
4936 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4937 f->open_array_section("crush_map_roots");
4938 osdmap.crush->dump_tree(f.get());
4939 f->close_section();
4940 f->flush(rdata);
4941 } else if (prefix == "osd crush class ls") {
4942 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4943 f->open_array_section("crush_classes");
4944 for (auto i : osdmap.crush->class_name)
4945 f->dump_string("class", i.second);
4946 f->close_section();
4947 f->flush(rdata);
224ce89b
WB
4948 } else if (prefix == "osd crush class ls-osd") {
4949 string name;
4950 cmd_getval(g_ceph_context, cmdmap, "class", name);
4951 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4952 set<int> osds;
4953 osdmap.crush->get_devices_by_class(name, &osds);
4954 f->open_array_section("osds");
4955 for (auto& osd : osds)
4956 f->dump_int("osd", osd);
4957 f->close_section();
4958 f->flush(rdata);
7c673cae
FG
4959 } else if (prefix == "osd erasure-code-profile ls") {
4960 const auto &profiles = osdmap.get_erasure_code_profiles();
4961 if (f)
4962 f->open_array_section("erasure-code-profiles");
4963 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
4964 if (f)
4965 f->dump_string("profile", i->first.c_str());
4966 else
4967 rdata.append(i->first + "\n");
4968 }
4969 if (f) {
4970 f->close_section();
4971 ostringstream rs;
4972 f->flush(rs);
4973 rs << "\n";
4974 rdata.append(rs.str());
4975 }
4976 } else if (prefix == "osd erasure-code-profile get") {
4977 string name;
4978 cmd_getval(g_ceph_context, cmdmap, "name", name);
4979 if (!osdmap.has_erasure_code_profile(name)) {
4980 ss << "unknown erasure code profile '" << name << "'";
4981 r = -ENOENT;
4982 goto reply;
4983 }
4984 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
4985 if (f)
4986 f->open_object_section("profile");
4987 for (map<string,string>::const_iterator i = profile.begin();
4988 i != profile.end();
4989 ++i) {
4990 if (f)
4991 f->dump_string(i->first.c_str(), i->second.c_str());
4992 else
4993 rdata.append(i->first + "=" + i->second + "\n");
4994 }
4995 if (f) {
4996 f->close_section();
4997 ostringstream rs;
4998 f->flush(rs);
4999 rs << "\n";
5000 rdata.append(rs.str());
5001 }
5002 } else {
5003 // try prepare update
5004 return false;
5005 }
5006
5007 reply:
5008 string rs;
5009 getline(ss, rs);
5010 mon->reply_command(op, r, rs, rdata, get_last_committed());
5011 return true;
5012}
5013
5014void OSDMonitor::update_pool_flags(int64_t pool_id, uint64_t flags)
5015{
5016 const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
5017 pending_inc.get_new_pool(pool_id, pool)->flags = flags;
5018}
5019
5020bool OSDMonitor::update_pools_status()
5021{
31f18b77 5022 if (!mon->pgservice->is_readable())
7c673cae
FG
5023 return false;
5024
5025 bool ret = false;
5026
5027 auto& pools = osdmap.get_pools();
5028 for (auto it = pools.begin(); it != pools.end(); ++it) {
31f18b77
FG
5029 const pool_stat_t *pstat = mon->pgservice->get_pool_stat(it->first);
5030 if (!pstat)
7c673cae 5031 continue;
31f18b77 5032 const object_stat_sum_t& sum = pstat->stats.sum;
7c673cae
FG
5033 const pg_pool_t &pool = it->second;
5034 const string& pool_name = osdmap.get_pool_name(it->first);
5035
5036 bool pool_is_full =
5037 (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
5038 (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);
5039
5040 if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
5041 if (pool_is_full)
5042 continue;
5043
5044 mon->clog->info() << "pool '" << pool_name
5045 << "' no longer full; removing FULL flag";
5046
5047 update_pool_flags(it->first, pool.get_flags() & ~pg_pool_t::FLAG_FULL);
5048 ret = true;
5049 } else {
5050 if (!pool_is_full)
5051 continue;
5052
5053 if (pool.quota_max_bytes > 0 &&
5054 (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
5055 mon->clog->warn() << "pool '" << pool_name << "' is full"
5056 << " (reached quota's max_bytes: "
5057 << si_t(pool.quota_max_bytes) << ")";
5058 }
5059 if (pool.quota_max_objects > 0 &&
5060 (uint64_t)sum.num_objects >= pool.quota_max_objects) {
5061 mon->clog->warn() << "pool '" << pool_name << "' is full"
5062 << " (reached quota's max_objects: "
5063 << pool.quota_max_objects << ")";
5064 }
5065 update_pool_flags(it->first, pool.get_flags() | pg_pool_t::FLAG_FULL);
5066 ret = true;
5067 }
5068 }
5069 return ret;
5070}
5071
7c673cae
FG
5072int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
5073{
5074 op->mark_osdmon_event(__func__);
5075 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
5076 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
5077 MonSession *session = m->get_session();
5078 if (!session)
5079 return -EPERM;
5080 string erasure_code_profile;
5081 stringstream ss;
31f18b77 5082 string rule_name;
7c673cae 5083 if (m->auid)
31f18b77 5084 return prepare_new_pool(m->name, m->auid, m->crush_rule, rule_name,
7c673cae
FG
5085 0, 0,
5086 erasure_code_profile,
5087 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, &ss);
5088 else
31f18b77 5089 return prepare_new_pool(m->name, session->auid, m->crush_rule, rule_name,
7c673cae
FG
5090 0, 0,
5091 erasure_code_profile,
5092 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, &ss);
5093}
5094
5095int OSDMonitor::crush_rename_bucket(const string& srcname,
5096 const string& dstname,
5097 ostream *ss)
5098{
5099 int ret;
5100 //
5101 // Avoid creating a pending crush if it does not already exists and
5102 // the rename would fail.
5103 //
5104 if (!_have_pending_crush()) {
5105 ret = _get_stable_crush().can_rename_bucket(srcname,
5106 dstname,
5107 ss);
5108 if (ret)
5109 return ret;
5110 }
5111
5112 CrushWrapper newcrush;
5113 _get_pending_crush(newcrush);
5114
5115 ret = newcrush.rename_bucket(srcname,
5116 dstname,
5117 ss);
5118 if (ret)
5119 return ret;
5120
5121 pending_inc.crush.clear();
5122 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
5123 *ss << "renamed bucket " << srcname << " into " << dstname;
5124 return 0;
5125}
5126
5127void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
5128{
5129 string replacement = "";
5130
5131 if (plugin == "jerasure_generic" ||
5132 plugin == "jerasure_sse3" ||
5133 plugin == "jerasure_sse4" ||
5134 plugin == "jerasure_neon") {
5135 replacement = "jerasure";
5136 } else if (plugin == "shec_generic" ||
5137 plugin == "shec_sse3" ||
5138 plugin == "shec_sse4" ||
5139 plugin == "shec_neon") {
5140 replacement = "shec";
5141 }
5142
5143 if (replacement != "") {
5144 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
5145 << plugin << " that has been deprecated. Please use "
5146 << replacement << " instead." << dendl;
5147 }
5148}
5149
5150int OSDMonitor::normalize_profile(const string& profilename,
5151 ErasureCodeProfile &profile,
5152 bool force,
5153 ostream *ss)
5154{
5155 ErasureCodeInterfaceRef erasure_code;
5156 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
5157 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
5158 check_legacy_ec_plugin(plugin->second, profilename);
5159 int err = instance.factory(plugin->second,
5160 g_conf->get_val<std::string>("erasure_code_dir"),
5161 profile, &erasure_code, ss);
5162 if (err) {
5163 return err;
5164 }
5165
5166 err = erasure_code->init(profile, ss);
5167 if (err) {
5168 return err;
5169 }
5170
5171 auto it = profile.find("stripe_unit");
5172 if (it != profile.end()) {
5173 string err_str;
5174 uint32_t stripe_unit = strict_si_cast<uint32_t>(it->second.c_str(), &err_str);
5175 if (!err_str.empty()) {
5176 *ss << "could not parse stripe_unit '" << it->second
5177 << "': " << err_str << std::endl;
5178 return -EINVAL;
5179 }
5180 uint32_t data_chunks = erasure_code->get_data_chunk_count();
5181 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
5182 if (chunk_size != stripe_unit) {
5183 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
5184 << "alignment. Would be padded to " << chunk_size
5185 << std::endl;
5186 return -EINVAL;
5187 }
5188 if ((stripe_unit % 4096) != 0 && !force) {
5189 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
5190 << "use --force to override this check" << std::endl;
5191 return -EINVAL;
5192 }
5193 }
5194 return 0;
5195}
5196
31f18b77 5197int OSDMonitor::crush_rule_create_erasure(const string &name,
7c673cae 5198 const string &profile,
31f18b77 5199 int *rule,
7c673cae
FG
5200 ostream *ss)
5201{
5202 int ruleid = osdmap.crush->get_rule_id(name);
5203 if (ruleid != -ENOENT) {
31f18b77 5204 *rule = osdmap.crush->get_rule_mask_ruleset(ruleid);
7c673cae
FG
5205 return -EEXIST;
5206 }
5207
5208 CrushWrapper newcrush;
5209 _get_pending_crush(newcrush);
5210
5211 ruleid = newcrush.get_rule_id(name);
5212 if (ruleid != -ENOENT) {
31f18b77 5213 *rule = newcrush.get_rule_mask_ruleset(ruleid);
7c673cae
FG
5214 return -EALREADY;
5215 } else {
5216 ErasureCodeInterfaceRef erasure_code;
5217 int err = get_erasure_code(profile, &erasure_code, ss);
5218 if (err) {
5219 *ss << "failed to load plugin using profile " << profile << std::endl;
5220 return err;
5221 }
5222
224ce89b 5223 err = erasure_code->create_rule(name, newcrush, ss);
7c673cae
FG
5224 erasure_code.reset();
5225 if (err < 0)
5226 return err;
31f18b77 5227 *rule = err;
7c673cae
FG
5228 pending_inc.crush.clear();
5229 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
5230 return 0;
5231 }
5232}
5233
5234int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
5235 ErasureCodeInterfaceRef *erasure_code,
5236 ostream *ss) const
5237{
5238 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
5239 return -EAGAIN;
5240 ErasureCodeProfile profile =
5241 osdmap.get_erasure_code_profile(erasure_code_profile);
5242 ErasureCodeProfile::const_iterator plugin =
5243 profile.find("plugin");
5244 if (plugin == profile.end()) {
5245 *ss << "cannot determine the erasure code plugin"
5246 << " because there is no 'plugin' entry in the erasure_code_profile "
5247 << profile << std::endl;
5248 return -EINVAL;
5249 }
5250 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
5251 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
5252 return instance.factory(plugin->second,
5253 g_conf->get_val<std::string>("erasure_code_dir"),
5254 profile, erasure_code, ss);
5255}
5256
5257int OSDMonitor::check_cluster_features(uint64_t features,
5258 stringstream &ss)
5259{
5260 stringstream unsupported_ss;
5261 int unsupported_count = 0;
5262 if ((mon->get_quorum_con_features() & features) != features) {
5263 unsupported_ss << "the monitor cluster";
5264 ++unsupported_count;
5265 }
5266
5267 set<int32_t> up_osds;
5268 osdmap.get_up_osds(up_osds);
5269 for (set<int32_t>::iterator it = up_osds.begin();
5270 it != up_osds.end(); ++it) {
5271 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
5272 if ((xi.features & features) != features) {
5273 if (unsupported_count > 0)
5274 unsupported_ss << ", ";
5275 unsupported_ss << "osd." << *it;
5276 unsupported_count ++;
5277 }
5278 }
5279
5280 if (unsupported_count > 0) {
5281 ss << "features " << features << " unsupported by: "
5282 << unsupported_ss.str();
5283 return -ENOTSUP;
5284 }
5285
5286 // check pending osd state, too!
5287 for (map<int32_t,osd_xinfo_t>::const_iterator p =
5288 pending_inc.new_xinfo.begin();
5289 p != pending_inc.new_xinfo.end(); ++p) {
5290 const osd_xinfo_t &xi = p->second;
5291 if ((xi.features & features) != features) {
5292 dout(10) << __func__ << " pending osd." << p->first
5293 << " features are insufficient; retry" << dendl;
5294 return -EAGAIN;
5295 }
5296 }
5297
5298 return 0;
5299}
5300
5301bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
5302 stringstream& ss)
5303{
5304 OSDMap::Incremental new_pending = pending_inc;
5305 ::encode(*newcrush, new_pending.crush, mon->get_quorum_con_features());
5306 OSDMap newmap;
5307 newmap.deepish_copy_from(osdmap);
5308 newmap.apply_incremental(new_pending);
5309
5310 // client compat
31f18b77 5311 if (newmap.require_min_compat_client > 0) {
7c673cae 5312 auto mv = newmap.get_min_compat_client();
31f18b77
FG
5313 if (mv > newmap.require_min_compat_client) {
5314 ss << "new crush map requires client version " << ceph_release_name(mv)
7c673cae 5315 << " but require_min_compat_client is "
31f18b77 5316 << ceph_release_name(newmap.require_min_compat_client);
7c673cae
FG
5317 return false;
5318 }
5319 }
5320
5321 // osd compat
5322 uint64_t features =
5323 newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
5324 newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
5325 stringstream features_ss;
5326 int r = check_cluster_features(features, features_ss);
5327 if (r) {
5328 ss << "Could not change CRUSH: " << features_ss.str();
5329 return false;
5330 }
5331
5332 return true;
5333}
5334
5335bool OSDMonitor::erasure_code_profile_in_use(
5336 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
5337 const string &profile,
5338 ostream *ss)
5339{
5340 bool found = false;
5341 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
5342 p != pools.end();
5343 ++p) {
5344 if (p->second.erasure_code_profile == profile) {
5345 *ss << osdmap.pool_name[p->first] << " ";
5346 found = true;
5347 }
5348 }
5349 if (found) {
5350 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
5351 }
5352 return found;
5353}
5354
5355int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
5356 map<string,string> *erasure_code_profile_map,
5357 ostream *ss)
5358{
5359 int r = get_json_str_map(g_conf->osd_pool_default_erasure_code_profile,
5360 *ss,
5361 erasure_code_profile_map);
5362 if (r)
5363 return r;
5364 assert((*erasure_code_profile_map).count("plugin"));
5365 string default_plugin = (*erasure_code_profile_map)["plugin"];
5366 map<string,string> user_map;
5367 for (vector<string>::const_iterator i = erasure_code_profile.begin();
5368 i != erasure_code_profile.end();
5369 ++i) {
5370 size_t equal = i->find('=');
5371 if (equal == string::npos) {
5372 user_map[*i] = string();
5373 (*erasure_code_profile_map)[*i] = string();
5374 } else {
5375 const string key = i->substr(0, equal);
5376 equal++;
5377 const string value = i->substr(equal);
5378 user_map[key] = value;
5379 (*erasure_code_profile_map)[key] = value;
5380 }
5381 }
5382
5383 if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
5384 (*erasure_code_profile_map) = user_map;
5385
5386 return 0;
5387}
5388
5389int OSDMonitor::prepare_pool_size(const unsigned pool_type,
5390 const string &erasure_code_profile,
5391 unsigned *size, unsigned *min_size,
5392 ostream *ss)
5393{
5394 int err = 0;
5395 switch (pool_type) {
5396 case pg_pool_t::TYPE_REPLICATED:
5397 *size = g_conf->osd_pool_default_size;
5398 *min_size = g_conf->get_osd_pool_default_min_size();
5399 break;
5400 case pg_pool_t::TYPE_ERASURE:
5401 {
5402 ErasureCodeInterfaceRef erasure_code;
5403 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
5404 if (err == 0) {
5405 *size = erasure_code->get_chunk_count();
5406 *min_size = MIN(erasure_code->get_data_chunk_count() + 1, *size);
5407 }
5408 }
5409 break;
5410 default:
5411 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
5412 err = -EINVAL;
5413 break;
5414 }
5415 return err;
5416}
5417
5418int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
5419 const string &erasure_code_profile,
5420 uint32_t *stripe_width,
5421 ostream *ss)
5422{
5423 int err = 0;
5424 switch (pool_type) {
5425 case pg_pool_t::TYPE_REPLICATED:
5426 // ignored
5427 break;
5428 case pg_pool_t::TYPE_ERASURE:
5429 {
5430 ErasureCodeProfile profile =
5431 osdmap.get_erasure_code_profile(erasure_code_profile);
5432 ErasureCodeInterfaceRef erasure_code;
5433 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
5434 if (err)
5435 break;
5436 uint32_t data_chunks = erasure_code->get_data_chunk_count();
5437 uint32_t stripe_unit = g_conf->osd_pool_erasure_code_stripe_unit;
5438 auto it = profile.find("stripe_unit");
5439 if (it != profile.end()) {
5440 string err_str;
5441 stripe_unit = strict_si_cast<uint32_t>(it->second.c_str(), &err_str);
5442 assert(err_str.empty());
5443 }
5444 *stripe_width = data_chunks *
5445 erasure_code->get_chunk_size(stripe_unit * data_chunks);
5446 }
5447 break;
5448 default:
5449 *ss << "prepare_pool_stripe_width: "
5450 << pool_type << " is not a known pool type";
5451 err = -EINVAL;
5452 break;
5453 }
5454 return err;
5455}
5456
31f18b77 5457int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
224ce89b
WB
5458 const string &erasure_code_profile,
5459 const string &rule_name,
5460 int *crush_rule,
5461 ostream *ss)
7c673cae
FG
5462{
5463
31f18b77 5464 if (*crush_rule < 0) {
7c673cae
FG
5465 switch (pool_type) {
5466 case pg_pool_t::TYPE_REPLICATED:
5467 {
31f18b77 5468 if (rule_name == "") {
224ce89b 5469 // Use default rule
31f18b77
FG
5470 *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(g_ceph_context);
5471 if (*crush_rule < 0) {
5472 // Errors may happen e.g. if no valid rule is available
5473 *ss << "No suitable CRUSH rule exists, check "
7c673cae
FG
5474 << "'osd pool default crush *' config options";
5475 return -ENOENT;
5476 }
5477 } else {
31f18b77 5478 return get_crush_rule(rule_name, crush_rule, ss);
7c673cae
FG
5479 }
5480 }
5481 break;
5482 case pg_pool_t::TYPE_ERASURE:
5483 {
31f18b77 5484 int err = crush_rule_create_erasure(rule_name,
7c673cae 5485 erasure_code_profile,
31f18b77 5486 crush_rule, ss);
7c673cae
FG
5487 switch (err) {
5488 case -EALREADY:
31f18b77
FG
5489 dout(20) << "prepare_pool_crush_rule: rule "
5490 << rule_name << " try again" << dendl;
7c673cae
FG
5491 // fall through
5492 case 0:
5493 // need to wait for the crush rule to be proposed before proceeding
5494 err = -EAGAIN;
5495 break;
5496 case -EEXIST:
5497 err = 0;
5498 break;
5499 }
5500 return err;
5501 }
5502 break;
5503 default:
31f18b77 5504 *ss << "prepare_pool_crush_rule: " << pool_type
7c673cae
FG
5505 << " is not a known pool type";
5506 return -EINVAL;
5507 break;
5508 }
5509 } else {
31f18b77
FG
5510 if (!osdmap.crush->ruleset_exists(*crush_rule)) {
5511 *ss << "CRUSH rule " << *crush_rule << " not found";
7c673cae
FG
5512 return -ENOENT;
5513 }
5514 }
5515
5516 return 0;
5517}
5518
31f18b77 5519int OSDMonitor::get_crush_rule(const string &rule_name,
224ce89b
WB
5520 int *crush_rule,
5521 ostream *ss)
7c673cae
FG
5522{
5523 int ret;
31f18b77 5524 ret = osdmap.crush->get_rule_id(rule_name);
7c673cae
FG
5525 if (ret != -ENOENT) {
5526 // found it, use it
31f18b77 5527 *crush_rule = ret;
7c673cae
FG
5528 } else {
5529 CrushWrapper newcrush;
5530 _get_pending_crush(newcrush);
5531
31f18b77 5532 ret = newcrush.get_rule_id(rule_name);
7c673cae
FG
5533 if (ret != -ENOENT) {
5534 // found it, wait for it to be proposed
31f18b77 5535 dout(20) << __func__ << ": rule " << rule_name
7c673cae
FG
5536 << " try again" << dendl;
5537 return -EAGAIN;
5538 } else {
224ce89b 5539 // Cannot find it , return error
31f18b77 5540 *ss << "specified rule " << rule_name << " doesn't exist";
7c673cae
FG
5541 return ret;
5542 }
5543 }
5544 return 0;
5545}
5546
5547/**
5548 * @param name The name of the new pool
5549 * @param auid The auid of the pool owner. Can be -1
31f18b77
FG
5550 * @param crush_rule The crush rule to use. If <0, will use the system default
5551 * @param crush_rule_name The crush rule to use, if crush_rulset <0
7c673cae
FG
5552 * @param pg_num The pg_num to use. If set to 0, will use the system default
5553 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
5554 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
5555 * @param pool_type TYPE_ERASURE, or TYPE_REP
5556 * @param expected_num_objects expected number of objects on the pool
5557 * @param fast_read fast read type.
5558 * @param ss human readable error message, if any.
5559 *
5560 * @return 0 on success, negative errno on failure.
5561 */
5562int OSDMonitor::prepare_new_pool(string& name, uint64_t auid,
31f18b77
FG
5563 int crush_rule,
5564 const string &crush_rule_name,
7c673cae
FG
5565 unsigned pg_num, unsigned pgp_num,
5566 const string &erasure_code_profile,
5567 const unsigned pool_type,
5568 const uint64_t expected_num_objects,
5569 FastReadType fast_read,
5570 ostream *ss)
5571{
5572 if (name.length() == 0)
5573 return -EINVAL;
5574 if (pg_num == 0)
5575 pg_num = g_conf->osd_pool_default_pg_num;
5576 if (pgp_num == 0)
5577 pgp_num = g_conf->osd_pool_default_pgp_num;
5578 if (pg_num > (unsigned)g_conf->mon_max_pool_pg_num) {
5579 *ss << "'pg_num' must be greater than 0 and less than or equal to "
5580 << g_conf->mon_max_pool_pg_num
5581 << " (you may adjust 'mon max pool pg num' for higher values)";
5582 return -ERANGE;
5583 }
5584 if (pgp_num > pg_num) {
5585 *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
5586 << ", which in this case is " << pg_num;
5587 return -ERANGE;
5588 }
5589 if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
5590 *ss << "'fast_read' can only apply to erasure coding pool";
5591 return -EINVAL;
5592 }
5593 int r;
31f18b77
FG
5594 r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
5595 crush_rule_name, &crush_rule, ss);
7c673cae 5596 if (r) {
31f18b77 5597 dout(10) << " prepare_pool_crush_rule returns " << r << dendl;
7c673cae
FG
5598 return r;
5599 }
224ce89b
WB
5600 if (g_conf->mon_osd_crush_smoke_test) {
5601 CrushWrapper newcrush;
5602 _get_pending_crush(newcrush);
5603 ostringstream err;
5604 CrushTester tester(newcrush, err);
5605 tester.set_max_x(50);
5606 tester.set_rule(crush_rule);
5607 r = tester.test_with_fork(g_conf->mon_lease);
5608 if (r < 0) {
5609 dout(10) << " tester.test_with_fork returns " << r
5610 << ": " << err.str() << dendl;
5611 *ss << "crush test failed with " << r << ": " << err.str();
5612 return r;
5613 }
7c673cae
FG
5614 }
5615 unsigned size, min_size;
5616 r = prepare_pool_size(pool_type, erasure_code_profile, &size, &min_size, ss);
5617 if (r) {
5618 dout(10) << " prepare_pool_size returns " << r << dendl;
5619 return r;
5620 }
5621
31f18b77 5622 if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) {
7c673cae
FG
5623 return -EINVAL;
5624 }
5625
5626 uint32_t stripe_width = 0;
5627 r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
5628 if (r) {
5629 dout(10) << " prepare_pool_stripe_width returns " << r << dendl;
5630 return r;
5631 }
5632
5633 bool fread = false;
5634 if (pool_type == pg_pool_t::TYPE_ERASURE) {
5635 switch (fast_read) {
5636 case FAST_READ_OFF:
5637 fread = false;
5638 break;
5639 case FAST_READ_ON:
5640 fread = true;
5641 break;
5642 case FAST_READ_DEFAULT:
5643 fread = g_conf->mon_osd_pool_ec_fast_read;
5644 break;
5645 default:
5646 *ss << "invalid fast_read setting: " << fast_read;
5647 return -EINVAL;
5648 }
5649 }
5650
5651 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
5652 p != pending_inc.new_pool_names.end();
5653 ++p) {
5654 if (p->second == name)
5655 return 0;
5656 }
5657
5658 if (-1 == pending_inc.new_pool_max)
5659 pending_inc.new_pool_max = osdmap.pool_max;
5660 int64_t pool = ++pending_inc.new_pool_max;
5661 pg_pool_t empty;
5662 pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
5663 pi->type = pool_type;
5664 pi->fast_read = fread;
5665 pi->flags = g_conf->osd_pool_default_flags;
5666 if (g_conf->osd_pool_default_flag_hashpspool)
5667 pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
5668 if (g_conf->osd_pool_default_flag_nodelete)
5669 pi->set_flag(pg_pool_t::FLAG_NODELETE);
5670 if (g_conf->osd_pool_default_flag_nopgchange)
5671 pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
5672 if (g_conf->osd_pool_default_flag_nosizechange)
5673 pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
5674 if (g_conf->osd_pool_use_gmt_hitset &&
5675 (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT))
5676 pi->use_gmt_hitset = true;
5677 else
5678 pi->use_gmt_hitset = false;
5679
5680 pi->size = size;
5681 pi->min_size = min_size;
31f18b77 5682 pi->crush_rule = crush_rule;
7c673cae
FG
5683 pi->expected_num_objects = expected_num_objects;
5684 pi->object_hash = CEPH_STR_HASH_RJENKINS;
5685 pi->set_pg_num(pg_num);
5686 pi->set_pgp_num(pgp_num);
5687 pi->last_change = pending_inc.epoch;
5688 pi->auid = auid;
5689 pi->erasure_code_profile = erasure_code_profile;
5690 pi->stripe_width = stripe_width;
5691 pi->cache_target_dirty_ratio_micro =
5692 g_conf->osd_pool_default_cache_target_dirty_ratio * 1000000;
5693 pi->cache_target_dirty_high_ratio_micro =
5694 g_conf->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
5695 pi->cache_target_full_ratio_micro =
5696 g_conf->osd_pool_default_cache_target_full_ratio * 1000000;
5697 pi->cache_min_flush_age = g_conf->osd_pool_default_cache_min_flush_age;
5698 pi->cache_min_evict_age = g_conf->osd_pool_default_cache_min_evict_age;
5699 pending_inc.new_pool_names[pool] = name;
5700 return 0;
5701}
5702
5703bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
5704{
5705 op->mark_osdmon_event(__func__);
5706 ostringstream ss;
5707 if (pending_inc.new_flags < 0)
5708 pending_inc.new_flags = osdmap.get_flags();
5709 pending_inc.new_flags |= flag;
5710 ss << OSDMap::get_flag_string(flag) << " is set";
5711 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
5712 get_last_committed() + 1));
5713 return true;
5714}
5715
5716bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
5717{
5718 op->mark_osdmon_event(__func__);
5719 ostringstream ss;
5720 if (pending_inc.new_flags < 0)
5721 pending_inc.new_flags = osdmap.get_flags();
5722 pending_inc.new_flags &= ~flag;
5723 ss << OSDMap::get_flag_string(flag) << " is unset";
5724 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
5725 get_last_committed() + 1));
5726 return true;
5727}
5728
7c673cae
FG
/**
 * Handle `ceph osd pool set <pool> <var> <val>`: validate a single
 * per-pool setting and stage the change into pending_inc.
 *
 * @param cmdmap parsed command arguments ("pool", "var", "val", "force")
 * @param ss     human-readable status/error message for the caller's reply
 * @return 0 on success (change staged, or no-op), negative errno on error;
 *         note -EEXIST is used for a pg_num decrease attempt.
 */
int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
                                         stringstream& ss)
{
  string poolstr;
  cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
  int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
  if (pool < 0) {
    ss << "unrecognized pool '" << poolstr << "'";
    return -ENOENT;
  }
  string var;
  cmd_getval(g_ceph_context, cmdmap, "var", var);

  // Start from the committed pool, but layer on any change already
  // staged in this pending epoch so successive `pool set`s compose.
  pg_pool_t p = *osdmap.get_pg_pool(pool);
  if (pending_inc.new_pools.count(pool))
    p = pending_inc.new_pools[pool];

  // accept val as a json string in the normal case (current
  // generation monitor). parse out int or float values from the
  // string as needed. however, if it is not a string, try to pull
  // out an int, in case an older monitor with an older json schema is
  // forwarding a request.
  string val;
  string interr, floaterr;   // parse-error text; empty means the parse succeeded
  int64_t n = 0;
  double f = 0;
  int64_t uf = 0; // micro-f: f scaled by 1e6 for the *_micro ratio fields
  if (!cmd_getval(g_ceph_context, cmdmap, "val", val)) {
    // wasn't a string; maybe an older mon forwarded json with an int?
    if (!cmd_getval(g_ceph_context, cmdmap, "val", n))
      return -EINVAL; // no value!
  } else {
    // we got a string. see if it contains an int.
    n = strict_strtoll(val.c_str(), 10, &interr);
    // or a float
    f = strict_strtod(val.c_str(), &floaterr);
    uf = llrintl(f * (double)1000000.0);
  }

  // Cache-tiering settings are only meaningful on a tier pool.
  if (!p.is_tier() &&
      (var == "hit_set_type" || var == "hit_set_period" ||
       var == "hit_set_count" || var == "hit_set_fpp" ||
       var == "target_max_objects" || var == "target_max_bytes" ||
       var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
       var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
       var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
       var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
       var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
    return -EACCES;
  }

  if (var == "size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (p.type == pg_pool_t::TYPE_ERASURE) {
      ss << "can not change the size of an erasure-coded pool";
      return -ENOTSUP;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0 || n > 10) {
      ss << "pool size must be between 1 and 10";
      return -EINVAL;
    }
    p.size = n;
    // keep min_size consistent: it may never exceed size
    if (n < p.min_size)
      p.min_size = n;
  } else if (var == "min_size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }

    if (p.type != pg_pool_t::TYPE_ERASURE) {
      if (n < 1 || n > p.size) {
        ss << "pool min_size must be between 1 and " << (int)p.size;
        return -EINVAL;
      }
    } else {
      // for EC pools min_size must at least cover the data chunks (k)
      ErasureCodeInterfaceRef erasure_code;
      int k;
      stringstream tmp;
      int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
      if (err == 0) {
        k = erasure_code->get_data_chunk_count();
      } else {
        ss << __func__ << " get_erasure_code failed: " << tmp.rdbuf();
        return err;
      }

      if (n < k || n > p.size) {
        ss << "pool min_size must be between " << k << " and " << (int)p.size;
        return -EINVAL;
      }
    }
    p.min_size = n;
  } else if (var == "auid") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.auid = n;
  } else if (var == "crash_replay_interval") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.crash_replay_interval = n;
  } else if (var == "pg_num") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    // pg_num can only grow; equal is an idempotent no-op, smaller is an error
    if (n <= (int)p.get_pg_num()) {
      ss << "specified pg_num " << n << " <= current " << p.get_pg_num();
      if (n < (int)p.get_pg_num())
        return -EEXIST;
      return 0;
    }
    string force;
    cmd_getval(g_ceph_context,cmdmap, "force", force);
    if (p.cache_mode != pg_pool_t::CACHEMODE_NONE &&
        force != "--yes-i-really-mean-it") {
      ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
      return -EPERM;
    }
    // cap the number of new PGs created per OSD in one step
    int expected_osds = MIN(p.get_pg_num(), osdmap.get_num_osds());
    int64_t new_pgs = n - p.get_pg_num();
    if (new_pgs > g_conf->mon_osd_max_split_count * expected_osds) {
      ss << "specified pg_num " << n << " is too large (creating "
         << new_pgs << " new PGs on ~" << expected_osds
         << " OSDs exceeds per-OSD max of " << g_conf->mon_osd_max_split_count
         << ')';
      return -E2BIG;
    }
    p.set_pg_num(n);
    // force pre-luminous clients to resend their ops, since they
    // don't understand that split PGs now form a new interval.
    p.last_force_op_resend_preluminous = pending_inc.epoch;
  } else if (var == "pgp_num") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    // pgp_num may never exceed pg_num
    if (n > (int)p.get_pg_num()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
      return -EINVAL;
    }
    p.set_pgp_num(n);
  } else if (var == "crush_rule") {
    int id = osdmap.crush->get_rule_id(val);
    if (id == -ENOENT) {
      ss << "crush rule " << val << " does not exist";
      return -ENOENT;
    }
    if (id < 0) {
      ss << cpp_strerror(id);
      return -ENOENT;
    }
    // the rule must be usable with this pool's type and size
    if (!osdmap.crush->check_crush_rule(id, p.get_type(), p.get_size(), ss)) {
      return -EINVAL;
    }
    p.crush_rule = id;
  } else if (var == "nodelete" || var == "nopgchange" ||
             var == "nosizechange" || var == "write_fadvise_dontneed" ||
             var == "noscrub" || var == "nodeep-scrub") {
    // simple boolean pool flags
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hashpspool") {
    // changing hashpspool remaps every PG, so require explicit confirmation
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    string force;
    cmd_getval(g_ceph_context, cmdmap, "force", force);
    if (force != "--yes-i-really-mean-it") {
      ss << "are you SURE? this will remap all placement groups in this pool,"
            " this triggers large data movement,"
            " pass --yes-i-really-mean-it if you really do.";
      return -EPERM;
    }
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hit_set_type") {
    if (val == "none")
      p.hit_set_params = HitSet::Params();
    else {
      // non-trivial hit sets need all daemons to speak CACHEPOOL
      int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
      if (err)
        return err;
      if (val == "bloom") {
        BloomHitSet::Params *bsp = new BloomHitSet::Params;
        bsp->set_fpp(g_conf->osd_pool_default_hit_set_bloom_fpp);
        p.hit_set_params = HitSet::Params(bsp);
      } else if (val == "explicit_hash")
        p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
      else if (val == "explicit_object")
        p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
      else {
        ss << "unrecognized hit_set type '" << val << "'";
        return -EINVAL;
      }
    }
  } else if (var == "hit_set_period") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.hit_set_period = n;
  } else if (var == "hit_set_count") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.hit_set_count = n;
  } else if (var == "hit_set_fpp") {
    if (floaterr.length()) {
      ss << "error parsing floating point value '" << val << "': " << floaterr;
      return -EINVAL;
    }
    // false-positive probability only makes sense for a bloom hit set
    if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
      ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
      return -EINVAL;
    }
    BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
    bloomp->set_fpp(f);
  } else if (var == "use_gmt_hitset") {
    if (val == "true" || (interr.empty() && n == 1)) {
      if (!(osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT)) {
        ss << "not all OSDs support GMT hit set.";
        return -EINVAL;
      }
      p.use_gmt_hitset = true;
    } else {
      // NOTE: turning this off is not supported; only enabling is allowed
      ss << "expecting value 'true' or '1'";
      return -EINVAL;
    }
  } else if (var == "allow_ec_overwrites") {
    if (!p.is_erasure()) {
      ss << "ec overwrites can only be enabled for an erasure coded pool";
      return -EINVAL;
    }
    stringstream err;
    if (!g_conf->mon_debug_no_require_bluestore_for_ec_overwrites &&
        !is_pool_currently_all_bluestore(pool, p, &err)) {
      ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
      return -EINVAL;
    }
    if (val == "true" || (interr.empty() && n == 1)) {
      p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      // one-way switch: once enabled, overwrites may already exist on disk
      ss << "ec overwrites cannot be disabled once enabled";
      return -EINVAL;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "target_max_objects") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_objects = n;
  } else if (var == "target_max_bytes") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_bytes = n;
  } else if (var == "cache_target_dirty_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_dirty_ratio_micro = uf;
  } else if (var == "cache_target_dirty_high_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_dirty_high_ratio_micro = uf;
  } else if (var == "cache_target_full_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_full_ratio_micro = uf;
  } else if (var == "cache_min_flush_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_flush_age = n;
  } else if (var == "cache_min_evict_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_evict_age = n;
  } else if (var == "min_read_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_read_recency_for_promote = n;
  } else if (var == "hit_set_grade_decay_rate") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > 100 || n < 0) {
      ss << "value out of range,valid range is 0 - 100";
      return -EINVAL;
    }
    p.hit_set_grade_decay_rate = n;
  } else if (var == "hit_set_search_last_n") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > p.hit_set_count || n < 0) {
      ss << "value out of range,valid range is 0 - hit_set_count";
      return -EINVAL;
    }
    p.hit_set_search_last_n = n;
  } else if (var == "min_write_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_write_recency_for_promote = n;
  } else if (var == "fast_read") {
    if (p.is_replicated()) {
      ss << "fast read is not supported in replication pool";
      return -EINVAL;
    }
    if (val == "true" || (interr.empty() && n == 1)) {
      p.fast_read = true;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.fast_read = false;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (pool_opts_t::is_opt_name(var)) {
    // generic per-pool option; "unset" removes the option entirely
    bool unset = val == "unset";
    if (var == "compression_mode") {
      if (!unset) {
        auto cmode = Compressor::get_comp_mode_type(val);
        if (!cmode) {
          ss << "unrecognized compression mode '" << val << "'";
          return -EINVAL;
        }
      }
    } else if (var == "compression_algorithm") {
      if (!unset) {
        auto alg = Compressor::get_comp_alg_type(val);
        if (!alg) {
          ss << "unrecognized compression_algorithm '" << val << "'";
          return -EINVAL;
        }
      }
    } else if (var == "compression_required_ratio") {
      if (floaterr.length()) {
        ss << "error parsing float value '" << val << "': " << floaterr;
        return -EINVAL;
      }
      if (f < 0 || f > 1) {
        ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
        return -EINVAL;
      }
    } else if (var == "csum_type") {
      auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
      if (t < 0 ) {
        ss << "unrecognized csum_type '" << val << "'";
        return -EINVAL;
      }
      //preserve csum_type numeric value
      n = t;
      interr.clear();
    } else if (var == "compression_max_blob_size" ||
               var == "compression_min_blob_size" ||
               var == "csum_max_block" ||
               var == "csum_min_block") {
      if (interr.length()) {
        ss << "error parsing int value '" << val << "': " << interr;
        return -EINVAL;
      }
    }

    // stage the typed option value (0/empty unsets INT/DOUBLE options)
    pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
    switch (desc.type) {
    case pool_opts_t::STR:
      if (unset) {
        p.opts.unset(desc.key);
      } else {
        p.opts.set(desc.key, static_cast<std::string>(val));
      }
      break;
    case pool_opts_t::INT:
      if (interr.length()) {
        ss << "error parsing integer value '" << val << "': " << interr;
        return -EINVAL;
      }
      if (n == 0) {
        p.opts.unset(desc.key);
      } else {
        p.opts.set(desc.key, static_cast<int>(n));
      }
      break;
    case pool_opts_t::DOUBLE:
      if (floaterr.length()) {
        ss << "error parsing floating point value '" << val << "': " << floaterr;
        return -EINVAL;
      }
      if (f == 0) {
        p.opts.unset(desc.key);
      } else {
        p.opts.set(desc.key, static_cast<double>(f));
      }
      break;
    default:
      assert(!"unknown type");
    }
  } else {
    ss << "unrecognized variable '" << var << "'";
    return -EINVAL;
  }
  if (val != "unset") {
    ss << "set pool " << pool << " " << var << " to " << val;
  } else {
    ss << "unset pool " << pool << " " << var;
  }
  // commit the modified pool into the pending incremental
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool] = p;
  return 0;
}
6210
31f18b77
FG
6211int OSDMonitor::_prepare_command_osd_crush_remove(
6212 CrushWrapper &newcrush,
6213 int32_t id,
6214 int32_t ancestor,
6215 bool has_ancestor,
6216 bool unlink_only)
6217{
6218 int err = 0;
6219
6220 if (has_ancestor) {
6221 err = newcrush.remove_item_under(g_ceph_context, id, ancestor,
6222 unlink_only);
6223 } else {
6224 err = newcrush.remove_item(g_ceph_context, id, unlink_only);
6225 }
6226 return err;
6227}
6228
// Stage the given (already modified) crush map into the pending
// incremental, replacing any previously staged crush blob.  Encoded
// with the quorum's feature set so all monitors can decode it.
void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
{
  pending_inc.crush.clear();
  newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
}
6234
6235int OSDMonitor::prepare_command_osd_crush_remove(
6236 CrushWrapper &newcrush,
6237 int32_t id,
6238 int32_t ancestor,
6239 bool has_ancestor,
6240 bool unlink_only)
6241{
6242 int err = _prepare_command_osd_crush_remove(
6243 newcrush, id, ancestor,
6244 has_ancestor, unlink_only);
6245
6246 if (err < 0)
6247 return err;
6248
6249 assert(err == 0);
6250 do_osd_crush_remove(newcrush);
6251
6252 return 0;
6253}
6254
6255int OSDMonitor::prepare_command_osd_remove(int32_t id)
6256{
6257 if (osdmap.is_up(id)) {
6258 return -EBUSY;
6259 }
6260
6261 pending_inc.new_state[id] = osdmap.get_state(id);
6262 pending_inc.new_uuid[id] = uuid_d();
6263 pending_metadata_rm.insert(id);
6264 pending_metadata.erase(id);
6265
6266 return 0;
6267}
6268
6269int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
6270{
6271 assert(existing_id);
6272 *existing_id = -1;
6273
6274 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
6275 if (!osdmap.exists(i) &&
6276 pending_inc.new_up_client.count(i) == 0 &&
6277 (pending_inc.new_state.count(i) == 0 ||
6278 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
6279 *existing_id = i;
6280 return -1;
6281 }
6282 }
6283
6284 if (pending_inc.new_max_osd < 0) {
6285 return osdmap.get_max_osd();
6286 }
6287 return pending_inc.new_max_osd;
6288}
6289
/**
 * Stage creation of an osd in the pending incremental.
 *
 * Resolution order: an existing osd matching `uuid` wins; otherwise an
 * explicitly requested `id` is used; otherwise an id is found or
 * allocated via _allocate_osd_id().  The chosen id is returned through
 * *new_id.  Validation (uuid/id conflicts) is assumed to have happened
 * already; violations here are programming errors and assert.
 */
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  assert(new_id);

  // We presume validation has been performed prior to calling this
  // function. We assert with prejudice.

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      // uuid already maps to an osd; caller-supplied id (if any) must agree
      assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
           << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    // reusing a hole in the id space (below the current max)
    assert(existing_id < osdmap.get_max_osd());
    assert(allocated_id < 0);
    pending_inc.new_weight[existing_id] = CEPH_OSD_OUT;
    *new_id = existing_id;

  } else if (allocated_id >= 0) {
    assert(existing_id < 0);
    // raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    assert(*new_id == allocated_id);
  } else {
    assert(0 == "unexpected condition");
  }

out:
  dout(10) << __func__ << " using id " << *new_id << dendl;
  // an explicitly requested id may still lie beyond max_osd; grow if needed
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  pending_inc.new_state[*new_id] |= CEPH_OSD_EXISTS | CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
6351
/**
 * Validate an `osd create` / `osd new` request against the committed and
 * pending maps.
 *
 * Return values:
 *   0        - creation may proceed (nothing conflicting found)
 *   EEXIST   - (positive!) osd with this uuid already exists and matches
 *              `id`; *existing_id is set — an idempotent no-op for callers
 *   -EEXIST  - uuid is already bound to a *different* id
 *   -EAGAIN  - a matching creation is already staged in pending_inc
 *   -EINVAL  - `id` is in use by an osd with a different uuid
 *              (only when check_osd_exists is set)
 */
int OSDMonitor::validate_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const bool check_osd_exists,
    int32_t* existing_id,
    stringstream& ss)
{

  dout(10) << __func__ << " id " << id << " uuid " << uuid
           << " check_osd_exists " << check_osd_exists << dendl;

  assert(existing_id);

  if (id < 0 && uuid.is_zero()) {
    // we have nothing to validate
    *existing_id = -1;
    return 0;
  } else if (uuid.is_zero()) {
    // we have an id but we will ignore it - because that's what
    // `osd create` does.
    return 0;
  }

  /*
   * This function will be used to validate whether we are able to
   * create a new osd when the `uuid` is specified.
   *
   * It will be used by both `osd create` and `osd new`, as the checks
   * are basically the same when it pertains to osd id and uuid validation.
   * However, `osd create` presumes an `uuid` is optional, for legacy
   * reasons, while `osd new` requires the `uuid` to be provided. This
   * means that `osd create` will not be idempotent if an `uuid` is not
   * provided, but we will always guarantee the idempotency of `osd new`.
   */

  assert(!uuid.is_zero());
  if (pending_inc.identify_osd(uuid) >= 0) {
    // osd is about to exist
    return -EAGAIN;
  }

  int32_t i = osdmap.identify_osd(uuid);
  if (i >= 0) {
    // osd already exists
    if (id >= 0 && i != id) {
      ss << "uuid " << uuid << " already in use for different id " << i;
      return -EEXIST;
    }
    // return a positive errno to distinguish between a blocking error
    // and an error we consider to not be a problem (i.e., this would be
    // an idempotent operation).
    *existing_id = i;
    return EEXIST;
  }
  // i < 0: uuid not in the committed map
  if (id >= 0) {
    if (pending_inc.new_state.count(id)) {
      // osd is about to exist
      return -EAGAIN;
    }
    // we may not care if an osd exists if we are recreating a previously
    // destroyed osd.
    if (check_osd_exists && osdmap.exists(id)) {
      ss << "id " << id << " already in use and does not match uuid "
         << uuid;
      return -EINVAL;
    }
  }
  return 0;
}
6422
6423int OSDMonitor::prepare_command_osd_create(
6424 const int32_t id,
6425 const uuid_d& uuid,
6426 int32_t* existing_id,
6427 stringstream& ss)
6428{
6429 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
6430 assert(existing_id);
6431
6432 if (uuid.is_zero()) {
6433 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
6434 }
6435
6436 return validate_osd_create(id, uuid, true, existing_id, ss);
6437}
6438
/**
 * Handle `ceph osd new <uuid> [<id>] [-i <secrets.json>]`.
 *
 * Creates a brand-new osd, recreates a previously destroyed one, or
 * recognizes an idempotent repeat (same uuid/id and matching secrets).
 * Updates the authmon (cephx keys) and config-key service (dm-crypt
 * lockbox) as needed, then stages the osdmap change in pending_inc.
 *
 * @param op      originating monitor request (paxos must be plugged)
 * @param cmdmap  parsed command arguments ("uuid" required, "id" optional)
 * @param secrets optional cephx/lockbox/dmcrypt secrets from `-i`
 * @param ss      plain-text reply (osd id on success)
 * @param f       optional formatter for structured output
 * @return 0 on success, positive EEXIST for an idempotent no-op,
 *         negative errno on error.
 */
int OSDMonitor::prepare_command_osd_new(
    MonOpRequestRef op,
    const map<string,cmd_vartype>& cmdmap,
    const map<string,string>& secrets,
    stringstream &ss,
    Formatter *f)
{
  uuid_d uuid;
  string uuidstr;
  int64_t id = -1;

  assert(paxos->is_plugged());

  dout(10) << __func__ << " " << op << dendl;

  /* validate command. abort now if something's wrong. */

  /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
   *
   * If `id` is not specified, we will identify any existing osd based
   * on `uuid`. Operation will be idempotent iff secrets match.
   *
   * If `id` is specified, we will identify any existing osd based on
   * `uuid` and match against `id`. If they match, operation will be
   * idempotent iff secrets match.
   *
   * `-i secrets.json` will be optional. If supplied, will be used
   * to check for idempotency when `id` and `uuid` match.
   *
   * If `id` is not specified, and `uuid` does not exist, an id will
   * be found or allocated for the osd.
   *
   * If `id` is specified, and the osd has been previously marked
   * as destroyed, then the `id` will be reused.
   */
  if (!cmd_getval(g_ceph_context, cmdmap, "uuid", uuidstr)) {
    ss << "requires the OSD's UUID to be specified.";
    return -EINVAL;
  } else if (!uuid.parse(uuidstr.c_str())) {
    ss << "invalid UUID value '" << uuidstr << "'.";
    return -EINVAL;
  }

  if (cmd_getval(g_ceph_context, cmdmap, "id", id) &&
      (id < 0)) {
    ss << "invalid OSD id; must be greater or equal than zero.";
    return -EINVAL;
  }

  // are we running an `osd create`-like command, or recreating
  // a previously destroyed osd?

  bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));

  // we will care about `id` to assess whether osd is `destroyed`, or
  // to create a new osd.
  // we will need an `id` by the time we reach auth.

  int32_t existing_id = -1;
  int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
                                &existing_id, ss);

  bool may_be_idempotent = false;
  if (err == EEXIST) {
    // this is idempotent from the osdmon's point-of-view
    may_be_idempotent = true;
    assert(existing_id >= 0);
    id = existing_id;
  } else if (err < 0) {
    return err;
  }

  if (!may_be_idempotent) {
    // idempotency is out of the window. We are either creating a new
    // osd or recreating a destroyed osd.
    //
    // We now need to figure out if we have an `id` (and if it's valid),
    // of find an `id` if we don't have one.

    // NOTE: we need to consider the case where the `id` is specified for
    // `osd create`, and we must honor it. So this means checking if
    // the `id` is destroyed, and if so assume the destroy; otherwise,
    // check if it `exists` - in which case we complain about not being
    // `destroyed`. In the end, if nothing fails, we must allow the
    // creation, so that we are compatible with `create`.
    if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
      ss << "OSD " << id << " has not yet been destroyed";
      return -EINVAL;
    } else if (id < 0) {
      // find an `id`
      id = _allocate_osd_id(&existing_id);
      if (id < 0) {
        // _allocate_osd_id found a reusable hole instead of a fresh id
        assert(existing_id >= 0);
        id = existing_id;
      }
      dout(10) << __func__ << " found id " << id << " to use" << dendl;
    } else if (id >= 0 && osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " recreating osd." << id << dendl;
    } else {
      dout(10) << __func__ << " creating new osd." << id << dendl;
    }
  } else {
    assert(id >= 0);
    assert(osdmap.exists(id));
  }

  // we are now able to either create a brand new osd or reuse an existing
  // osd that has been previously destroyed.

  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;

  if (may_be_idempotent && secrets.empty()) {
    // nothing to do, really.
    dout(10) << __func__ << " idempotent and no secrets -- no op." << dendl;
    assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }

  string cephx_secret, lockbox_secret, dmcrypt_key;
  bool has_lockbox = false;
  bool has_secrets = (!secrets.empty());

  ConfigKeyService *svc = nullptr;
  AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;

  if (has_secrets) {
    if (secrets.count("cephx_secret") == 0) {
      ss << "requires a cephx secret.";
      return -EINVAL;
    }
    cephx_secret = secrets.at("cephx_secret");

    bool has_lockbox_secret = (secrets.count("cephx_lockbox_secret") > 0);
    bool has_dmcrypt_key = (secrets.count("dmcrypt_key") > 0);

    dout(10) << __func__ << " has lockbox " << has_lockbox_secret
             << " dmcrypt " << has_dmcrypt_key << dendl;

    // lockbox secret and dm-crypt key are all-or-nothing
    if (has_lockbox_secret && has_dmcrypt_key) {
      has_lockbox = true;
      lockbox_secret = secrets.at("cephx_lockbox_secret");
      dmcrypt_key = secrets.at("dmcrypt_key");
    } else if (!has_lockbox_secret != !has_dmcrypt_key) {
      ss << "requires both a cephx lockbox secret and a dm-crypt key.";
      return -EINVAL;
    }

    dout(10) << __func__ << " validate secrets using osd id " << id << dendl;

    err = mon->authmon()->validate_osd_new(id, uuid,
                                           cephx_secret,
                                           lockbox_secret,
                                           cephx_entity,
                                           lockbox_entity,
                                           ss);
    if (err < 0) {
      return err;
    } else if (may_be_idempotent && err != EEXIST) {
      // for this to be idempotent, `id` should already be >= 0; no need
      // to use validate_id.
      assert(id >= 0);
      ss << "osd." << id << " exists but secrets do not match";
      return -EEXIST;
    }

    if (has_lockbox) {
      svc = (ConfigKeyService*)mon->config_key_service;
      err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
      if (err < 0) {
        return err;
      } else if (may_be_idempotent && err != EEXIST) {
        assert(id >= 0);
        ss << "osd." << id << " exists but dm-crypt key does not match.";
        return -EEXIST;
      }
    }
  }
  assert(!has_secrets || !cephx_secret.empty());
  assert(!has_lockbox || !lockbox_secret.empty());

  if (may_be_idempotent) {
    // we have nothing to do for either the osdmon or the authmon,
    // and we have no lockbox - so the config key service will not be
    // touched. This is therefore an idempotent operation, and we can
    // just return right away.
    dout(10) << __func__ << " idempotent -- no op." << dendl;
    assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }
  assert(!may_be_idempotent);

  // perform updates.
  if (has_secrets) {
    assert(!cephx_secret.empty());
    assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
           (!lockbox_secret.empty() && !dmcrypt_key.empty()));

    err = mon->authmon()->do_osd_new(cephx_entity,
                                     lockbox_entity,
                                     has_lockbox);
    assert(0 == err);

    if (has_lockbox) {
      assert(nullptr != svc);
      svc->do_osd_new(uuid, dmcrypt_key);
    }
  }

  if (is_recreate_destroyed) {
    // reuse the destroyed id: clear DESTROYED, mark NEW, restore uuid
    assert(id >= 0);
    assert(osdmap.is_destroyed(id));
    pending_inc.new_weight[id] = CEPH_OSD_OUT;
    pending_inc.new_state[id] |= CEPH_OSD_DESTROYED | CEPH_OSD_NEW;
    pending_inc.new_uuid[id] = uuid;
  } else {
    assert(id >= 0);
    int32_t new_id = -1;
    do_osd_create(id, uuid, &new_id);
    assert(new_id >= 0);
    assert(id == new_id);
  }

  if (f) {
    f->open_object_section("created_osd");
    f->dump_int("osdid", id);
    f->close_section();
  } else {
    ss << id;
  }

  return 0;
}
6686
7c673cae
FG
6687bool OSDMonitor::prepare_command(MonOpRequestRef op)
6688{
6689 op->mark_osdmon_event(__func__);
6690 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
6691 stringstream ss;
6692 map<string, cmd_vartype> cmdmap;
6693 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
6694 string rs = ss.str();
6695 mon->reply_command(op, -EINVAL, rs, get_last_committed());
6696 return true;
6697 }
6698
6699 MonSession *session = m->get_session();
6700 if (!session) {
6701 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
6702 return true;
6703 }
6704
6705 return prepare_command_impl(op, cmdmap);
6706}
6707
6708static int parse_reweights(CephContext *cct,
6709 const map<string,cmd_vartype> &cmdmap,
6710 const OSDMap& osdmap,
6711 map<int32_t, uint32_t>* weights)
6712{
6713 string weights_str;
6714 if (!cmd_getval(g_ceph_context, cmdmap, "weights", weights_str)) {
6715 return -EINVAL;
6716 }
6717 std::replace(begin(weights_str), end(weights_str), '\'', '"');
6718 json_spirit::mValue json_value;
6719 if (!json_spirit::read(weights_str, json_value)) {
6720 return -EINVAL;
6721 }
6722 if (json_value.type() != json_spirit::obj_type) {
6723 return -EINVAL;
6724 }
6725 const auto obj = json_value.get_obj();
6726 try {
6727 for (auto& osd_weight : obj) {
6728 auto osd_id = std::stoi(osd_weight.first);
6729 if (!osdmap.exists(osd_id)) {
6730 return -ENOENT;
6731 }
6732 if (osd_weight.second.type() != json_spirit::str_type) {
6733 return -EINVAL;
6734 }
6735 auto weight = std::stoul(osd_weight.second.get_str());
6736 weights->insert({osd_id, weight});
6737 }
6738 } catch (const std::logic_error& e) {
6739 return -EINVAL;
6740 }
6741 return 0;
6742}
6743
31f18b77
FG
/*
 * Destroy osd.<id>: remove its cephx and config-key state and mark it
 * CEPH_OSD_DESTROYED (with a zeroed uuid) in the pending incremental.
 *
 * Returns 0 on success, including the idempotent case where the osd is
 * already marked destroyed; -ENOENT if the osd does not exist in the
 * map at all; or a negative error from auth validation.  The caller
 * must propose the pending changes itself (see note at the bottom).
 */
int OSDMonitor::prepare_command_osd_destroy(
    int32_t id,
    stringstream& ss)
{
  // the caller must have plugged paxos so that our updates and the
  // caller's are proposed together.
  assert(paxos->is_plugged());

  // we check if the osd exists for the benefit of `osd purge`, which may
  // have previously removed the osd. If the osd does not exist, return
  // -ENOENT to convey this, and let the caller deal with it.
  //
  // we presume that all auth secrets and config keys were removed prior
  // to this command being called. if they exist by now, we also assume
  // they must have been created by some other command and do not pertain
  // to this non-existent osd.
  if (!osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
    return -ENOENT;
  }

  uuid_d uuid = osdmap.get_uuid(id);
  dout(10) << __func__ << " destroying osd." << id
           << " uuid " << uuid << dendl;

  // if it has been destroyed, we assume our work here is done.
  if (osdmap.is_destroyed(id)) {
    ss << "destroyed osd." << id;
    return 0;
  }

  EntityName cephx_entity, lockbox_entity;
  bool idempotent_auth = false, idempotent_cks = false;

  // validate removal of the osd's auth entities; -ENOENT means they are
  // already gone, which we record and treat as an idempotent no-op below.
  int err = mon->authmon()->validate_osd_destroy(id, uuid,
                                                 cephx_entity,
                                                 lockbox_entity,
                                                 ss);
  if (err < 0) {
    if (err == -ENOENT) {
      idempotent_auth = true;
    } else {
      return err;
    }
  }

  // same idempotency dance for the osd's config-key entries; only
  // -ENOENT is tolerated here.
  ConfigKeyService *svc = (ConfigKeyService*)mon->config_key_service;
  err = svc->validate_osd_destroy(id, uuid);
  if (err < 0) {
    assert(err == -ENOENT);
    err = 0;
    idempotent_cks = true;
  }

  // validation above succeeded, so these destructive calls are expected
  // to succeed; skip whichever was already gone.
  if (!idempotent_auth) {
    err = mon->authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
    assert(0 == err);
  }

  if (!idempotent_cks) {
    svc->do_osd_destroy(id, uuid);
  }

  // mark the osd destroyed and reset its uuid in the pending incremental.
  pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
  pending_inc.new_uuid[id] = uuid_d();

  // we can only propose_pending() once per service, otherwise we'll be
  // defying PaxosService and all laws of nature. Therefore, as we may
  // be used during 'osd purge', let's keep the caller responsible for
  // proposing.
  assert(err == 0);
  return 0;
}
6815
/*
 * Purge osd.<id> completely: its crush entry, its auth/config-key state
 * (via prepare_command_osd_destroy()), and finally the osdmap entry
 * itself (via prepare_command_osd_remove()).
 *
 * Returns 0 on success, -ENOENT if every step turned out to be a no-op
 * and the osd does not exist (fully idempotent case), or a negative
 * error if the crush removal or the destroy step fails before any
 * update has been staged.  The caller must propose the pending changes.
 */
int OSDMonitor::prepare_command_osd_purge(
    int32_t id,
    stringstream& ss)
{
  // caller must have plugged paxos; see the comment block below for why
  // the whole sequence must commit together.
  assert(paxos->is_plugged());
  dout(10) << __func__ << " purging osd." << id << dendl;

  // purging a running osd is never allowed; the caller checks this.
  assert(!osdmap.is_up(id));

  /*
   * This may look a bit weird, but this is what's going to happen:
   *
   *  1. we make sure that removing from crush works
   *  2. we call `prepare_command_osd_destroy()`. If it returns an
   *     error, then we abort the whole operation, as no updates
   *     have been made. However, we this function will have
   *     side-effects, thus we need to make sure that all operations
   *     performed henceforth will *always* succeed.
   *  3. we call `prepare_command_osd_remove()`. Although this
   *     function can return an error, it currently only checks if the
   *     osd is up - and we have made sure that it is not so, so there
   *     is no conflict, and it is effectively an update.
   *  4. finally, we call `do_osd_crush_remove()`, which will perform
   *     the crush update we delayed from before.
   */

  CrushWrapper newcrush;
  _get_pending_crush(newcrush);

  // tracks whether every step so far was a no-op; if so, and the osd
  // does not exist, we report -ENOENT for idempotency.
  bool may_be_idempotent = false;

  // step 1: dry-run the crush removal (last flag false = don't apply yet);
  // -ENOENT just means the osd was never in crush.
  int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
  if (err == -ENOENT) {
    err = 0;
    may_be_idempotent = true;
  } else if (err < 0) {
    ss << "error removing osd." << id << " from crush";
    return err;
  }

  // no point destroying the osd again if it has already been marked destroyed
  if (!osdmap.is_destroyed(id)) {
    // step 2: destroy auth/config-key state and mark destroyed.
    err = prepare_command_osd_destroy(id, ss);
    if (err < 0) {
      if (err == -ENOENT) {
	err = 0;
      } else {
	return err;
      }
    } else {
      may_be_idempotent = false;
    }
  }
  assert(0 == err);

  if (may_be_idempotent && !osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist and "
             << "we are idempotent." << dendl;
    return -ENOENT;
  }

  // step 3: remove the osd from the osdmap.
  err = prepare_command_osd_remove(id);
  // we should not be busy, as we should have made sure this id is not up.
  assert(0 == err);

  // step 4: apply the crush removal we validated in step 1.
  do_osd_crush_remove(newcrush);
  return 0;
}
6884
7c673cae
FG
6885bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
6886 map<string,cmd_vartype> &cmdmap)
6887{
6888 op->mark_osdmon_event(__func__);
6889 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
6890 bool ret = false;
6891 stringstream ss;
6892 string rs;
6893 bufferlist rdata;
6894 int err = 0;
6895
6896 string format;
6897 cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
6898 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6899
6900 string prefix;
6901 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
6902
6903 int64_t osdid;
6904 string name;
6905 bool osdid_present = cmd_getval(g_ceph_context, cmdmap, "id", osdid);
6906 if (osdid_present) {
6907 ostringstream oss;
6908 oss << "osd." << osdid;
6909 name = oss.str();
6910 }
6911
6912 // Even if there's a pending state with changes that could affect
6913 // a command, considering that said state isn't yet committed, we
6914 // just don't care about those changes if the command currently being
6915 // handled acts as a no-op against the current committed state.
6916 // In a nutshell, we assume this command happens *before*.
6917 //
6918 // Let me make this clearer:
6919 //
6920 // - If we have only one client, and that client issues some
6921 // operation that would conflict with this operation but is
6922 // still on the pending state, then we would be sure that said
6923 // operation wouldn't have returned yet, so the client wouldn't
6924 // issue this operation (unless the client didn't wait for the
6925 // operation to finish, and that would be the client's own fault).
6926 //
6927 // - If we have more than one client, each client will observe
6928 // whatever is the state at the moment of the commit. So, if we
6929 // have two clients, one issuing an unlink and another issuing a
6930 // link, and if the link happens while the unlink is still on the
6931 // pending state, from the link's point-of-view this is a no-op.
6932 // If different clients are issuing conflicting operations and
6933 // they care about that, then the clients should make sure they
6934 // enforce some kind of concurrency mechanism -- from our
6935 // perspective that's what Douglas Adams would call an SEP.
6936 //
6937 // This should be used as a general guideline for most commands handled
6938 // in this function. Adapt as you see fit, but please bear in mind that
6939 // this is the expected behavior.
6940
6941
6942 if (prefix == "osd setcrushmap" ||
6943 (prefix == "osd crush set" && !osdid_present)) {
31f18b77
FG
6944 if (pending_inc.crush.length()) {
6945 dout(10) << __func__ << " waiting for pending crush update " << dendl;
6946 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
6947 return true;
6948 }
7c673cae
FG
6949 dout(10) << "prepare_command setting new crush map" << dendl;
6950 bufferlist data(m->get_data());
6951 CrushWrapper crush;
6952 try {
6953 bufferlist::iterator bl(data.begin());
6954 crush.decode(bl);
6955 }
6956 catch (const std::exception &e) {
6957 err = -EINVAL;
6958 ss << "Failed to parse crushmap: " << e.what();
6959 goto reply;
6960 }
31f18b77
FG
6961
6962 int64_t prior_version = 0;
6963 if (cmd_getval(g_ceph_context, cmdmap, "prior_version", prior_version)) {
6964 if (prior_version == osdmap.get_crush_version() - 1) {
6965 // see if we are a resend of the last update. this is imperfect
6966 // (multiple racing updaters may not both get reliable success)
6967 // but we expect crush updaters (via this interface) to be rare-ish.
6968 bufferlist current, proposed;
6969 osdmap.crush->encode(current, mon->get_quorum_con_features());
6970 crush.encode(proposed, mon->get_quorum_con_features());
6971 if (current.contents_equal(proposed)) {
6972 dout(10) << __func__
6973 << " proposed matches current and version equals previous"
6974 << dendl;
6975 err = 0;
6976 ss << osdmap.get_crush_version();
6977 goto reply;
6978 }
6979 }
6980 if (prior_version != osdmap.get_crush_version()) {
6981 err = -EPERM;
6982 ss << "prior_version " << prior_version << " != crush version "
6983 << osdmap.get_crush_version();
6984 goto reply;
6985 }
6986 }
7c673cae 6987
31f18b77
FG
6988 if (crush.has_legacy_rulesets()) {
6989 err = -EINVAL;
6990 ss << "crush maps with ruleset != ruleid are no longer allowed";
6991 goto reply;
6992 }
7c673cae
FG
6993 if (!validate_crush_against_features(&crush, ss)) {
6994 err = -EINVAL;
6995 goto reply;
6996 }
31f18b77 6997
7c673cae
FG
6998 const auto& osdmap_pools = osdmap.get_pools();
6999 for (auto pit = osdmap_pools.begin(); pit != osdmap_pools.end(); ++pit) {
7000 const int64_t pool_id = pit->first;
7001 const pg_pool_t &pool = pit->second;
31f18b77 7002 int ruleno = pool.get_crush_rule();
7c673cae
FG
7003 if (!crush.rule_exists(ruleno)) {
7004 ss << " the crush rule no "<< ruleno << " for pool id " << pool_id << " is in use";
7005 err = -EINVAL;
7006 goto reply;
7007 }
7008 }
7009
224ce89b
WB
7010 if (g_conf->mon_osd_crush_smoke_test) {
7011 // sanity check: test some inputs to make sure this map isn't
7012 // totally broken
7013 dout(10) << " testing map" << dendl;
7014 stringstream ess;
7015 CrushTester tester(crush, ess);
7016 tester.set_max_x(50);
7017 int r = tester.test_with_fork(g_conf->mon_lease);
7018 if (r < 0) {
7019 dout(10) << " tester.test_with_fork returns " << r
7020 << ": " << ess.str() << dendl;
7021 ss << "crush smoke test failed with " << r << ": " << ess.str();
7022 err = r;
7023 goto reply;
7024 }
7025 dout(10) << " crush test result " << ess.str() << dendl;
7c673cae
FG
7026 }
7027
7c673cae 7028 pending_inc.crush = data;
31f18b77 7029 ss << osdmap.get_crush_version() + 1;
7c673cae
FG
7030 goto update;
7031
7032 } else if (prefix == "osd crush set-device-class") {
224ce89b
WB
7033 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
7034 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
7035 << "luminous' before using crush device classes";
7036 err = -EPERM;
7c673cae
FG
7037 goto reply;
7038 }
7039
7040 string device_class;
7041 if (!cmd_getval(g_ceph_context, cmdmap, "class", device_class)) {
7042 err = -EINVAL; // no value!
7043 goto reply;
7044 }
7045
224ce89b
WB
7046 bool stop = false;
7047 vector<string> idvec;
7048 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
7c673cae
FG
7049 CrushWrapper newcrush;
7050 _get_pending_crush(newcrush);
224ce89b
WB
7051 set<int> updated;
7052 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
7053 set<int> osds;
7054 // wildcard?
7055 if (j == 0 &&
7056 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
7057 osdmap.get_all_osds(osds);
7058 stop = true;
7059 } else {
7060 // try traditional single osd way
7061 long osd = parse_osd_id(idvec[j].c_str(), &ss);
7062 if (osd < 0) {
7063 // ss has reason for failure
7064 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
7065 err = -EINVAL;
7066 continue;
7067 }
7068 osds.insert(osd);
7069 }
7c673cae 7070
224ce89b
WB
7071 for (auto &osd : osds) {
7072 if (!osdmap.exists(osd)) {
7073 ss << "osd." << osd << " does not exist. ";
7074 continue;
7075 }
7c673cae 7076
224ce89b
WB
7077 ostringstream oss;
7078 oss << "osd." << osd;
7079 string name = oss.str();
7c673cae 7080
224ce89b
WB
7081 string action;
7082 if (newcrush.item_exists(osd)) {
7083 action = "updating";
7084 } else {
7085 action = "creating";
7086 newcrush.set_item_name(osd, name);
7087 }
7c673cae 7088
224ce89b
WB
7089 dout(5) << action << " crush item id " << osd << " name '" << name
7090 << "' device_class '" << device_class << "'"
7091 << dendl;
7092 err = newcrush.update_device_class(osd, device_class, name, &ss);
7093 if (err < 0) {
7094 goto reply;
7095 }
7096 if (err == 0 && !_have_pending_crush()) {
7097 if (!stop) {
7098 // for single osd only, wildcard makes too much noise
7099 ss << "set-device-class item id " << osd << " name '" << name
7100 << "' device_class '" << device_class << "': no change";
7101 }
7102 } else {
7103 updated.insert(osd);
7104 }
7105 }
7c673cae
FG
7106 }
7107
224ce89b
WB
7108 if (!updated.empty()) {
7109 pending_inc.crush.clear();
7110 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7111 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
7112 getline(ss, rs);
7113 wait_for_finished_proposal(op,
7114 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
7115 return true;
7116 }
7c673cae
FG
7117
7118 } else if (prefix == "osd crush add-bucket") {
7119 // os crush add-bucket <name> <type>
7120 string name, typestr;
7121 cmd_getval(g_ceph_context, cmdmap, "name", name);
7122 cmd_getval(g_ceph_context, cmdmap, "type", typestr);
7123
7124 if (!_have_pending_crush() &&
7125 _get_stable_crush().name_exists(name)) {
7126 ss << "bucket '" << name << "' already exists";
7127 goto reply;
7128 }
7129
7130 CrushWrapper newcrush;
7131 _get_pending_crush(newcrush);
7132
7133 if (newcrush.name_exists(name)) {
7134 ss << "bucket '" << name << "' already exists";
7135 goto update;
7136 }
7137 int type = newcrush.get_type_id(typestr);
7138 if (type < 0) {
7139 ss << "type '" << typestr << "' does not exist";
7140 err = -EINVAL;
7141 goto reply;
7142 }
7143 if (type == 0) {
7144 ss << "type '" << typestr << "' is for devices, not buckets";
7145 err = -EINVAL;
7146 goto reply;
7147 }
7148 int bucketno;
7149 err = newcrush.add_bucket(0, 0,
7150 CRUSH_HASH_DEFAULT, type, 0, NULL,
7151 NULL, &bucketno);
7152 if (err < 0) {
7153 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
7154 goto reply;
7155 }
7156 err = newcrush.set_item_name(bucketno, name);
7157 if (err < 0) {
7158 ss << "error setting bucket name to '" << name << "'";
7159 goto reply;
7160 }
7161
7162 pending_inc.crush.clear();
7163 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7164 ss << "added bucket " << name << " type " << typestr
7165 << " to crush map";
7166 goto update;
7167 } else if (prefix == "osd crush rename-bucket") {
7168 string srcname, dstname;
7169 cmd_getval(g_ceph_context, cmdmap, "srcname", srcname);
7170 cmd_getval(g_ceph_context, cmdmap, "dstname", dstname);
7171
7172 err = crush_rename_bucket(srcname, dstname, &ss);
7173 if (err == -EALREADY) // equivalent to success for idempotency
7174 err = 0;
7175 if (err)
7176 goto reply;
7177 else
7178 goto update;
7179 } else if (prefix == "osd crush class create") {
7180 string device_class;
7181 if (!cmd_getval(g_ceph_context, cmdmap, "class", device_class)) {
7182 err = -EINVAL; // no value!
7183 goto reply;
7184 }
224ce89b
WB
7185 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
7186 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
7187 << "luminous' before using crush device classes";
7188 err = -EPERM;
7189 goto reply;
7190 }
7c673cae
FG
7191 if (!_have_pending_crush() &&
7192 _get_stable_crush().class_exists(device_class)) {
7193 ss << "class '" << device_class << "' already exists";
7194 goto reply;
7195 }
7196
7197 CrushWrapper newcrush;
7198 _get_pending_crush(newcrush);
7199
7200 if (newcrush.class_exists(name)) {
7201 ss << "class '" << device_class << "' already exists";
7202 goto update;
7203 }
7204
7205 int class_id = newcrush.get_or_create_class_id(device_class);
7206
7207 pending_inc.crush.clear();
7208 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7209 ss << "created class " << device_class << " with id " << class_id
7210 << " to crush map";
7211 goto update;
7212
7213 } else if (prefix == "osd crush class rm") {
7214 string device_class;
7215 if (!cmd_getval(g_ceph_context, cmdmap, "class", device_class)) {
7216 err = -EINVAL; // no value!
7217 goto reply;
7218 }
224ce89b
WB
7219 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
7220 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
7221 << "luminous' before using crush device classes";
7222 err = -EPERM;
7223 goto reply;
7224 }
7c673cae
FG
7225
7226 CrushWrapper newcrush;
7227 _get_pending_crush(newcrush);
7228
7229 if (!newcrush.class_exists(device_class)) {
7230 err = -ENOENT;
7231 ss << "class '" << device_class << "' does not exist";
7232 goto reply;
7233 }
7234
7235 int class_id = newcrush.get_class_id(device_class);
7236
7237 if (newcrush.class_is_in_use(class_id)) {
7238 err = -EBUSY;
7239 ss << "class '" << device_class << "' is in use";
7240 goto reply;
7241 }
7242
7243 err = newcrush.remove_class_name(device_class);
7244 if (err < 0) {
7245 ss << "class '" << device_class << "' cannot be removed '"
7246 << cpp_strerror(err) << "'";
7247 goto reply;
7248 }
7249
7250 pending_inc.crush.clear();
7251 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7252 ss << "removed class " << device_class << " with id " << class_id
7253 << " from crush map";
7254 goto update;
224ce89b
WB
7255
7256 } else if (prefix == "osd crush class rename") {
7257 string srcname, dstname;
7258 if (!cmd_getval(g_ceph_context, cmdmap, "srcname", srcname)) {
7259 err = -EINVAL;
7260 goto reply;
7261 }
7262 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
7263 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
7264 << "luminous' before using crush device classes";
7265 err = -EPERM;
7266 goto reply;
7267 }
7268
7269 if (!cmd_getval(g_ceph_context, cmdmap, "dstname", dstname)) {
7270 err = -EINVAL;
7271 goto reply;
7272 }
7273
7274 CrushWrapper newcrush;
7275 _get_pending_crush(newcrush);
7276
7277 if (!newcrush.class_exists(srcname)) {
7278 err = -ENOENT;
7279 ss << "class '" << srcname << "' does not exist";
7280 goto reply;
7281 }
7282
7283 if (newcrush.class_exists(dstname)) {
7284 err = -EEXIST;
7285 ss << "class '" << dstname << "' already exists";
7286 goto reply;
7287 }
7288
7289 int class_id = newcrush.get_class_id(srcname);
7290
7291 if (newcrush.class_is_in_use(class_id)) {
7292 err = -EBUSY;
7293 ss << "class '" << srcname << "' is in use";
7294 goto reply;
7295 }
7296
7297 err = newcrush.rename_class(srcname, dstname);
7298 if (err < 0) {
7299 ss << "fail to rename '" << srcname << "' to '" << dstname << "':"
7300 << cpp_strerror(err);
7301 goto reply;
7302 }
7303
7304 pending_inc.crush.clear();
7305 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7306 ss << "rename class '" << srcname << "' to '" << dstname << "'";
7307 goto update;
7c673cae
FG
7308
7309 } else if (osdid_present &&
7310 (prefix == "osd crush set" || prefix == "osd crush add")) {
7311 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
7312 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
7313 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
7314
7315 if (!osdmap.exists(osdid)) {
7316 err = -ENOENT;
7317 ss << name << " does not exist. create it before updating the crush map";
7318 goto reply;
7319 }
7320
7321 double weight;
7322 if (!cmd_getval(g_ceph_context, cmdmap, "weight", weight)) {
7323 ss << "unable to parse weight value '"
7324 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
7325 err = -EINVAL;
7326 goto reply;
7327 }
7328
7329 string args;
7330 vector<string> argvec;
7331 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
7332 map<string,string> loc;
7333 CrushWrapper::parse_loc_map(argvec, &loc);
7334
7335 if (prefix == "osd crush set"
7336 && !_get_stable_crush().item_exists(osdid)) {
7337 err = -ENOENT;
7338 ss << "unable to set item id " << osdid << " name '" << name
7339 << "' weight " << weight << " at location " << loc
7340 << ": does not exist";
7341 goto reply;
7342 }
7343
7344 dout(5) << "adding/updating crush item id " << osdid << " name '"
7345 << name << "' weight " << weight << " at location "
7346 << loc << dendl;
7347 CrushWrapper newcrush;
7348 _get_pending_crush(newcrush);
7349
7350 string action;
7351 if (prefix == "osd crush set" ||
7352 newcrush.check_item_loc(g_ceph_context, osdid, loc, (int *)NULL)) {
7353 action = "set";
7354 err = newcrush.update_item(g_ceph_context, osdid, weight, name, loc);
7355 } else {
7356 action = "add";
7357 err = newcrush.insert_item(g_ceph_context, osdid, weight, name, loc);
7358 if (err == 0)
7359 err = 1;
7360 }
7361
7362 if (err < 0)
7363 goto reply;
7364
7365 if (err == 0 && !_have_pending_crush()) {
7366 ss << action << " item id " << osdid << " name '" << name << "' weight "
7367 << weight << " at location " << loc << ": no change";
7368 goto reply;
7369 }
7370
7371 pending_inc.crush.clear();
7372 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7373 ss << action << " item id " << osdid << " name '" << name << "' weight "
7374 << weight << " at location " << loc << " to crush map";
7375 getline(ss, rs);
7376 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7377 get_last_committed() + 1));
7378 return true;
7379
7380 } else if (prefix == "osd crush create-or-move") {
7381 do {
7382 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
7383 if (!osdmap.exists(osdid)) {
7384 err = -ENOENT;
7385 ss << name << " does not exist. create it before updating the crush map";
7386 goto reply;
7387 }
7388
7389 double weight;
7390 if (!cmd_getval(g_ceph_context, cmdmap, "weight", weight)) {
7391 ss << "unable to parse weight value '"
7392 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
7393 err = -EINVAL;
7394 goto reply;
7395 }
7396
7397 string args;
7398 vector<string> argvec;
7399 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
7400 map<string,string> loc;
7401 CrushWrapper::parse_loc_map(argvec, &loc);
7402
7403 dout(0) << "create-or-move crush item name '" << name << "' initial_weight " << weight
7404 << " at location " << loc << dendl;
7405
7406 CrushWrapper newcrush;
7407 _get_pending_crush(newcrush);
7408
7409 err = newcrush.create_or_move_item(g_ceph_context, osdid, weight, name, loc);
7410 if (err == 0) {
7411 ss << "create-or-move updated item name '" << name << "' weight " << weight
7412 << " at location " << loc << " to crush map";
7413 break;
7414 }
7415 if (err > 0) {
7416 pending_inc.crush.clear();
7417 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7418 ss << "create-or-move updating item name '" << name << "' weight " << weight
7419 << " at location " << loc << " to crush map";
7420 getline(ss, rs);
7421 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7422 get_last_committed() + 1));
7423 return true;
7424 }
7425 } while (false);
7426
7427 } else if (prefix == "osd crush move") {
7428 do {
7429 // osd crush move <name> <loc1> [<loc2> ...]
7430
7431 string args;
7432 vector<string> argvec;
7433 cmd_getval(g_ceph_context, cmdmap, "name", name);
7434 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
7435 map<string,string> loc;
7436 CrushWrapper::parse_loc_map(argvec, &loc);
7437
7438 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
7439 CrushWrapper newcrush;
7440 _get_pending_crush(newcrush);
7441
7442 if (!newcrush.name_exists(name)) {
7443 err = -ENOENT;
7444 ss << "item " << name << " does not exist";
7445 break;
7446 }
7447 int id = newcrush.get_item_id(name);
7448
7449 if (!newcrush.check_item_loc(g_ceph_context, id, loc, (int *)NULL)) {
7450 if (id >= 0) {
7451 err = newcrush.create_or_move_item(g_ceph_context, id, 0, name, loc);
7452 } else {
7453 err = newcrush.move_bucket(g_ceph_context, id, loc);
7454 }
7455 if (err >= 0) {
7456 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
7457 pending_inc.crush.clear();
7458 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7459 getline(ss, rs);
7460 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7461 get_last_committed() + 1));
7462 return true;
7463 }
7464 } else {
7465 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
7466 err = 0;
7467 }
7468 } while (false);
31f18b77
FG
7469 } else if (prefix == "osd crush swap-bucket") {
7470 string source, dest, force;
7471 cmd_getval(g_ceph_context, cmdmap, "source", source);
7472 cmd_getval(g_ceph_context, cmdmap, "dest", dest);
7473 cmd_getval(g_ceph_context, cmdmap, "force", force);
7474 CrushWrapper newcrush;
7475 _get_pending_crush(newcrush);
7476 if (!newcrush.name_exists(source)) {
7477 ss << "source item " << source << " does not exist";
7478 err = -ENOENT;
7479 goto reply;
7480 }
7481 if (!newcrush.name_exists(dest)) {
7482 ss << "dest item " << dest << " does not exist";
7483 err = -ENOENT;
7484 goto reply;
7485 }
7486 int sid = newcrush.get_item_id(source);
7487 int did = newcrush.get_item_id(dest);
7488 int sparent;
7489 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 &&
7490 force != "--yes-i-really-mean-it") {
7491 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
7492 err = -EPERM;
7493 goto reply;
7494 }
7495 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
7496 force != "--yes-i-really-mean-it") {
7497 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
7498 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
7499 << "; pass --yes-i-really-mean-it to proceed anyway";
7500 err = -EPERM;
7501 goto reply;
7502 }
7503 int r = newcrush.swap_bucket(g_ceph_context, sid, did);
7504 if (r < 0) {
7505 ss << "failed to swap bucket contents: " << cpp_strerror(r);
224ce89b 7506 err = r;
31f18b77
FG
7507 goto reply;
7508 }
7509 ss << "swapped bucket of " << source << " to " << dest;
7510 pending_inc.crush.clear();
7511 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7512 wait_for_finished_proposal(op,
7513 new Monitor::C_Command(mon, op, err, ss.str(),
7514 get_last_committed() + 1));
7515 return true;
7516 } else if (prefix == "osd crush link") {
7517 // osd crush link <name> <loc1> [<loc2> ...]
7518 string name;
7519 cmd_getval(g_ceph_context, cmdmap, "name", name);
7520 vector<string> argvec;
7521 cmd_getval(g_ceph_context, cmdmap, "args", argvec);
7522 map<string,string> loc;
7523 CrushWrapper::parse_loc_map(argvec, &loc);
7524
7525 // Need an explicit check for name_exists because get_item_id returns
7526 // 0 on unfound.
7527 int id = osdmap.crush->get_item_id(name);
7c673cae
FG
7528 if (!osdmap.crush->name_exists(name)) {
7529 err = -ENOENT;
7530 ss << "item " << name << " does not exist";
7531 goto reply;
7532 } else {
7533 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
7534 }
7535 if (osdmap.crush->check_item_loc(g_ceph_context, id, loc, (int*) NULL)) {
7536 ss << "no need to move item id " << id << " name '" << name
7537 << "' to location " << loc << " in crush map";
7538 err = 0;
7539 goto reply;
7540 }
7541
7542 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
7543 CrushWrapper newcrush;
7544 _get_pending_crush(newcrush);
7545
7546 if (!newcrush.name_exists(name)) {
7547 err = -ENOENT;
7548 ss << "item " << name << " does not exist";
7549 goto reply;
7550 } else {
7551 int id = newcrush.get_item_id(name);
7552 if (!newcrush.check_item_loc(g_ceph_context, id, loc, (int *)NULL)) {
7553 err = newcrush.link_bucket(g_ceph_context, id, loc);
7554 if (err >= 0) {
7555 ss << "linked item id " << id << " name '" << name
7556 << "' to location " << loc << " in crush map";
7557 pending_inc.crush.clear();
7558 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7559 } else {
7560 ss << "cannot link item id " << id << " name '" << name
7561 << "' to location " << loc;
7562 goto reply;
7563 }
7564 } else {
7565 ss << "no need to move item id " << id << " name '" << name
7566 << "' to location " << loc << " in crush map";
7567 err = 0;
7568 }
7569 }
7570 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
7571 get_last_committed() + 1));
7572 return true;
7573 } else if (prefix == "osd crush rm" ||
7574 prefix == "osd crush remove" ||
7575 prefix == "osd crush unlink") {
7576 do {
7577 // osd crush rm <id> [ancestor]
7578 CrushWrapper newcrush;
7579 _get_pending_crush(newcrush);
7580
7581 string name;
7582 cmd_getval(g_ceph_context, cmdmap, "name", name);
7583
7584 if (!osdmap.crush->name_exists(name)) {
7585 err = 0;
7586 ss << "device '" << name << "' does not appear in the crush map";
7587 break;
7588 }
7589 if (!newcrush.name_exists(name)) {
7590 err = 0;
7591 ss << "device '" << name << "' does not appear in the crush map";
7592 getline(ss, rs);
7593 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7594 get_last_committed() + 1));
7595 return true;
7596 }
7597 int id = newcrush.get_item_id(name);
31f18b77
FG
7598 int ancestor = 0;
7599
7c673cae
FG
7600 bool unlink_only = prefix == "osd crush unlink";
7601 string ancestor_str;
7602 if (cmd_getval(g_ceph_context, cmdmap, "ancestor", ancestor_str)) {
7603 if (!newcrush.name_exists(ancestor_str)) {
7604 err = -ENOENT;
7605 ss << "ancestor item '" << ancestor_str
7606 << "' does not appear in the crush map";
7607 break;
7608 }
31f18b77 7609 ancestor = newcrush.get_item_id(ancestor_str);
7c673cae 7610 }
31f18b77
FG
7611
7612 err = prepare_command_osd_crush_remove(
7613 newcrush,
7614 id, ancestor,
7615 (ancestor < 0), unlink_only);
7616
7c673cae
FG
7617 if (err == -ENOENT) {
7618 ss << "item " << id << " does not appear in that position";
7619 err = 0;
7620 break;
7621 }
7622 if (err == 0) {
7c673cae
FG
7623 ss << "removed item id " << id << " name '" << name << "' from crush map";
7624 getline(ss, rs);
7625 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7626 get_last_committed() + 1));
7627 return true;
7628 }
7629 } while (false);
7630
7631 } else if (prefix == "osd crush reweight-all") {
7c673cae
FG
7632 CrushWrapper newcrush;
7633 _get_pending_crush(newcrush);
7634
7635 newcrush.reweight(g_ceph_context);
7636 pending_inc.crush.clear();
7637 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7638 ss << "reweighted crush hierarchy";
7639 getline(ss, rs);
7640 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7641 get_last_committed() + 1));
7642 return true;
7643 } else if (prefix == "osd crush reweight") {
7644 // osd crush reweight <name> <weight>
7645 CrushWrapper newcrush;
7646 _get_pending_crush(newcrush);
7647
7648 string name;
7649 cmd_getval(g_ceph_context, cmdmap, "name", name);
7650 if (!newcrush.name_exists(name)) {
7651 err = -ENOENT;
7652 ss << "device '" << name << "' does not appear in the crush map";
7653 goto reply;
7654 }
7655
7656 int id = newcrush.get_item_id(name);
7657 if (id < 0) {
7658 ss << "device '" << name << "' is not a leaf in the crush map";
7659 err = -EINVAL;
7660 goto reply;
7661 }
7662 double w;
7663 if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
7664 ss << "unable to parse weight value '"
7665 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
7666 err = -EINVAL;
7667 goto reply;
7668 }
7669
7670 err = newcrush.adjust_item_weightf(g_ceph_context, id, w);
7671 if (err < 0)
7672 goto reply;
7673 pending_inc.crush.clear();
7674 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7675 ss << "reweighted item id " << id << " name '" << name << "' to " << w
7676 << " in crush map";
7677 getline(ss, rs);
7678 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7679 get_last_committed() + 1));
7680 return true;
7681 } else if (prefix == "osd crush reweight-subtree") {
7682 // osd crush reweight <name> <weight>
7683 CrushWrapper newcrush;
7684 _get_pending_crush(newcrush);
7685
7686 string name;
7687 cmd_getval(g_ceph_context, cmdmap, "name", name);
7688 if (!newcrush.name_exists(name)) {
7689 err = -ENOENT;
7690 ss << "device '" << name << "' does not appear in the crush map";
7691 goto reply;
7692 }
7693
7694 int id = newcrush.get_item_id(name);
7695 if (id >= 0) {
7696 ss << "device '" << name << "' is not a subtree in the crush map";
7697 err = -EINVAL;
7698 goto reply;
7699 }
7700 double w;
7701 if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
7702 ss << "unable to parse weight value '"
7703 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
7704 err = -EINVAL;
7705 goto reply;
7706 }
7707
7708 err = newcrush.adjust_subtree_weightf(g_ceph_context, id, w);
7709 if (err < 0)
7710 goto reply;
7711 pending_inc.crush.clear();
7712 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7713 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
7714 << " in crush map";
7715 getline(ss, rs);
7716 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7717 get_last_committed() + 1));
7718 return true;
7719 } else if (prefix == "osd crush tunables") {
7720 CrushWrapper newcrush;
7721 _get_pending_crush(newcrush);
7722
7723 err = 0;
7724 string profile;
7725 cmd_getval(g_ceph_context, cmdmap, "profile", profile);
7726 if (profile == "legacy" || profile == "argonaut") {
7727 newcrush.set_tunables_legacy();
7728 } else if (profile == "bobtail") {
7729 newcrush.set_tunables_bobtail();
7730 } else if (profile == "firefly") {
7731 newcrush.set_tunables_firefly();
7732 } else if (profile == "hammer") {
7733 newcrush.set_tunables_hammer();
7734 } else if (profile == "jewel") {
7735 newcrush.set_tunables_jewel();
7736 } else if (profile == "optimal") {
7737 newcrush.set_tunables_optimal();
7738 } else if (profile == "default") {
7739 newcrush.set_tunables_default();
7740 } else {
7741 ss << "unrecognized profile '" << profile << "'";
7742 err = -EINVAL;
7743 goto reply;
7744 }
7745
7746 if (!validate_crush_against_features(&newcrush, ss)) {
7747 err = -EINVAL;
7748 goto reply;
7749 }
7750
7751 pending_inc.crush.clear();
7752 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7753 ss << "adjusted tunables profile to " << profile;
7754 getline(ss, rs);
7755 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7756 get_last_committed() + 1));
7757 return true;
7758 } else if (prefix == "osd crush set-tunable") {
7759 CrushWrapper newcrush;
7760 _get_pending_crush(newcrush);
7761
7762 err = 0;
7763 string tunable;
7764 cmd_getval(g_ceph_context, cmdmap, "tunable", tunable);
7765
7766 int64_t value = -1;
7767 if (!cmd_getval(g_ceph_context, cmdmap, "value", value)) {
7768 err = -EINVAL;
7769 ss << "failed to parse integer value " << cmd_vartype_stringify(cmdmap["value"]);
7770 goto reply;
7771 }
7772
7773 if (tunable == "straw_calc_version") {
224ce89b 7774 if (value != 0 && value != 1) {
7c673cae
FG
7775 ss << "value must be 0 or 1; got " << value;
7776 err = -EINVAL;
7777 goto reply;
7778 }
7779 newcrush.set_straw_calc_version(value);
7780 } else {
7781 ss << "unrecognized tunable '" << tunable << "'";
7782 err = -EINVAL;
7783 goto reply;
7784 }
7785
7786 if (!validate_crush_against_features(&newcrush, ss)) {
7787 err = -EINVAL;
7788 goto reply;
7789 }
7790
7791 pending_inc.crush.clear();
7792 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7793 ss << "adjusted tunable " << tunable << " to " << value;
7794 getline(ss, rs);
7795 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7796 get_last_committed() + 1));
7797 return true;
7798
7799 } else if (prefix == "osd crush rule create-simple") {
7800 string name, root, type, mode;
7801 cmd_getval(g_ceph_context, cmdmap, "name", name);
7802 cmd_getval(g_ceph_context, cmdmap, "root", root);
7803 cmd_getval(g_ceph_context, cmdmap, "type", type);
7804 cmd_getval(g_ceph_context, cmdmap, "mode", mode);
7805 if (mode == "")
7806 mode = "firstn";
7807
7808 if (osdmap.crush->rule_exists(name)) {
31f18b77
FG
7809 // The name is uniquely associated to a ruleid and the rule it contains
7810 // From the user point of view, the rule is more meaningfull.
7811 ss << "rule " << name << " already exists";
7c673cae
FG
7812 err = 0;
7813 goto reply;
7814 }
7815
7816 CrushWrapper newcrush;
7817 _get_pending_crush(newcrush);
7818
7819 if (newcrush.rule_exists(name)) {
31f18b77
FG
7820 // The name is uniquely associated to a ruleid and the rule it contains
7821 // From the user point of view, the rule is more meaningfull.
7822 ss << "rule " << name << " already exists";
7c673cae
FG
7823 err = 0;
7824 } else {
224ce89b 7825 int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
7c673cae
FG
7826 pg_pool_t::TYPE_REPLICATED, &ss);
7827 if (ruleno < 0) {
7828 err = ruleno;
7829 goto reply;
7830 }
7831
7832 pending_inc.crush.clear();
7833 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7834 }
7835 getline(ss, rs);
7836 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7837 get_last_committed() + 1));
7838 return true;
7839
224ce89b
WB
7840 } else if (prefix == "osd crush rule create-replicated") {
7841 string name, root, type, device_class;
7842 cmd_getval(g_ceph_context, cmdmap, "name", name);
7843 cmd_getval(g_ceph_context, cmdmap, "root", root);
7844 cmd_getval(g_ceph_context, cmdmap, "type", type);
7845 cmd_getval(g_ceph_context, cmdmap, "class", device_class);
7846
7847 if (!device_class.empty()) {
7848 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
7849 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
7850 << "luminous' before using crush device classes";
7851 err = -EPERM;
7852 goto reply;
7853 }
7854 }
7855
7856 if (osdmap.crush->rule_exists(name)) {
7857 // The name is uniquely associated to a ruleid and the rule it contains
7858 // From the user point of view, the rule is more meaningfull.
7859 ss << "rule " << name << " already exists";
7860 err = 0;
7861 goto reply;
7862 }
7863
7864 CrushWrapper newcrush;
7865 _get_pending_crush(newcrush);
7866
7867 if (newcrush.rule_exists(name)) {
7868 // The name is uniquely associated to a ruleid and the rule it contains
7869 // From the user point of view, the rule is more meaningfull.
7870 ss << "rule " << name << " already exists";
7871 err = 0;
7872 } else {
7873 int ruleno = newcrush.add_simple_rule(
7874 name, root, type, device_class,
7875 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
7876 if (ruleno < 0) {
7877 err = ruleno;
7878 goto reply;
7879 }
7880
7881 pending_inc.crush.clear();
7882 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7883 }
7884 getline(ss, rs);
7885 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7886 get_last_committed() + 1));
7887 return true;
7888
7c673cae
FG
7889 } else if (prefix == "osd erasure-code-profile rm") {
7890 string name;
7891 cmd_getval(g_ceph_context, cmdmap, "name", name);
7892
7893 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
7894 goto wait;
7895
7896 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
7897 err = -EBUSY;
7898 goto reply;
7899 }
7900
7901 if (osdmap.has_erasure_code_profile(name) ||
7902 pending_inc.new_erasure_code_profiles.count(name)) {
7903 if (osdmap.has_erasure_code_profile(name)) {
7904 pending_inc.old_erasure_code_profiles.push_back(name);
7905 } else {
7906 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
7907 pending_inc.new_erasure_code_profiles.erase(name);
7908 }
7909
7910 getline(ss, rs);
7911 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7912 get_last_committed() + 1));
7913 return true;
7914 } else {
7915 ss << "erasure-code-profile " << name << " does not exist";
7916 err = 0;
7917 goto reply;
7918 }
7919
7920 } else if (prefix == "osd erasure-code-profile set") {
7921 string name;
7922 cmd_getval(g_ceph_context, cmdmap, "name", name);
7923 vector<string> profile;
7924 cmd_getval(g_ceph_context, cmdmap, "profile", profile);
7925 bool force;
7926 if (profile.size() > 0 && profile.back() == "--force") {
7927 profile.pop_back();
7928 force = true;
7929 } else {
7930 force = false;
7931 }
7932 map<string,string> profile_map;
7933 err = parse_erasure_code_profile(profile, &profile_map, &ss);
7934 if (err)
7935 goto reply;
7936 if (profile_map.find("plugin") == profile_map.end()) {
7937 ss << "erasure-code-profile " << profile_map
7938 << " must contain a plugin entry" << std::endl;
7939 err = -EINVAL;
7940 goto reply;
7941 }
7942 string plugin = profile_map["plugin"];
7943
7944 if (pending_inc.has_erasure_code_profile(name)) {
7945 dout(20) << "erasure code profile " << name << " try again" << dendl;
7946 goto wait;
7947 } else {
7948 if (plugin == "isa" || plugin == "lrc") {
7949 err = check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2, ss);
7950 if (err == -EAGAIN)
7951 goto wait;
7952 if (err)
7953 goto reply;
7954 } else if (plugin == "shec") {
7955 err = check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3, ss);
7956 if (err == -EAGAIN)
7957 goto wait;
7958 if (err)
7959 goto reply;
7960 }
7961 err = normalize_profile(name, profile_map, force, &ss);
7962 if (err)
7963 goto reply;
7964
7965 if (osdmap.has_erasure_code_profile(name)) {
7966 ErasureCodeProfile existing_profile_map =
7967 osdmap.get_erasure_code_profile(name);
7968 err = normalize_profile(name, existing_profile_map, force, &ss);
7969 if (err)
7970 goto reply;
7971
7972 if (existing_profile_map == profile_map) {
7973 err = 0;
7974 goto reply;
7975 }
7976 if (!force) {
7977 err = -EPERM;
7978 ss << "will not override erasure code profile " << name
7979 << " because the existing profile "
7980 << existing_profile_map
7981 << " is different from the proposed profile "
7982 << profile_map;
7983 goto reply;
7984 }
7985 }
7986
7987 dout(20) << "erasure code profile set " << name << "="
7988 << profile_map << dendl;
7989 pending_inc.set_erasure_code_profile(name, profile_map);
7990 }
7991
7992 getline(ss, rs);
7993 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
7994 get_last_committed() + 1));
7995 return true;
7996
7997 } else if (prefix == "osd crush rule create-erasure") {
7998 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
7999 if (err == -EAGAIN)
8000 goto wait;
8001 if (err)
8002 goto reply;
8003 string name, poolstr;
8004 cmd_getval(g_ceph_context, cmdmap, "name", name);
8005 string profile;
8006 cmd_getval(g_ceph_context, cmdmap, "profile", profile);
8007 if (profile == "")
8008 profile = "default";
8009 if (profile == "default") {
8010 if (!osdmap.has_erasure_code_profile(profile)) {
8011 if (pending_inc.has_erasure_code_profile(profile)) {
8012 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
8013 goto wait;
8014 }
8015
8016 map<string,string> profile_map;
8017 err = osdmap.get_erasure_code_profile_default(g_ceph_context,
8018 profile_map,
8019 &ss);
8020 if (err)
8021 goto reply;
8022 err = normalize_profile(name, profile_map, true, &ss);
8023 if (err)
8024 goto reply;
8025 dout(20) << "erasure code profile set " << profile << "="
8026 << profile_map << dendl;
8027 pending_inc.set_erasure_code_profile(profile, profile_map);
8028 goto wait;
8029 }
8030 }
8031
31f18b77
FG
8032 int rule;
8033 err = crush_rule_create_erasure(name, profile, &rule, &ss);
7c673cae
FG
8034 if (err < 0) {
8035 switch(err) {
8036 case -EEXIST: // return immediately
8037 ss << "rule " << name << " already exists";
8038 err = 0;
8039 goto reply;
8040 break;
8041 case -EALREADY: // wait for pending to be proposed
8042 ss << "rule " << name << " already exists";
8043 err = 0;
8044 break;
8045 default: // non recoverable error
8046 goto reply;
8047 break;
8048 }
8049 } else {
31f18b77 8050 ss << "created rule " << name << " at " << rule;
7c673cae
FG
8051 }
8052
8053 getline(ss, rs);
8054 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8055 get_last_committed() + 1));
8056 return true;
8057
8058 } else if (prefix == "osd crush rule rm") {
8059 string name;
8060 cmd_getval(g_ceph_context, cmdmap, "name", name);
8061
8062 if (!osdmap.crush->rule_exists(name)) {
8063 ss << "rule " << name << " does not exist";
8064 err = 0;
8065 goto reply;
8066 }
8067
8068 CrushWrapper newcrush;
8069 _get_pending_crush(newcrush);
8070
8071 if (!newcrush.rule_exists(name)) {
8072 ss << "rule " << name << " does not exist";
8073 err = 0;
8074 } else {
8075 int ruleno = newcrush.get_rule_id(name);
8076 assert(ruleno >= 0);
8077
8078 // make sure it is not in use.
8079 // FIXME: this is ok in some situations, but let's not bother with that
8080 // complexity now.
8081 int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
8082 if (osdmap.crush_ruleset_in_use(ruleset)) {
8083 ss << "crush ruleset " << name << " " << ruleset << " is in use";
8084 err = -EBUSY;
8085 goto reply;
8086 }
8087
8088 err = newcrush.remove_rule(ruleno);
8089 if (err < 0) {
8090 goto reply;
8091 }
8092
8093 pending_inc.crush.clear();
8094 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8095 }
8096 getline(ss, rs);
8097 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8098 get_last_committed() + 1));
8099 return true;
8100
8101 } else if (prefix == "osd setmaxosd") {
8102 int64_t newmax;
8103 if (!cmd_getval(g_ceph_context, cmdmap, "newmax", newmax)) {
8104 ss << "unable to parse 'newmax' value '"
8105 << cmd_vartype_stringify(cmdmap["newmax"]) << "'";
8106 err = -EINVAL;
8107 goto reply;
8108 }
8109
8110 if (newmax > g_conf->mon_max_osd) {
8111 err = -ERANGE;
8112 ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
8113 << g_conf->mon_max_osd << ")";
8114 goto reply;
8115 }
8116
8117 // Don't allow shrinking OSD number as this will cause data loss
8118 // and may cause kernel crashes.
8119 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
8120 if (newmax < osdmap.get_max_osd()) {
8121 // Check if the OSDs exist between current max and new value.
8122 // If there are any OSDs exist, then don't allow shrinking number
8123 // of OSDs.
8124 for (int i = newmax; i < osdmap.get_max_osd(); i++) {
8125 if (osdmap.exists(i)) {
8126 err = -EBUSY;
8127 ss << "cannot shrink max_osd to " << newmax
8128 << " because osd." << i << " (and possibly others) still in use";
8129 goto reply;
8130 }
8131 }
8132 }
8133
8134 pending_inc.new_max_osd = newmax;
8135 ss << "set new max_osd = " << pending_inc.new_max_osd;
8136 getline(ss, rs);
8137 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8138 get_last_committed() + 1));
8139 return true;
8140
8141 } else if (prefix == "osd set-full-ratio" ||
8142 prefix == "osd set-backfillfull-ratio" ||
8143 prefix == "osd set-nearfull-ratio") {
31f18b77 8144 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
224ce89b
WB
8145 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
8146 << "luminous' before using the new interface";
7c673cae
FG
8147 err = -EPERM;
8148 goto reply;
8149 }
8150 double n;
8151 if (!cmd_getval(g_ceph_context, cmdmap, "ratio", n)) {
8152 ss << "unable to parse 'ratio' value '"
224ce89b 8153 << cmd_vartype_stringify(cmdmap["ratio"]) << "'";
7c673cae
FG
8154 err = -EINVAL;
8155 goto reply;
8156 }
8157 if (prefix == "osd set-full-ratio")
8158 pending_inc.new_full_ratio = n;
8159 else if (prefix == "osd set-backfillfull-ratio")
8160 pending_inc.new_backfillfull_ratio = n;
8161 else if (prefix == "osd set-nearfull-ratio")
8162 pending_inc.new_nearfull_ratio = n;
8163 ss << prefix << " " << n;
8164 getline(ss, rs);
8165 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8166 get_last_committed() + 1));
8167 return true;
8168 } else if (prefix == "osd set-require-min-compat-client") {
31f18b77 8169 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
224ce89b
WB
8170 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
8171 << "luminous' before using the new interface";
7c673cae
FG
8172 err = -EPERM;
8173 goto reply;
8174 }
8175 string v;
8176 cmd_getval(g_ceph_context, cmdmap, "version", v);
31f18b77
FG
8177 int vno = ceph_release_from_name(v.c_str());
8178 if (vno <= 0) {
7c673cae
FG
8179 ss << "version " << v << " is not recognized";
8180 err = -EINVAL;
8181 goto reply;
8182 }
8183 OSDMap newmap;
8184 newmap.deepish_copy_from(osdmap);
8185 newmap.apply_incremental(pending_inc);
31f18b77
FG
8186 newmap.require_min_compat_client = vno;
8187 auto mvno = newmap.get_min_compat_client();
8188 if (vno < mvno) {
8189 ss << "osdmap current utilizes features that require "
8190 << ceph_release_name(mvno)
8191 << "; cannot set require_min_compat_client below that to "
8192 << ceph_release_name(vno);
7c673cae
FG
8193 err = -EPERM;
8194 goto reply;
8195 }
31f18b77
FG
8196 string sure;
8197 cmd_getval(g_ceph_context, cmdmap, "sure", sure);
8198 if (sure != "--yes-i-really-mean-it") {
8199 FeatureMap m;
8200 mon->get_combined_feature_map(&m);
8201 uint64_t features = ceph_release_features(vno);
8202 bool first = true;
8203 bool ok = true;
8204 for (int type : {
8205 CEPH_ENTITY_TYPE_CLIENT,
8206 CEPH_ENTITY_TYPE_MDS,
8207 CEPH_ENTITY_TYPE_MGR }) {
8208 auto p = m.m.find(type);
8209 if (p == m.m.end()) {
8210 continue;
8211 }
8212 for (auto& q : p->second) {
8213 uint64_t missing = ~q.first & features;
8214 if (missing) {
8215 if (first) {
8216 ss << "cannot set require_min_compat_client to " << v << ": ";
8217 } else {
8218 ss << "; ";
8219 }
8220 first = false;
8221 ss << q.second << " connected " << ceph_entity_type_name(type)
8222 << "(s) look like " << ceph_release_name(
8223 ceph_release_from_features(q.first))
8224 << " (missing 0x" << std::hex << missing << std::dec << ")";
8225 ok = false;
8226 }
8227 }
8228 }
8229 if (!ok) {
8230 ss << "; add --yes-i-really-mean-it to do it anyway";
8231 err = -EPERM;
8232 goto reply;
8233 }
8234 }
8235 ss << "set require_min_compat_client to " << ceph_release_name(vno);
8236 pending_inc.new_require_min_compat_client = vno;
7c673cae
FG
8237 getline(ss, rs);
8238 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8239 get_last_committed() + 1));
8240 return true;
8241 } else if (prefix == "osd pause") {
8242 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
8243
8244 } else if (prefix == "osd unpause") {
8245 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
8246
8247 } else if (prefix == "osd set") {
8248 string key;
8249 cmd_getval(g_ceph_context, cmdmap, "key", key);
8250 if (key == "full")
8251 return prepare_set_flag(op, CEPH_OSDMAP_FULL);
8252 else if (key == "pause")
8253 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
8254 else if (key == "noup")
8255 return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
8256 else if (key == "nodown")
8257 return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
8258 else if (key == "noout")
8259 return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
8260 else if (key == "noin")
8261 return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
8262 else if (key == "nobackfill")
8263 return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
8264 else if (key == "norebalance")
8265 return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
8266 else if (key == "norecover")
8267 return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
8268 else if (key == "noscrub")
8269 return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
8270 else if (key == "nodeep-scrub")
8271 return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
8272 else if (key == "notieragent")
8273 return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
8274 else if (key == "sortbitwise") {
8275 if (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT) {
8276 return prepare_set_flag(op, CEPH_OSDMAP_SORTBITWISE);
8277 } else {
8278 ss << "not all up OSDs have OSD_BITWISE_HOBJ_SORT feature";
8279 err = -EPERM;
31f18b77 8280 goto reply;
7c673cae
FG
8281 }
8282 } else if (key == "require_jewel_osds") {
8283 if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
8284 ss << "the sortbitwise flag must be set before require_jewel_osds";
8285 err = -EPERM;
31f18b77
FG
8286 goto reply;
8287 } else if (osdmap.require_osd_release >= CEPH_RELEASE_JEWEL) {
8288 ss << "require_osd_release is already >= jewel";
8289 err = 0;
8290 goto reply;
7c673cae
FG
8291 } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_JEWEL)) {
8292 return prepare_set_flag(op, CEPH_OSDMAP_REQUIRE_JEWEL);
8293 } else {
8294 ss << "not all up OSDs have CEPH_FEATURE_SERVER_JEWEL feature";
8295 err = -EPERM;
8296 }
8297 } else if (key == "require_kraken_osds") {
8298 if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
8299 ss << "the sortbitwise flag must be set before require_kraken_osds";
8300 err = -EPERM;
31f18b77
FG
8301 goto reply;
8302 } else if (osdmap.require_osd_release >= CEPH_RELEASE_KRAKEN) {
8303 ss << "require_osd_release is already >= kraken";
8304 err = 0;
8305 goto reply;
7c673cae
FG
8306 } else if (HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_KRAKEN)) {
8307 bool r = prepare_set_flag(op, CEPH_OSDMAP_REQUIRE_KRAKEN);
8308 // ensure JEWEL is also set
8309 pending_inc.new_flags |= CEPH_OSDMAP_REQUIRE_JEWEL;
8310 return r;
8311 } else {
8312 ss << "not all up OSDs have CEPH_FEATURE_SERVER_KRAKEN feature";
8313 err = -EPERM;
8314 }
7c673cae
FG
8315 } else {
8316 ss << "unrecognized flag '" << key << "'";
8317 err = -EINVAL;
8318 }
8319
8320 } else if (prefix == "osd unset") {
8321 string key;
8322 cmd_getval(g_ceph_context, cmdmap, "key", key);
8323 if (key == "full")
8324 return prepare_unset_flag(op, CEPH_OSDMAP_FULL);
8325 else if (key == "pause")
8326 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
8327 else if (key == "noup")
8328 return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
8329 else if (key == "nodown")
8330 return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
8331 else if (key == "noout")
8332 return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
8333 else if (key == "noin")
8334 return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
8335 else if (key == "nobackfill")
8336 return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
8337 else if (key == "norebalance")
8338 return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
8339 else if (key == "norecover")
8340 return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
8341 else if (key == "noscrub")
8342 return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
8343 else if (key == "nodeep-scrub")
8344 return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
8345 else if (key == "notieragent")
8346 return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
224ce89b 8347 else {
7c673cae
FG
8348 ss << "unrecognized flag '" << key << "'";
8349 err = -EINVAL;
8350 }
8351
31f18b77
FG
8352 } else if (prefix == "osd require-osd-release") {
8353 string release;
8354 cmd_getval(g_ceph_context, cmdmap, "release", release);
8355 if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
8356 ss << "the sortbitwise flag must be set first";
8357 err = -EPERM;
8358 goto reply;
8359 }
8360 int rel = ceph_release_from_name(release.c_str());
8361 if (rel <= 0) {
8362 ss << "unrecognized release " << release;
8363 err = -EINVAL;
8364 goto reply;
8365 }
8366 if (rel < CEPH_RELEASE_LUMINOUS) {
8367 ss << "use this command only for luminous and later";
8368 err = -EINVAL;
8369 goto reply;
8370 }
8371 if (rel == CEPH_RELEASE_LUMINOUS) {
8372 if (!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_LUMINOUS)) {
8373 ss << "not all up OSDs have CEPH_FEATURE_SERVER_LUMINOUS feature";
8374 err = -EPERM;
8375 goto reply;
8376 }
8377 } else {
8378 ss << "not supported for this release yet";
8379 err = -EPERM;
8380 goto reply;
8381 }
8382 if (rel < osdmap.require_osd_release) {
8383 ss << "require_osd_release cannot be lowered once it has been set";
8384 err = -EPERM;
8385 goto reply;
8386 }
8387 pending_inc.new_require_osd_release = rel;
8388 goto update;
7c673cae
FG
8389 } else if (prefix == "osd cluster_snap") {
8390 // ** DISABLE THIS FOR NOW **
8391 ss << "cluster snapshot currently disabled (broken implementation)";
8392 // ** DISABLE THIS FOR NOW **
8393
8394 } else if (prefix == "osd down" ||
8395 prefix == "osd out" ||
8396 prefix == "osd in" ||
8397 prefix == "osd rm") {
8398
8399 bool any = false;
31f18b77
FG
8400 bool stop = false;
8401 bool verbose = true;
7c673cae
FG
8402
8403 vector<string> idvec;
8404 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
31f18b77
FG
8405 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
8406 set<int> osds;
8407
8408 // wildcard?
8409 if (j == 0 &&
8410 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
8411 if (prefix == "osd in") {
8412 // touch out osds only
8413 osdmap.get_out_osds(osds);
8414 } else {
8415 osdmap.get_all_osds(osds);
8416 }
8417 stop = true;
8418 verbose = false; // so the output is less noisy.
8419 } else {
8420 long osd = parse_osd_id(idvec[j].c_str(), &ss);
8421 if (osd < 0) {
8422 ss << "invalid osd id" << osd;
8423 err = -EINVAL;
8424 continue;
8425 } else if (!osdmap.exists(osd)) {
8426 ss << "osd." << osd << " does not exist. ";
8427 continue;
8428 }
8429
8430 osds.insert(osd);
7c673cae 8431 }
31f18b77
FG
8432
8433 for (auto &osd : osds) {
8434 if (prefix == "osd down") {
8435 if (osdmap.is_down(osd)) {
8436 if (verbose)
8437 ss << "osd." << osd << " is already down. ";
8438 } else {
8439 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
8440 ss << "marked down osd." << osd << ". ";
8441 any = true;
8442 }
8443 } else if (prefix == "osd out") {
8444 if (osdmap.is_out(osd)) {
8445 if (verbose)
8446 ss << "osd." << osd << " is already out. ";
8447 } else {
8448 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
8449 if (osdmap.osd_weight[osd]) {
8450 if (pending_inc.new_xinfo.count(osd) == 0) {
8451 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
8452 }
8453 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
7c673cae 8454 }
31f18b77 8455 ss << "marked out osd." << osd << ". ";
224ce89b
WB
8456 std::ostringstream msg;
8457 msg << "Client " << op->get_session()->entity_name
8458 << " marked osd." << osd << " out";
8459 if (osdmap.is_up(osd)) {
8460 msg << ", while it was still marked up";
8461 } else {
8462 msg << ", after it was down for " << int(down_pending_out[osd].sec())
8463 << " seconds";
8464 }
8465
8466 mon->clog->info() << msg.str();
31f18b77 8467 any = true;
7c673cae 8468 }
31f18b77
FG
8469 } else if (prefix == "osd in") {
8470 if (osdmap.is_in(osd)) {
8471 if (verbose)
8472 ss << "osd." << osd << " is already in. ";
8473 } else {
8474 if (osdmap.osd_xinfo[osd].old_weight > 0) {
8475 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
8476 if (pending_inc.new_xinfo.count(osd) == 0) {
8477 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
8478 }
8479 pending_inc.new_xinfo[osd].old_weight = 0;
8480 } else {
8481 pending_inc.new_weight[osd] = CEPH_OSD_IN;
7c673cae 8482 }
31f18b77
FG
8483 ss << "marked in osd." << osd << ". ";
8484 any = true;
8485 }
8486 } else if (prefix == "osd rm") {
8487 err = prepare_command_osd_remove(osd);
8488
8489 if (err == -EBUSY) {
8490 if (any)
8491 ss << ", ";
8492 ss << "osd." << osd << " is still up; must be down before removal. ";
7c673cae 8493 } else {
31f18b77
FG
8494 assert(err == 0);
8495 if (any) {
8496 ss << ", osd." << osd;
8497 } else {
8498 ss << "removed osd." << osd;
8499 }
8500 any = true;
7c673cae 8501 }
31f18b77
FG
8502 }
8503 }
8504 }
8505 if (any) {
8506 getline(ss, rs);
8507 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
8508 get_last_committed() + 1));
8509 return true;
8510 }
8511 } else if (prefix == "osd add-noup" ||
8512 prefix == "osd add-nodown" ||
8513 prefix == "osd add-noin" ||
8514 prefix == "osd add-noout") {
8515
8516 enum {
8517 OP_NOUP,
8518 OP_NODOWN,
8519 OP_NOIN,
8520 OP_NOOUT,
8521 } option;
8522
8523 if (prefix == "osd add-noup") {
8524 option = OP_NOUP;
8525 } else if (prefix == "osd add-nodown") {
8526 option = OP_NODOWN;
8527 } else if (prefix == "osd add-noin") {
8528 option = OP_NOIN;
8529 } else {
8530 option = OP_NOOUT;
8531 }
8532
8533 bool any = false;
8534 bool stop = false;
8535
8536 vector<string> idvec;
8537 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
8538 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
8539
8540 set<int> osds;
8541
8542 // wildcard?
8543 if (j == 0 &&
8544 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
8545 osdmap.get_all_osds(osds);
8546 stop = true;
8547 } else {
8548 // try traditional single osd way
8549
8550 long osd = parse_osd_id(idvec[j].c_str(), &ss);
8551 if (osd < 0) {
8552 // ss has reason for failure
8553 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
8554 err = -EINVAL;
8555 continue;
8556 }
8557
8558 osds.insert(osd);
8559 }
8560
8561 for (auto &osd : osds) {
8562
8563 if (!osdmap.exists(osd)) {
8564 ss << "osd." << osd << " does not exist. ";
8565 continue;
8566 }
8567
8568 switch (option) {
8569 case OP_NOUP:
8570 if (osdmap.is_up(osd)) {
8571 ss << "osd." << osd << " is already up. ";
8572 continue;
8573 }
8574
8575 if (osdmap.is_noup(osd)) {
8576 if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP))
8577 any = true;
7c673cae 8578 } else {
31f18b77
FG
8579 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
8580 any = true;
7c673cae 8581 }
31f18b77
FG
8582
8583 break;
8584
8585 case OP_NODOWN:
8586 if (osdmap.is_down(osd)) {
8587 ss << "osd." << osd << " is already down. ";
8588 continue;
8589 }
8590
8591 if (osdmap.is_nodown(osd)) {
8592 if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN))
8593 any = true;
8594 } else {
8595 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
8596 any = true;
8597 }
8598
8599 break;
8600
8601 case OP_NOIN:
8602 if (osdmap.is_in(osd)) {
8603 ss << "osd." << osd << " is already in. ";
8604 continue;
8605 }
8606
8607 if (osdmap.is_noin(osd)) {
8608 if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN))
8609 any = true;
8610 } else {
8611 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
8612 any = true;
8613 }
8614
8615 break;
8616
8617 case OP_NOOUT:
8618 if (osdmap.is_out(osd)) {
8619 ss << "osd." << osd << " is already out. ";
8620 continue;
8621 }
8622
8623 if (osdmap.is_noout(osd)) {
8624 if (pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT))
8625 any = true;
8626 } else {
8627 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
8628 any = true;
8629 }
8630
8631 break;
8632
8633 default:
8634 assert(0 == "invalid option");
8635 }
7c673cae
FG
8636 }
8637 }
31f18b77 8638
7c673cae
FG
8639 if (any) {
8640 getline(ss, rs);
8641 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
31f18b77
FG
8642 get_last_committed() + 1));
8643 return true;
8644 }
8645 } else if (prefix == "osd rm-noup" ||
8646 prefix == "osd rm-nodown" ||
8647 prefix == "osd rm-noin" ||
8648 prefix == "osd rm-noout") {
8649
8650 enum {
8651 OP_NOUP,
8652 OP_NODOWN,
8653 OP_NOIN,
8654 OP_NOOUT,
8655 } option;
8656
8657 if (prefix == "osd rm-noup") {
8658 option = OP_NOUP;
8659 } else if (prefix == "osd rm-nodown") {
8660 option = OP_NODOWN;
8661 } else if (prefix == "osd rm-noin") {
8662 option = OP_NOIN;
8663 } else {
8664 option = OP_NOOUT;
8665 }
8666
8667 bool any = false;
8668 bool stop = false;
8669
8670 vector<string> idvec;
8671 cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
8672
8673 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
8674
8675 vector<int> osds;
8676
8677 // wildcard?
8678 if (j == 0 &&
8679 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
8680
8681 // touch previous noup/nodown/noin/noout osds only
8682 switch (option) {
8683 case OP_NOUP:
8684 osdmap.get_noup_osds(&osds);
8685 break;
8686 case OP_NODOWN:
8687 osdmap.get_nodown_osds(&osds);
8688 break;
8689 case OP_NOIN:
8690 osdmap.get_noin_osds(&osds);
8691 break;
8692 case OP_NOOUT:
8693 osdmap.get_noout_osds(&osds);
8694 break;
8695 default:
8696 assert(0 == "invalid option");
8697 }
8698
8699 // cancel any pending noup/nodown/noin/noout requests too
8700 vector<int> pending_state_osds;
8701 (void) pending_inc.get_pending_state_osds(&pending_state_osds);
8702 for (auto &p : pending_state_osds) {
8703
8704 switch (option) {
8705 case OP_NOUP:
8706 if (!osdmap.is_noup(p) &&
8707 pending_inc.pending_osd_state_clear(p, CEPH_OSD_NOUP)) {
8708 any = true;
8709 }
8710 break;
8711
8712 case OP_NODOWN:
8713 if (!osdmap.is_nodown(p) &&
8714 pending_inc.pending_osd_state_clear(p, CEPH_OSD_NODOWN)) {
8715 any = true;
8716 }
8717 break;
8718
8719 case OP_NOIN:
8720 if (!osdmap.is_noin(p) &&
8721 pending_inc.pending_osd_state_clear(p, CEPH_OSD_NOIN)) {
8722 any = true;
8723 }
8724 break;
8725
8726 case OP_NOOUT:
8727 if (!osdmap.is_noout(p) &&
8728 pending_inc.pending_osd_state_clear(p, CEPH_OSD_NOOUT)) {
8729 any = true;
8730 }
8731 break;
8732
8733 default:
8734 assert(0 == "invalid option");
8735 }
8736 }
8737
8738 stop = true;
8739 } else {
8740 // try traditional single osd way
8741
8742 long osd = parse_osd_id(idvec[j].c_str(), &ss);
8743 if (osd < 0) {
8744 // ss has reason for failure
8745 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
8746 err = -EINVAL;
8747 continue;
8748 }
8749
8750 osds.push_back(osd);
8751 }
8752
8753 for (auto &osd : osds) {
8754
8755 if (!osdmap.exists(osd)) {
8756 ss << "osd." << osd << " does not exist. ";
8757 continue;
8758 }
8759
8760 switch (option) {
8761 case OP_NOUP:
8762 if (osdmap.is_noup(osd)) {
8763 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
8764 any = true;
8765 } else if (pending_inc.pending_osd_state_clear(
8766 osd, CEPH_OSD_NOUP)) {
8767 any = true;
8768 }
8769 break;
8770
8771 case OP_NODOWN:
8772 if (osdmap.is_nodown(osd)) {
8773 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
8774 any = true;
8775 } else if (pending_inc.pending_osd_state_clear(
8776 osd, CEPH_OSD_NODOWN)) {
8777 any = true;
8778 }
8779 break;
8780
8781 case OP_NOIN:
8782 if (osdmap.is_noin(osd)) {
8783 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
8784 any = true;
8785 } else if (pending_inc.pending_osd_state_clear(
8786 osd, CEPH_OSD_NOIN)) {
8787 any = true;
8788 }
8789 break;
8790
8791 case OP_NOOUT:
8792 if (osdmap.is_noout(osd)) {
8793 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
8794 any = true;
8795 } else if (pending_inc.pending_osd_state_clear(
8796 osd, CEPH_OSD_NOOUT)) {
8797 any = true;
8798 }
8799 break;
8800
8801 default:
8802 assert(0 == "invalid option");
8803 }
8804 }
8805 }
8806
8807 if (any) {
8808 getline(ss, rs);
8809 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
8810 get_last_committed() + 1));
7c673cae
FG
8811 return true;
8812 }
8813 } else if (prefix == "osd pg-temp") {
8814 string pgidstr;
8815 if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
8816 ss << "unable to parse 'pgid' value '"
8817 << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
8818 err = -EINVAL;
8819 goto reply;
8820 }
8821 pg_t pgid;
8822 if (!pgid.parse(pgidstr.c_str())) {
8823 ss << "invalid pgid '" << pgidstr << "'";
8824 err = -EINVAL;
8825 goto reply;
8826 }
8827 if (!osdmap.pg_exists(pgid)) {
8828 ss << "pg " << pgid << " does not exist";
8829 err = -ENOENT;
8830 goto reply;
8831 }
8832 if (pending_inc.new_pg_temp.count(pgid)) {
8833 dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
8834 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
8835 return true;
8836 }
8837
8838 vector<int64_t> id_vec;
8839 vector<int32_t> new_pg_temp;
8840 if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
8841 ss << "unable to parse 'id' value(s) '"
8842 << cmd_vartype_stringify(cmdmap["id"]) << "'";
8843 err = -EINVAL;
8844 goto reply;
8845 }
8846 for (auto osd : id_vec) {
8847 if (!osdmap.exists(osd)) {
8848 ss << "osd." << osd << " does not exist";
8849 err = -ENOENT;
8850 goto reply;
8851 }
8852 new_pg_temp.push_back(osd);
8853 }
8854
224ce89b
WB
8855 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
8856 if ((int)new_pg_temp.size() < pool_min_size) {
8857 ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
8858 << pool_min_size << ")";
8859 err = -EINVAL;
8860 goto reply;
8861 }
8862
8863 int pool_size = osdmap.get_pg_pool_size(pgid);
8864 if ((int)new_pg_temp.size() > pool_size) {
8865 ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
8866 << pool_size << ")";
8867 err = -EINVAL;
8868 goto reply;
8869 }
8870
7c673cae
FG
8871 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
8872 new_pg_temp.begin(), new_pg_temp.end());
8873 ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
8874 goto update;
8875 } else if (prefix == "osd primary-temp") {
8876 string pgidstr;
8877 if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
8878 ss << "unable to parse 'pgid' value '"
8879 << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
8880 err = -EINVAL;
8881 goto reply;
8882 }
8883 pg_t pgid;
8884 if (!pgid.parse(pgidstr.c_str())) {
8885 ss << "invalid pgid '" << pgidstr << "'";
8886 err = -EINVAL;
8887 goto reply;
8888 }
8889 if (!osdmap.pg_exists(pgid)) {
8890 ss << "pg " << pgid << " does not exist";
8891 err = -ENOENT;
8892 goto reply;
8893 }
8894
8895 int64_t osd;
8896 if (!cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
8897 ss << "unable to parse 'id' value '"
8898 << cmd_vartype_stringify(cmdmap["id"]) << "'";
8899 err = -EINVAL;
8900 goto reply;
8901 }
8902 if (osd != -1 && !osdmap.exists(osd)) {
8903 ss << "osd." << osd << " does not exist";
8904 err = -ENOENT;
8905 goto reply;
8906 }
8907
31f18b77
FG
8908 if (osdmap.require_min_compat_client > 0 &&
8909 osdmap.require_min_compat_client < CEPH_RELEASE_FIREFLY) {
8910 ss << "require_min_compat_client "
8911 << ceph_release_name(osdmap.require_min_compat_client)
7c673cae
FG
8912 << " < firefly, which is required for primary-temp";
8913 err = -EPERM;
8914 goto reply;
8915 } else if (!g_conf->mon_osd_allow_primary_temp) {
8916 ss << "you must enable 'mon osd allow primary temp = true' on the mons before you can set primary_temp mappings. note that this is for developers only: older clients/OSDs will break and there is no feature bit infrastructure in place.";
8917 err = -EPERM;
8918 goto reply;
8919 }
8920
8921 pending_inc.new_primary_temp[pgid] = osd;
8922 ss << "set " << pgid << " primary_temp mapping to " << osd;
8923 goto update;
224ce89b
WB
8924 } else if (prefix == "osd pg-upmap" ||
8925 prefix == "osd rm-pg-upmap" ||
8926 prefix == "osd pg-upmap-items" ||
8927 prefix == "osd rm-pg-upmap-items") {
31f18b77 8928 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
224ce89b
WB
8929 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
8930 << "luminous' before using the new interface";
7c673cae
FG
8931 err = -EPERM;
8932 goto reply;
8933 }
31f18b77
FG
8934 if (osdmap.require_min_compat_client < CEPH_RELEASE_LUMINOUS) {
8935 ss << "min_compat_client "
8936 << ceph_release_name(osdmap.require_min_compat_client)
224ce89b
WB
8937 << " < luminous, which is required for pg-upmap. "
8938 << "Try 'ceph osd set-require-min-compat-client luminous' "
8939 << "before using the new interface";
7c673cae
FG
8940 err = -EPERM;
8941 goto reply;
8942 }
8943 err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
8944 if (err == -EAGAIN)
8945 goto wait;
8946 if (err < 0)
8947 goto reply;
8948 string pgidstr;
8949 if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
8950 ss << "unable to parse 'pgid' value '"
8951 << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
8952 err = -EINVAL;
8953 goto reply;
8954 }
8955 pg_t pgid;
8956 if (!pgid.parse(pgidstr.c_str())) {
8957 ss << "invalid pgid '" << pgidstr << "'";
8958 err = -EINVAL;
8959 goto reply;
8960 }
8961 if (!osdmap.pg_exists(pgid)) {
8962 ss << "pg " << pgid << " does not exist";
8963 err = -ENOENT;
8964 goto reply;
8965 }
224ce89b
WB
8966
8967 enum {
8968 OP_PG_UPMAP,
8969 OP_RM_PG_UPMAP,
8970 OP_PG_UPMAP_ITEMS,
8971 OP_RM_PG_UPMAP_ITEMS,
8972 } option;
8973
8974 if (prefix == "osd pg-upmap") {
8975 option = OP_PG_UPMAP;
8976 } else if (prefix == "osd rm-pg-upmap") {
8977 option = OP_RM_PG_UPMAP;
8978 } else if (prefix == "osd pg-upmap-items") {
8979 option = OP_PG_UPMAP_ITEMS;
8980 } else {
8981 option = OP_RM_PG_UPMAP_ITEMS;
7c673cae 8982 }
224ce89b
WB
8983
8984 // check pending upmap changes
8985 switch (option) {
8986 case OP_PG_UPMAP: // fall through
8987 case OP_RM_PG_UPMAP:
8988 if (pending_inc.new_pg_upmap.count(pgid) ||
8989 pending_inc.old_pg_upmap.count(pgid)) {
8990 dout(10) << __func__ << " waiting for pending update on "
8991 << pgid << dendl;
8992 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
8993 return true;
7c673cae 8994 }
224ce89b 8995 break;
7c673cae 8996
224ce89b
WB
8997 case OP_PG_UPMAP_ITEMS: // fall through
8998 case OP_RM_PG_UPMAP_ITEMS:
8999 if (pending_inc.new_pg_upmap_items.count(pgid) ||
9000 pending_inc.old_pg_upmap_items.count(pgid)) {
9001 dout(10) << __func__ << " waiting for pending update on "
9002 << pgid << dendl;
9003 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9004 return true;
9005 }
9006 break;
7c673cae 9007
224ce89b
WB
9008 default:
9009 assert(0 == "invalid option");
7c673cae 9010 }
224ce89b
WB
9011
9012 switch (option) {
9013 case OP_PG_UPMAP:
9014 {
9015 vector<int64_t> id_vec;
9016 if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
9017 ss << "unable to parse 'id' value(s) '"
9018 << cmd_vartype_stringify(cmdmap["id"]) << "'";
9019 err = -EINVAL;
9020 goto reply;
9021 }
9022
9023 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
9024 if ((int)id_vec.size() < pool_min_size) {
9025 ss << "num of osds (" << id_vec.size() <<") < pool min size ("
9026 << pool_min_size << ")";
9027 err = -EINVAL;
9028 goto reply;
9029 }
9030
9031 int pool_size = osdmap.get_pg_pool_size(pgid);
9032 if ((int)id_vec.size() > pool_size) {
9033 ss << "num of osds (" << id_vec.size() <<") > pool size ("
9034 << pool_size << ")";
9035 err = -EINVAL;
9036 goto reply;
9037 }
9038
9039 vector<int32_t> new_pg_upmap;
9040 for (auto osd : id_vec) {
9041 if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
9042 ss << "osd." << osd << " does not exist";
9043 err = -ENOENT;
9044 goto reply;
9045 }
9046 auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
9047 if (it != new_pg_upmap.end()) {
9048 ss << "osd." << osd << " already exists, ";
9049 continue;
9050 }
9051 new_pg_upmap.push_back(osd);
9052 }
9053
9054 if (new_pg_upmap.empty()) {
9055 ss << "no valid upmap items(pairs) is specified";
9056 err = -EINVAL;
9057 goto reply;
9058 }
9059
9060 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
9061 new_pg_upmap.begin(), new_pg_upmap.end());
9062 ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
7c673cae 9063 }
224ce89b
WB
9064 break;
9065
9066 case OP_RM_PG_UPMAP:
9067 {
9068 pending_inc.old_pg_upmap.insert(pgid);
9069 ss << "clear " << pgid << " pg_upmap mapping";
7c673cae 9070 }
224ce89b 9071 break;
7c673cae 9072
224ce89b
WB
9073 case OP_PG_UPMAP_ITEMS:
9074 {
9075 vector<int64_t> id_vec;
9076 if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
9077 ss << "unable to parse 'id' value(s) '"
9078 << cmd_vartype_stringify(cmdmap["id"]) << "'";
9079 err = -EINVAL;
9080 goto reply;
9081 }
9082
9083 if (id_vec.size() % 2) {
9084 ss << "you must specify pairs of osd ids to be remapped";
9085 err = -EINVAL;
9086 goto reply;
9087 }
9088
9089 int pool_size = osdmap.get_pg_pool_size(pgid);
9090 if ((int)(id_vec.size() / 2) > pool_size) {
9091 ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
9092 << pool_size << ")";
9093 err = -EINVAL;
9094 goto reply;
9095 }
9096
9097 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
9098 ostringstream items;
9099 items << "[";
9100 for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
9101 int from = *p++;
9102 int to = *p;
9103 if (from == to) {
9104 ss << "from osd." << from << " == to osd." << to << ", ";
9105 continue;
9106 }
9107 if (!osdmap.exists(from)) {
9108 ss << "osd." << from << " does not exist";
9109 err = -ENOENT;
9110 goto reply;
9111 }
9112 if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
9113 ss << "osd." << to << " does not exist";
9114 err = -ENOENT;
9115 goto reply;
9116 }
9117 new_pg_upmap_items.push_back(make_pair(from, to));
9118 items << from << "->" << to << ",";
9119 }
9120 string out(items.str());
9121 out.resize(out.size() - 1); // drop last ','
9122 out += "]";
9123
9124 if (new_pg_upmap_items.empty()) {
9125 ss << "no valid upmap items(pairs) is specified";
9126 err = -EINVAL;
9127 goto reply;
9128 }
9129
9130 pending_inc.new_pg_upmap_items[pgid] =
9131 mempool::osdmap::vector<pair<int32_t,int32_t>>(
9132 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
9133 ss << "set " << pgid << " pg_upmap_items mapping to " << out;
9134 }
9135 break;
9136
9137 case OP_RM_PG_UPMAP_ITEMS:
9138 {
9139 pending_inc.old_pg_upmap_items.insert(pgid);
9140 ss << "clear " << pgid << " pg_upmap_items mapping";
9141 }
9142 break;
9143
9144 default:
9145 assert(0 == "invalid option");
7c673cae
FG
9146 }
9147
7c673cae
FG
9148 goto update;
9149 } else if (prefix == "osd primary-affinity") {
9150 int64_t id;
9151 if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
9152 ss << "invalid osd id value '"
9153 << cmd_vartype_stringify(cmdmap["id"]) << "'";
9154 err = -EINVAL;
9155 goto reply;
9156 }
9157 double w;
9158 if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
9159 ss << "unable to parse 'weight' value '"
9160 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
9161 err = -EINVAL;
9162 goto reply;
9163 }
9164 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
9165 if (ww < 0L) {
9166 ss << "weight must be >= 0";
9167 err = -EINVAL;
9168 goto reply;
9169 }
31f18b77
FG
9170 if (osdmap.require_min_compat_client > 0 &&
9171 osdmap.require_min_compat_client < CEPH_RELEASE_FIREFLY) {
9172 ss << "require_min_compat_client "
9173 << ceph_release_name(osdmap.require_min_compat_client)
7c673cae
FG
9174 << " < firefly, which is required for primary-affinity";
9175 err = -EPERM;
9176 goto reply;
9177 } else if (!g_conf->mon_osd_allow_primary_affinity) {
9178 ss << "you must enable 'mon osd allow primary affinity = true' on the mons before you can adjust primary-affinity. note that older clients will no longer be able to communicate with the cluster.";
9179 err = -EPERM;
9180 goto reply;
9181 }
9182 err = check_cluster_features(CEPH_FEATURE_OSD_PRIMARY_AFFINITY, ss);
9183 if (err == -EAGAIN)
9184 goto wait;
9185 if (err < 0)
9186 goto reply;
9187 if (osdmap.exists(id)) {
9188 pending_inc.new_primary_affinity[id] = ww;
9189 ss << "set osd." << id << " primary-affinity to " << w << " (" << ios::hex << ww << ios::dec << ")";
9190 getline(ss, rs);
9191 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9192 get_last_committed() + 1));
9193 return true;
9194 } else {
9195 ss << "osd." << id << " does not exist";
9196 err = -ENOENT;
9197 goto reply;
9198 }
9199 } else if (prefix == "osd reweight") {
9200 int64_t id;
9201 if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
9202 ss << "unable to parse osd id value '"
9203 << cmd_vartype_stringify(cmdmap["id"]) << "'";
9204 err = -EINVAL;
9205 goto reply;
9206 }
9207 double w;
9208 if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
9209 ss << "unable to parse weight value '"
9210 << cmd_vartype_stringify(cmdmap["weight"]) << "'";
9211 err = -EINVAL;
9212 goto reply;
9213 }
9214 long ww = (int)((double)CEPH_OSD_IN*w);
9215 if (ww < 0L) {
9216 ss << "weight must be >= 0";
9217 err = -EINVAL;
9218 goto reply;
9219 }
9220 if (osdmap.exists(id)) {
9221 pending_inc.new_weight[id] = ww;
9222 ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
9223 getline(ss, rs);
9224 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9225 get_last_committed() + 1));
9226 return true;
9227 } else {
9228 ss << "osd." << id << " does not exist";
9229 err = -ENOENT;
9230 goto reply;
9231 }
9232 } else if (prefix == "osd reweightn") {
9233 map<int32_t, uint32_t> weights;
9234 err = parse_reweights(g_ceph_context, cmdmap, osdmap, &weights);
9235 if (err) {
9236 ss << "unable to parse 'weights' value '"
9237 << cmd_vartype_stringify(cmdmap["weights"]) << "'";
9238 goto reply;
9239 }
9240 pending_inc.new_weight.insert(weights.begin(), weights.end());
9241 wait_for_finished_proposal(
9242 op,
9243 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
224ce89b 9244 return true;
7c673cae
FG
9245 } else if (prefix == "osd lost") {
9246 int64_t id;
9247 if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
9248 ss << "unable to parse osd id value '"
9249 << cmd_vartype_stringify(cmdmap["id"]) << "'";
9250 err = -EINVAL;
9251 goto reply;
9252 }
9253 string sure;
9254 if (!cmd_getval(g_ceph_context, cmdmap, "sure", sure) || sure != "--yes-i-really-mean-it") {
9255 ss << "are you SURE? this might mean real, permanent data loss. pass "
9256 "--yes-i-really-mean-it if you really do.";
9257 err = -EPERM;
9258 goto reply;
9259 } else if (!osdmap.exists(id)) {
9260 ss << "osd." << id << " does not exist";
9261 err = -ENOENT;
9262 goto reply;
9263 } else if (!osdmap.is_down(id)) {
9264 ss << "osd." << id << " is not down";
9265 err = -EBUSY;
9266 goto reply;
9267 } else {
9268 epoch_t e = osdmap.get_info(id).down_at;
9269 pending_inc.new_lost[id] = e;
9270 ss << "marked osd lost in epoch " << e;
9271 getline(ss, rs);
9272 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9273 get_last_committed() + 1));
9274 return true;
9275 }
9276
31f18b77
FG
9277 } else if (prefix == "osd destroy" || prefix == "osd purge") {
9278 /* Destroying an OSD means that we don't expect to further make use of
9279 * the OSDs data (which may even become unreadable after this operation),
9280 * and that we are okay with scrubbing all its cephx keys and config-key
9281 * data (which may include lockbox keys, thus rendering the osd's data
9282 * unreadable).
9283 *
9284 * The OSD will not be removed. Instead, we will mark it as destroyed,
9285 * such that a subsequent call to `create` will not reuse the osd id.
9286 * This will play into being able to recreate the OSD, at the same
9287 * crush location, with minimal data movement.
9288 */
9289
9290 // make sure authmon is writeable.
9291 if (!mon->authmon()->is_writeable()) {
9292 dout(10) << __func__ << " waiting for auth mon to be writeable for "
9293 << "osd destroy" << dendl;
9294 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
9295 return false;
9296 }
9297
9298 int64_t id;
9299 if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
9300 ss << "unable to parse osd id value '"
9301 << cmd_vartype_stringify(cmdmap["id"]) << "";
9302 err = -EINVAL;
9303 goto reply;
9304 }
9305
9306 bool is_destroy = (prefix == "osd destroy");
9307 if (!is_destroy) {
9308 assert("osd purge" == prefix);
9309 }
9310
9311 string sure;
9312 if (!cmd_getval(g_ceph_context, cmdmap, "sure", sure) ||
9313 sure != "--yes-i-really-mean-it") {
9314 ss << "Are you SURE? This will mean real, permanent data loss, as well "
9315 << "as cephx and lockbox keys. Pass --yes-i-really-mean-it if you "
9316 << "really do.";
9317 err = -EPERM;
9318 goto reply;
9319 } else if (is_destroy && !osdmap.exists(id)) {
9320 ss << "osd." << id << " does not exist";
9321 err = -ENOENT;
9322 goto reply;
9323 } else if (osdmap.is_up(id)) {
9324 ss << "osd." << id << " is not `down`.";
9325 err = -EBUSY;
9326 goto reply;
9327 } else if (is_destroy && osdmap.is_destroyed(id)) {
9328 ss << "destroyed osd." << id;
9329 err = 0;
9330 goto reply;
9331 }
9332
9333 bool goto_reply = false;
9334
9335 paxos->plug();
9336 if (is_destroy) {
9337 err = prepare_command_osd_destroy(id, ss);
9338 // we checked above that it should exist.
9339 assert(err != -ENOENT);
9340 } else {
9341 err = prepare_command_osd_purge(id, ss);
9342 if (err == -ENOENT) {
9343 err = 0;
9344 ss << "osd." << id << " does not exist.";
9345 goto_reply = true;
9346 }
9347 }
9348 paxos->unplug();
9349
9350 if (err < 0 || goto_reply) {
9351 goto reply;
9352 }
9353
9354 if (is_destroy) {
9355 ss << "destroyed osd." << id;
9356 } else {
9357 ss << "purged osd." << id;
9358 }
9359
9360 getline(ss, rs);
9361 wait_for_finished_proposal(op,
9362 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
9363 force_immediate_propose();
9364 return true;
9365
9366 } else if (prefix == "osd new") {
9367
9368 // make sure authmon is writeable.
9369 if (!mon->authmon()->is_writeable()) {
9370 dout(10) << __func__ << " waiting for auth mon to be writeable for "
224ce89b 9371 << "osd new" << dendl;
31f18b77
FG
9372 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
9373 return false;
9374 }
9375
9376 map<string,string> secrets_map;
9377
9378 bufferlist bl = m->get_data();
9379 string secrets_json = bl.to_str();
9380 dout(20) << __func__ << " osd new json = " << secrets_json << dendl;
9381
9382 err = get_json_str_map(secrets_json, ss, &secrets_map);
9383 if (err < 0)
9384 goto reply;
9385
9386 dout(20) << __func__ << " osd new secrets " << secrets_map << dendl;
9387
9388 paxos->plug();
9389 err = prepare_command_osd_new(op, cmdmap, secrets_map, ss, f.get());
9390 paxos->unplug();
9391
9392 if (err < 0) {
9393 goto reply;
9394 }
9395
9396 if (f) {
9397 f->flush(rdata);
9398 } else {
9399 rdata.append(ss);
9400 }
9401
9402 if (err == EEXIST) {
9403 // idempotent operation
9404 err = 0;
9405 goto reply;
9406 }
9407
9408 wait_for_finished_proposal(op,
9409 new Monitor::C_Command(mon, op, 0, rs, rdata,
9410 get_last_committed() + 1));
9411 force_immediate_propose();
9412 return true;
9413
7c673cae 9414 } else if (prefix == "osd create") {
7c673cae
FG
9415
9416 // optional id provided?
31f18b77
FG
9417 int64_t id = -1, cmd_id = -1;
9418 if (cmd_getval(g_ceph_context, cmdmap, "id", cmd_id)) {
9419 if (cmd_id < 0) {
9420 ss << "invalid osd id value '" << cmd_id << "'";
7c673cae
FG
9421 err = -EINVAL;
9422 goto reply;
9423 }
31f18b77 9424 dout(10) << " osd create got id " << cmd_id << dendl;
7c673cae
FG
9425 }
9426
7c673cae
FG
9427 uuid_d uuid;
9428 string uuidstr;
9429 if (cmd_getval(g_ceph_context, cmdmap, "uuid", uuidstr)) {
9430 if (!uuid.parse(uuidstr.c_str())) {
31f18b77
FG
9431 ss << "invalid uuid value '" << uuidstr << "'";
9432 err = -EINVAL;
9433 goto reply;
7c673cae 9434 }
31f18b77
FG
9435 // we only care about the id if we also have the uuid, to
9436 // ensure the operation's idempotency.
9437 id = cmd_id;
7c673cae
FG
9438 }
9439
31f18b77
FG
9440 int32_t new_id = -1;
9441 err = prepare_command_osd_create(id, uuid, &new_id, ss);
9442 if (err < 0) {
9443 if (err == -EAGAIN) {
9444 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9445 return true;
9446 }
9447 // a check has failed; reply to the user.
9448 goto reply;
9449
9450 } else if (err == EEXIST) {
9451 // this is an idempotent operation; we can go ahead and reply.
9452 if (f) {
9453 f->open_object_section("created_osd");
9454 f->dump_int("osdid", new_id);
9455 f->close_section();
9456 f->flush(rdata);
9457 } else {
9458 ss << new_id;
9459 rdata.append(ss);
7c673cae 9460 }
31f18b77
FG
9461 err = 0;
9462 goto reply;
7c673cae
FG
9463 }
9464
31f18b77
FG
9465 do_osd_create(id, uuid, &new_id);
9466
7c673cae
FG
9467 if (f) {
9468 f->open_object_section("created_osd");
31f18b77 9469 f->dump_int("osdid", new_id);
7c673cae
FG
9470 f->close_section();
9471 f->flush(rdata);
9472 } else {
31f18b77 9473 ss << new_id;
7c673cae
FG
9474 rdata.append(ss);
9475 }
31f18b77
FG
9476 wait_for_finished_proposal(op,
9477 new Monitor::C_Command(mon, op, 0, rs, rdata,
9478 get_last_committed() + 1));
7c673cae
FG
9479 return true;
9480
9481 } else if (prefix == "osd blacklist clear") {
9482 pending_inc.new_blacklist.clear();
9483 std::list<std::pair<entity_addr_t,utime_t > > blacklist;
9484 osdmap.get_blacklist(&blacklist);
9485 for (const auto &entry : blacklist) {
9486 pending_inc.old_blacklist.push_back(entry.first);
9487 }
9488 ss << " removed all blacklist entries";
9489 getline(ss, rs);
9490 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9491 get_last_committed() + 1));
9492 return true;
9493 } else if (prefix == "osd blacklist") {
9494 string addrstr;
9495 cmd_getval(g_ceph_context, cmdmap, "addr", addrstr);
9496 entity_addr_t addr;
9497 if (!addr.parse(addrstr.c_str(), 0)) {
9498 ss << "unable to parse address " << addrstr;
9499 err = -EINVAL;
9500 goto reply;
9501 }
9502 else {
9503 string blacklistop;
9504 cmd_getval(g_ceph_context, cmdmap, "blacklistop", blacklistop);
9505 if (blacklistop == "add") {
9506 utime_t expires = ceph_clock_now();
9507 double d;
9508 // default one hour
224ce89b
WB
9509 cmd_getval(g_ceph_context, cmdmap, "expire", d,
9510 g_conf->mon_osd_blacklist_default_expire);
7c673cae
FG
9511 expires += d;
9512
9513 pending_inc.new_blacklist[addr] = expires;
224ce89b
WB
9514
9515 {
9516 // cancel any pending un-blacklisting request too
9517 auto it = std::find(pending_inc.old_blacklist.begin(),
9518 pending_inc.old_blacklist.end(), addr);
9519 if (it != pending_inc.old_blacklist.end()) {
9520 pending_inc.old_blacklist.erase(it);
9521 }
9522 }
9523
7c673cae
FG
9524 ss << "blacklisting " << addr << " until " << expires << " (" << d << " sec)";
9525 getline(ss, rs);
9526 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9527 get_last_committed() + 1));
9528 return true;
9529 } else if (blacklistop == "rm") {
9530 if (osdmap.is_blacklisted(addr) ||
9531 pending_inc.new_blacklist.count(addr)) {
9532 if (osdmap.is_blacklisted(addr))
9533 pending_inc.old_blacklist.push_back(addr);
9534 else
9535 pending_inc.new_blacklist.erase(addr);
9536 ss << "un-blacklisting " << addr;
9537 getline(ss, rs);
9538 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9539 get_last_committed() + 1));
9540 return true;
9541 }
9542 ss << addr << " isn't blacklisted";
9543 err = 0;
9544 goto reply;
9545 }
9546 }
9547 } else if (prefix == "osd pool mksnap") {
9548 string poolstr;
9549 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
9550 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
9551 if (pool < 0) {
9552 ss << "unrecognized pool '" << poolstr << "'";
9553 err = -ENOENT;
9554 goto reply;
9555 }
9556 string snapname;
9557 cmd_getval(g_ceph_context, cmdmap, "snap", snapname);
9558 const pg_pool_t *p = osdmap.get_pg_pool(pool);
9559 if (p->is_unmanaged_snaps_mode()) {
9560 ss << "pool " << poolstr << " is in unmanaged snaps mode";
9561 err = -EINVAL;
9562 goto reply;
9563 } else if (p->snap_exists(snapname.c_str())) {
9564 ss << "pool " << poolstr << " snap " << snapname << " already exists";
9565 err = 0;
9566 goto reply;
9567 } else if (p->is_tier()) {
9568 ss << "pool " << poolstr << " is a cache tier";
9569 err = -EINVAL;
9570 goto reply;
9571 }
9572 pg_pool_t *pp = 0;
9573 if (pending_inc.new_pools.count(pool))
9574 pp = &pending_inc.new_pools[pool];
9575 if (!pp) {
9576 pp = &pending_inc.new_pools[pool];
9577 *pp = *p;
9578 }
9579 if (pp->snap_exists(snapname.c_str())) {
9580 ss << "pool " << poolstr << " snap " << snapname << " already exists";
9581 } else {
9582 pp->add_snap(snapname.c_str(), ceph_clock_now());
9583 pp->set_snap_epoch(pending_inc.epoch);
9584 ss << "created pool " << poolstr << " snap " << snapname;
9585 }
9586 getline(ss, rs);
9587 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9588 get_last_committed() + 1));
9589 return true;
9590 } else if (prefix == "osd pool rmsnap") {
9591 string poolstr;
9592 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
9593 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
9594 if (pool < 0) {
9595 ss << "unrecognized pool '" << poolstr << "'";
9596 err = -ENOENT;
9597 goto reply;
9598 }
9599 string snapname;
9600 cmd_getval(g_ceph_context, cmdmap, "snap", snapname);
9601 const pg_pool_t *p = osdmap.get_pg_pool(pool);
9602 if (p->is_unmanaged_snaps_mode()) {
9603 ss << "pool " << poolstr << " is in unmanaged snaps mode";
9604 err = -EINVAL;
9605 goto reply;
9606 } else if (!p->snap_exists(snapname.c_str())) {
9607 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
9608 err = 0;
9609 goto reply;
9610 }
9611 pg_pool_t *pp = 0;
9612 if (pending_inc.new_pools.count(pool))
9613 pp = &pending_inc.new_pools[pool];
9614 if (!pp) {
9615 pp = &pending_inc.new_pools[pool];
9616 *pp = *p;
9617 }
9618 snapid_t sn = pp->snap_exists(snapname.c_str());
9619 if (sn) {
9620 pp->remove_snap(sn);
9621 pp->set_snap_epoch(pending_inc.epoch);
9622 ss << "removed pool " << poolstr << " snap " << snapname;
9623 } else {
9624 ss << "already removed pool " << poolstr << " snap " << snapname;
9625 }
9626 getline(ss, rs);
9627 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9628 get_last_committed() + 1));
9629 return true;
9630 } else if (prefix == "osd pool create") {
9631 int64_t pg_num;
9632 int64_t pgp_num;
9633 cmd_getval(g_ceph_context, cmdmap, "pg_num", pg_num, int64_t(0));
9634 cmd_getval(g_ceph_context, cmdmap, "pgp_num", pgp_num, pg_num);
9635
9636 string pool_type_str;
9637 cmd_getval(g_ceph_context, cmdmap, "pool_type", pool_type_str);
9638 if (pool_type_str.empty())
224ce89b 9639 pool_type_str = g_conf->osd_pool_default_type;
7c673cae
FG
9640
9641 string poolstr;
9642 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
9643 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
9644 if (pool_id >= 0) {
9645 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
9646 if (pool_type_str != p->get_type_name()) {
9647 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
9648 err = -EINVAL;
9649 } else {
9650 ss << "pool '" << poolstr << "' already exists";
9651 err = 0;
9652 }
9653 goto reply;
9654 }
9655
9656 int pool_type;
9657 if (pool_type_str == "replicated") {
9658 pool_type = pg_pool_t::TYPE_REPLICATED;
9659 } else if (pool_type_str == "erasure") {
9660 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2 |
9661 CEPH_FEATURE_OSD_ERASURE_CODES,
9662 ss);
9663 if (err == -EAGAIN)
9664 goto wait;
9665 if (err)
9666 goto reply;
9667 pool_type = pg_pool_t::TYPE_ERASURE;
9668 } else {
9669 ss << "unknown pool type '" << pool_type_str << "'";
9670 err = -EINVAL;
9671 goto reply;
9672 }
9673
31f18b77
FG
9674 bool implicit_rule_creation = false;
9675 string rule_name;
9676 cmd_getval(g_ceph_context, cmdmap, "rule", rule_name);
7c673cae
FG
9677 string erasure_code_profile;
9678 cmd_getval(g_ceph_context, cmdmap, "erasure_code_profile", erasure_code_profile);
9679
9680 if (pool_type == pg_pool_t::TYPE_ERASURE) {
9681 if (erasure_code_profile == "")
9682 erasure_code_profile = "default";
9683 //handle the erasure code profile
9684 if (erasure_code_profile == "default") {
9685 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
9686 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
9687 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
9688 goto wait;
9689 }
9690
9691 map<string,string> profile_map;
9692 err = osdmap.get_erasure_code_profile_default(g_ceph_context,
9693 profile_map,
9694 &ss);
9695 if (err)
9696 goto reply;
9697 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
9698 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
9699 goto wait;
9700 }
9701 }
31f18b77
FG
9702 if (rule_name == "") {
9703 implicit_rule_creation = true;
7c673cae 9704 if (erasure_code_profile == "default") {
31f18b77 9705 rule_name = "erasure-code";
7c673cae 9706 } else {
31f18b77 9707 dout(1) << "implicitly use rule named after the pool: "
7c673cae 9708 << poolstr << dendl;
31f18b77 9709 rule_name = poolstr;
7c673cae
FG
9710 }
9711 }
9712 } else {
31f18b77
FG
9713 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
9714 rule_name = erasure_code_profile;
7c673cae
FG
9715 }
9716
31f18b77
FG
9717 if (!implicit_rule_creation && rule_name != "") {
9718 int rule;
9719 err = get_crush_rule(rule_name, &rule, &ss);
7c673cae
FG
9720 if (err == -EAGAIN) {
9721 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9722 return true;
9723 }
9724 if (err)
9725 goto reply;
9726 }
9727
9728 int64_t expected_num_objects;
9729 cmd_getval(g_ceph_context, cmdmap, "expected_num_objects", expected_num_objects, int64_t(0));
9730 if (expected_num_objects < 0) {
9731 ss << "'expected_num_objects' must be non-negative";
9732 err = -EINVAL;
9733 goto reply;
9734 }
9735
9736 int64_t fast_read_param;
9737 cmd_getval(g_ceph_context, cmdmap, "fast_read", fast_read_param, int64_t(-1));
9738 FastReadType fast_read = FAST_READ_DEFAULT;
9739 if (fast_read_param == 0)
9740 fast_read = FAST_READ_OFF;
9741 else if (fast_read_param > 0)
9742 fast_read = FAST_READ_ON;
9743
9744 err = prepare_new_pool(poolstr, 0, // auid=0 for admin created pool
9745 -1, // default crush rule
31f18b77 9746 rule_name,
7c673cae
FG
9747 pg_num, pgp_num,
9748 erasure_code_profile, pool_type,
9749 (uint64_t)expected_num_objects,
9750 fast_read,
9751 &ss);
9752 if (err < 0) {
9753 switch(err) {
9754 case -EEXIST:
9755 ss << "pool '" << poolstr << "' already exists";
9756 break;
9757 case -EAGAIN:
9758 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9759 return true;
9760 case -ERANGE:
9761 goto reply;
9762 default:
9763 goto reply;
9764 break;
9765 }
9766 } else {
9767 ss << "pool '" << poolstr << "' created";
9768 }
9769 getline(ss, rs);
9770 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9771 get_last_committed() + 1));
9772 return true;
9773
9774 } else if (prefix == "osd pool delete" ||
9775 prefix == "osd pool rm") {
9776 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
9777 string poolstr, poolstr2, sure;
9778 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
9779 cmd_getval(g_ceph_context, cmdmap, "pool2", poolstr2);
9780 cmd_getval(g_ceph_context, cmdmap, "sure", sure);
9781 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
9782 if (pool < 0) {
9783 ss << "pool '" << poolstr << "' does not exist";
9784 err = 0;
9785 goto reply;
9786 }
9787
9788 bool force_no_fake = sure == "--yes-i-really-really-mean-it-not-faking";
9789 if (poolstr2 != poolstr ||
9790 (sure != "--yes-i-really-really-mean-it" && !force_no_fake)) {
9791 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
9792 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
9793 << "followed by --yes-i-really-really-mean-it.";
9794 err = -EPERM;
9795 goto reply;
9796 }
9797 err = _prepare_remove_pool(pool, &ss, force_no_fake);
9798 if (err == -EAGAIN) {
9799 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9800 return true;
9801 }
9802 if (err < 0)
9803 goto reply;
9804 goto update;
9805 } else if (prefix == "osd pool rename") {
9806 string srcpoolstr, destpoolstr;
9807 cmd_getval(g_ceph_context, cmdmap, "srcpool", srcpoolstr);
9808 cmd_getval(g_ceph_context, cmdmap, "destpool", destpoolstr);
9809 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
9810 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
9811
9812 if (pool_src < 0) {
9813 if (pool_dst >= 0) {
9814 // src pool doesn't exist, dst pool does exist: to ensure idempotency
9815 // of operations, assume this rename succeeded, as it is not changing
9816 // the current state. Make sure we output something understandable
9817 // for whoever is issuing the command, if they are paying attention,
9818 // in case it was not intentional; or to avoid a "wtf?" and a bug
9819 // report in case it was intentional, while expecting a failure.
9820 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
9821 << destpoolstr << "' does -- assuming successful rename";
9822 err = 0;
9823 } else {
9824 ss << "unrecognized pool '" << srcpoolstr << "'";
9825 err = -ENOENT;
9826 }
9827 goto reply;
9828 } else if (pool_dst >= 0) {
9829 // source pool exists and so does the destination pool
9830 ss << "pool '" << destpoolstr << "' already exists";
9831 err = -EEXIST;
9832 goto reply;
9833 }
9834
9835 int ret = _prepare_rename_pool(pool_src, destpoolstr);
9836 if (ret == 0) {
9837 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
9838 } else {
9839 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
9840 << cpp_strerror(ret);
9841 }
9842 getline(ss, rs);
9843 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
9844 get_last_committed() + 1));
9845 return true;
9846
9847 } else if (prefix == "osd pool set") {
9848 err = prepare_command_pool_set(cmdmap, ss);
9849 if (err == -EAGAIN)
9850 goto wait;
9851 if (err < 0)
9852 goto reply;
9853
9854 getline(ss, rs);
9855 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9856 get_last_committed() + 1));
9857 return true;
9858 } else if (prefix == "osd tier add") {
9859 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
9860 if (err == -EAGAIN)
9861 goto wait;
9862 if (err)
9863 goto reply;
9864 string poolstr;
9865 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
9866 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
9867 if (pool_id < 0) {
9868 ss << "unrecognized pool '" << poolstr << "'";
9869 err = -ENOENT;
9870 goto reply;
9871 }
9872 string tierpoolstr;
9873 cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
9874 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
9875 if (tierpool_id < 0) {
9876 ss << "unrecognized pool '" << tierpoolstr << "'";
9877 err = -ENOENT;
9878 goto reply;
9879 }
9880 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
9881 assert(p);
9882 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
9883 assert(tp);
9884
9885 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
9886 goto reply;
9887 }
9888
9889 // make sure new tier is empty
9890 string force_nonempty;
9891 cmd_getval(g_ceph_context, cmdmap, "force_nonempty", force_nonempty);
31f18b77
FG
9892 const pool_stat_t *pstats = mon->pgservice->get_pool_stat(tierpool_id);
9893 if (pstats && pstats->stats.sum.num_objects != 0 &&
7c673cae
FG
9894 force_nonempty != "--force-nonempty") {
9895 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
9896 err = -ENOTEMPTY;
9897 goto reply;
9898 }
9899 if (tp->ec_pool()) {
9900 ss << "tier pool '" << tierpoolstr
9901 << "' is an ec pool, which cannot be a tier";
9902 err = -ENOTSUP;
9903 goto reply;
9904 }
9905 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
9906 ((force_nonempty != "--force-nonempty") ||
9907 (!g_conf->mon_debug_unsafe_allow_tier_with_nonempty_snaps))) {
9908 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
9909 err = -ENOTEMPTY;
9910 goto reply;
9911 }
9912 // go
9913 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
9914 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
9915 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
9916 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9917 return true;
9918 }
9919 np->tiers.insert(tierpool_id);
9920 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
9921 ntp->tier_of = pool_id;
9922 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
9923 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
9924 get_last_committed() + 1));
9925 return true;
9926 } else if (prefix == "osd tier remove" ||
9927 prefix == "osd tier rm") {
9928 string poolstr;
9929 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
9930 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
9931 if (pool_id < 0) {
9932 ss << "unrecognized pool '" << poolstr << "'";
9933 err = -ENOENT;
9934 goto reply;
9935 }
9936 string tierpoolstr;
9937 cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
9938 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
9939 if (tierpool_id < 0) {
9940 ss << "unrecognized pool '" << tierpoolstr << "'";
9941 err = -ENOENT;
9942 goto reply;
9943 }
9944 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
9945 assert(p);
9946 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
9947 assert(tp);
9948
9949 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
9950 goto reply;
9951 }
9952
9953 if (p->tiers.count(tierpool_id) == 0) {
9954 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
9955 err = 0;
9956 goto reply;
9957 }
9958 if (tp->tier_of != pool_id) {
9959 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
9960 << osdmap.get_pool_name(tp->tier_of) << "': "
9961 // be scary about it; this is an inconsistency and bells must go off
9962 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
9963 err = -EINVAL;
9964 goto reply;
9965 }
9966 if (p->read_tier == tierpool_id) {
9967 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
9968 err = -EBUSY;
9969 goto reply;
9970 }
9971 // go
9972 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
9973 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
9974 if (np->tiers.count(tierpool_id) == 0 ||
9975 ntp->tier_of != pool_id ||
9976 np->read_tier == tierpool_id) {
9977 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9978 return true;
9979 }
9980 np->tiers.erase(tierpool_id);
9981 ntp->clear_tier();
9982 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
9983 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
9984 get_last_committed() + 1));
9985 return true;
9986 } else if (prefix == "osd tier set-overlay") {
9987 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
9988 if (err == -EAGAIN)
9989 goto wait;
9990 if (err)
9991 goto reply;
9992 string poolstr;
9993 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
9994 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
9995 if (pool_id < 0) {
9996 ss << "unrecognized pool '" << poolstr << "'";
9997 err = -ENOENT;
9998 goto reply;
9999 }
10000 string overlaypoolstr;
10001 cmd_getval(g_ceph_context, cmdmap, "overlaypool", overlaypoolstr);
10002 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
10003 if (overlaypool_id < 0) {
10004 ss << "unrecognized pool '" << overlaypoolstr << "'";
10005 err = -ENOENT;
10006 goto reply;
10007 }
10008 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10009 assert(p);
10010 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
10011 assert(overlay_p);
10012 if (p->tiers.count(overlaypool_id) == 0) {
10013 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
10014 err = -EINVAL;
10015 goto reply;
10016 }
10017 if (p->read_tier == overlaypool_id) {
10018 err = 0;
10019 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
10020 goto reply;
10021 }
10022 if (p->has_read_tier()) {
10023 ss << "pool '" << poolstr << "' has overlay '"
10024 << osdmap.get_pool_name(p->read_tier)
10025 << "'; please remove-overlay first";
10026 err = -EINVAL;
10027 goto reply;
10028 }
10029
10030 // go
10031 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
10032 np->read_tier = overlaypool_id;
10033 np->write_tier = overlaypool_id;
10034 np->set_last_force_op_resend(pending_inc.epoch);
10035 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
10036 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
10037 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
10038 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
10039 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
10040 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
10041 get_last_committed() + 1));
10042 return true;
10043 } else if (prefix == "osd tier remove-overlay" ||
10044 prefix == "osd tier rm-overlay") {
10045 string poolstr;
10046 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10047 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10048 if (pool_id < 0) {
10049 ss << "unrecognized pool '" << poolstr << "'";
10050 err = -ENOENT;
10051 goto reply;
10052 }
10053 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10054 assert(p);
10055 if (!p->has_read_tier()) {
10056 err = 0;
10057 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
10058 goto reply;
10059 }
10060
10061 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
10062 goto reply;
10063 }
10064
10065 // go
10066 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
10067 if (np->has_read_tier()) {
10068 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
10069 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
10070 nop->set_last_force_op_resend(pending_inc.epoch);
10071 }
10072 if (np->has_write_tier()) {
10073 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
10074 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
10075 nop->set_last_force_op_resend(pending_inc.epoch);
10076 }
10077 np->clear_read_tier();
10078 np->clear_write_tier();
10079 np->set_last_force_op_resend(pending_inc.epoch);
10080 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
10081 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
10082 get_last_committed() + 1));
10083 return true;
10084 } else if (prefix == "osd tier cache-mode") {
10085 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
10086 if (err == -EAGAIN)
10087 goto wait;
10088 if (err)
10089 goto reply;
10090 string poolstr;
10091 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10092 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10093 if (pool_id < 0) {
10094 ss << "unrecognized pool '" << poolstr << "'";
10095 err = -ENOENT;
10096 goto reply;
10097 }
10098 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10099 assert(p);
10100 if (!p->is_tier()) {
10101 ss << "pool '" << poolstr << "' is not a tier";
10102 err = -EINVAL;
10103 goto reply;
10104 }
10105 string modestr;
10106 cmd_getval(g_ceph_context, cmdmap, "mode", modestr);
10107 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
10108 if (mode < 0) {
10109 ss << "'" << modestr << "' is not a valid cache mode";
10110 err = -EINVAL;
10111 goto reply;
10112 }
10113
10114 string sure;
10115 cmd_getval(g_ceph_context, cmdmap, "sure", sure);
10116 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
10117 mode != pg_pool_t::CACHEMODE_NONE &&
10118 mode != pg_pool_t::CACHEMODE_PROXY &&
10119 mode != pg_pool_t::CACHEMODE_READPROXY) &&
10120 sure != "--yes-i-really-mean-it") {
10121 ss << "'" << modestr << "' is not a well-supported cache mode and may "
10122 << "corrupt your data. pass --yes-i-really-mean-it to force.";
10123 err = -EPERM;
10124 goto reply;
10125 }
10126
10127 // pool already has this cache-mode set and there are no pending changes
10128 if (p->cache_mode == mode &&
10129 (pending_inc.new_pools.count(pool_id) == 0 ||
10130 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
10131 ss << "set cache-mode for pool '" << poolstr << "'"
10132 << " to " << pg_pool_t::get_cache_mode_name(mode);
10133 err = 0;
10134 goto reply;
10135 }
10136
10137 /* Mode description:
10138 *
10139 * none: No cache-mode defined
10140 * forward: Forward all reads and writes to base pool
10141 * writeback: Cache writes, promote reads from base pool
10142 * readonly: Forward writes to base pool
10143 * readforward: Writes are in writeback mode, Reads are in forward mode
10144 * proxy: Proxy all reads and writes to base pool
10145 * readproxy: Writes are in writeback mode, Reads are in proxy mode
10146 *
10147 * Hence, these are the allowed transitions:
10148 *
10149 * none -> any
10150 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
10151 * proxy -> forward || readforward || readproxy || writeback || any IF num_objects_dirty == 0
10152 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
10153 * readproxy -> forward || proxy || readforward || writeback || any IF num_objects_dirty == 0
10154 * writeback -> readforward || readproxy || forward || proxy
10155 * readonly -> any
10156 */
10157
10158 // We check if the transition is valid against the current pool mode, as
10159 // it is the only committed state thus far. We will blantly squash
10160 // whatever mode is on the pending state.
10161
10162 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
10163 (mode != pg_pool_t::CACHEMODE_FORWARD &&
10164 mode != pg_pool_t::CACHEMODE_PROXY &&
10165 mode != pg_pool_t::CACHEMODE_READFORWARD &&
10166 mode != pg_pool_t::CACHEMODE_READPROXY)) {
10167 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
10168 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
10169 << "' pool; only '"
10170 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_FORWARD)
10171 << "','"
10172 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY)
10173 << "','"
10174 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READFORWARD)
10175 << "','"
10176 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
10177 << "' allowed.";
10178 err = -EINVAL;
10179 goto reply;
10180 }
10181 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
10182 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
10183 mode != pg_pool_t::CACHEMODE_FORWARD &&
10184 mode != pg_pool_t::CACHEMODE_PROXY &&
10185 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
10186
10187 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
10188 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
10189 mode != pg_pool_t::CACHEMODE_FORWARD &&
10190 mode != pg_pool_t::CACHEMODE_READFORWARD &&
10191 mode != pg_pool_t::CACHEMODE_PROXY)) ||
10192
10193 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
10194 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
10195 mode != pg_pool_t::CACHEMODE_FORWARD &&
10196 mode != pg_pool_t::CACHEMODE_READFORWARD &&
10197 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
10198
10199 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
10200 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
10201 mode != pg_pool_t::CACHEMODE_READFORWARD &&
10202 mode != pg_pool_t::CACHEMODE_PROXY &&
10203 mode != pg_pool_t::CACHEMODE_READPROXY))) {
10204
31f18b77
FG
10205 const pool_stat_t* pstats =
10206 mon->pgservice->get_pool_stat(pool_id);
7c673cae 10207
31f18b77 10208 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
7c673cae
FG
10209 ss << "unable to set cache-mode '"
10210 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
10211 << "': dirty objects found";
10212 err = -EBUSY;
10213 goto reply;
10214 }
10215 }
10216 // go
10217 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
10218 np->cache_mode = mode;
10219 // set this both when moving to and from cache_mode NONE. this is to
10220 // capture legacy pools that were set up before this flag existed.
10221 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
10222 ss << "set cache-mode for pool '" << poolstr
10223 << "' to " << pg_pool_t::get_cache_mode_name(mode);
10224 if (mode == pg_pool_t::CACHEMODE_NONE) {
10225 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
10226 assert(base_pool);
10227 if (base_pool->read_tier == pool_id ||
10228 base_pool->write_tier == pool_id)
10229 ss <<" (WARNING: pool is still configured as read or write tier)";
10230 }
10231 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
10232 get_last_committed() + 1));
10233 return true;
10234 } else if (prefix == "osd tier add-cache") {
10235 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
10236 if (err == -EAGAIN)
10237 goto wait;
10238 if (err)
10239 goto reply;
10240 string poolstr;
10241 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10242 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10243 if (pool_id < 0) {
10244 ss << "unrecognized pool '" << poolstr << "'";
10245 err = -ENOENT;
10246 goto reply;
10247 }
10248 string tierpoolstr;
10249 cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
10250 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
10251 if (tierpool_id < 0) {
10252 ss << "unrecognized pool '" << tierpoolstr << "'";
10253 err = -ENOENT;
10254 goto reply;
10255 }
10256 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
10257 assert(p);
10258 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
10259 assert(tp);
10260
10261 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
10262 goto reply;
10263 }
10264
10265 int64_t size = 0;
10266 if (!cmd_getval(g_ceph_context, cmdmap, "size", size)) {
10267 ss << "unable to parse 'size' value '"
10268 << cmd_vartype_stringify(cmdmap["size"]) << "'";
10269 err = -EINVAL;
10270 goto reply;
10271 }
10272 // make sure new tier is empty
31f18b77
FG
10273 const pool_stat_t *pstats =
10274 mon->pgservice->get_pool_stat(tierpool_id);
10275 if (pstats && pstats->stats.sum.num_objects != 0) {
7c673cae
FG
10276 ss << "tier pool '" << tierpoolstr << "' is not empty";
10277 err = -ENOTEMPTY;
10278 goto reply;
10279 }
10280 string modestr = g_conf->osd_tier_default_cache_mode;
10281 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
10282 if (mode < 0) {
10283 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
10284 err = -EINVAL;
10285 goto reply;
10286 }
10287 HitSet::Params hsp;
10288 if (g_conf->osd_tier_default_cache_hit_set_type == "bloom") {
10289 BloomHitSet::Params *bsp = new BloomHitSet::Params;
10290 bsp->set_fpp(g_conf->osd_pool_default_hit_set_bloom_fpp);
10291 hsp = HitSet::Params(bsp);
10292 } else if (g_conf->osd_tier_default_cache_hit_set_type == "explicit_hash") {
10293 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
10294 }
10295 else if (g_conf->osd_tier_default_cache_hit_set_type == "explicit_object") {
10296 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
10297 } else {
10298 ss << "osd tier cache default hit set type '" <<
10299 g_conf->osd_tier_default_cache_hit_set_type << "' is not a known type";
10300 err = -EINVAL;
10301 goto reply;
10302 }
10303 // go
10304 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
10305 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
10306 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
10307 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10308 return true;
10309 }
10310 np->tiers.insert(tierpool_id);
10311 np->read_tier = np->write_tier = tierpool_id;
10312 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
10313 np->set_last_force_op_resend(pending_inc.epoch);
10314 ntp->set_last_force_op_resend(pending_inc.epoch);
10315 ntp->tier_of = pool_id;
10316 ntp->cache_mode = mode;
10317 ntp->hit_set_count = g_conf->osd_tier_default_cache_hit_set_count;
10318 ntp->hit_set_period = g_conf->osd_tier_default_cache_hit_set_period;
10319 ntp->min_read_recency_for_promote = g_conf->osd_tier_default_cache_min_read_recency_for_promote;
10320 ntp->min_write_recency_for_promote = g_conf->osd_tier_default_cache_min_write_recency_for_promote;
10321 ntp->hit_set_grade_decay_rate = g_conf->osd_tier_default_cache_hit_set_grade_decay_rate;
10322 ntp->hit_set_search_last_n = g_conf->osd_tier_default_cache_hit_set_search_last_n;
10323 ntp->hit_set_params = hsp;
10324 ntp->target_max_bytes = size;
10325 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
10326 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
10327 get_last_committed() + 1));
10328 return true;
10329 } else if (prefix == "osd pool set-quota") {
10330 string poolstr;
10331 cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
10332 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
10333 if (pool_id < 0) {
10334 ss << "unrecognized pool '" << poolstr << "'";
10335 err = -ENOENT;
10336 goto reply;
10337 }
10338
10339 string field;
10340 cmd_getval(g_ceph_context, cmdmap, "field", field);
10341 if (field != "max_objects" && field != "max_bytes") {
10342 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
10343 err = -EINVAL;
10344 goto reply;
10345 }
10346
10347 // val could contain unit designations, so we treat as a string
10348 string val;
10349 cmd_getval(g_ceph_context, cmdmap, "val", val);
10350 stringstream tss;
10351 int64_t value = unit_to_bytesize(val, &tss);
10352 if (value < 0) {
10353 ss << "error parsing value '" << value << "': " << tss.str();
10354 err = value;
10355 goto reply;
10356 }
10357
10358 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
10359 if (field == "max_objects") {
10360 pi->quota_max_objects = value;
10361 } else if (field == "max_bytes") {
10362 pi->quota_max_bytes = value;
10363 } else {
10364 assert(0 == "unrecognized option");
10365 }
10366 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
10367 rs = ss.str();
10368 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10369 get_last_committed() + 1));
10370 return true;
10371
10372 } else if (prefix == "osd reweight-by-pg" ||
10373 prefix == "osd reweight-by-utilization" ||
10374 prefix == "osd test-reweight-by-pg" ||
10375 prefix == "osd test-reweight-by-utilization") {
10376 bool by_pg =
10377 prefix == "osd reweight-by-pg" || prefix == "osd test-reweight-by-pg";
10378 bool dry_run =
10379 prefix == "osd test-reweight-by-pg" ||
10380 prefix == "osd test-reweight-by-utilization";
10381 int64_t oload;
10382 cmd_getval(g_ceph_context, cmdmap, "oload", oload, int64_t(120));
10383 set<int64_t> pools;
10384 vector<string> poolnamevec;
10385 cmd_getval(g_ceph_context, cmdmap, "pools", poolnamevec);
10386 for (unsigned j = 0; j < poolnamevec.size(); j++) {
10387 int64_t pool = osdmap.lookup_pg_pool_name(poolnamevec[j]);
10388 if (pool < 0) {
10389 ss << "pool '" << poolnamevec[j] << "' does not exist";
10390 err = -ENOENT;
10391 goto reply;
10392 }
10393 pools.insert(pool);
10394 }
10395 double max_change = g_conf->mon_reweight_max_change;
10396 cmd_getval(g_ceph_context, cmdmap, "max_change", max_change);
10397 if (max_change <= 0.0) {
10398 ss << "max_change " << max_change << " must be positive";
10399 err = -EINVAL;
10400 goto reply;
10401 }
10402 int64_t max_osds = g_conf->mon_reweight_max_osds;
10403 cmd_getval(g_ceph_context, cmdmap, "max_osds", max_osds);
10404 if (max_osds <= 0) {
10405 ss << "max_osds " << max_osds << " must be positive";
10406 err = -EINVAL;
10407 goto reply;
10408 }
10409 string no_increasing;
10410 cmd_getval(g_ceph_context, cmdmap, "no_increasing", no_increasing);
10411 string out_str;
10412 mempool::osdmap::map<int32_t, uint32_t> new_weights;
31f18b77
FG
10413 err = mon->pgservice->reweight_by_utilization(osdmap,
10414 oload,
10415 max_change,
10416 max_osds,
10417 by_pg,
10418 pools.empty() ? NULL : &pools,
10419 no_increasing == "--no-increasing",
10420 &new_weights,
10421 &ss, &out_str, f.get());
7c673cae
FG
10422 if (err >= 0) {
10423 dout(10) << "reweight::by_utilization: finished with " << out_str << dendl;
10424 }
10425 if (f)
10426 f->flush(rdata);
10427 else
10428 rdata.append(out_str);
10429 if (err < 0) {
10430 ss << "FAILED reweight-by-pg";
10431 } else if (err == 0 || dry_run) {
10432 ss << "no change";
10433 } else {
10434 ss << "SUCCESSFUL reweight-by-pg";
10435 pending_inc.new_weight = std::move(new_weights);
10436 wait_for_finished_proposal(
10437 op,
10438 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
10439 return true;
10440 }
10441 } else {
10442 err = -EINVAL;
10443 }
10444
10445 reply:
10446 getline(ss, rs);
10447 if (err < 0 && rs.length() == 0)
10448 rs = cpp_strerror(err);
10449 mon->reply_command(op, err, rs, rdata, get_last_committed());
10450 return ret;
10451
10452 update:
10453 getline(ss, rs);
10454 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10455 get_last_committed() + 1));
10456 return true;
10457
10458 wait:
10459 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10460 return true;
10461}
10462
bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
{
  // Preprocess a client MPoolOp before it reaches the update (paxos) path.
  // Return true when the op was fully handled here (a reply was sent and no
  // osdmap change is needed); return false to forward the op to
  // prepare_pool_op() so it can mutate the pending incremental.
  op->mark_osdmon_event(__func__);
  MPoolOp *m = static_cast<MPoolOp*>(op->get_req());

  // Drop ops addressed to a different cluster fsid.
  if (m->fsid != mon->monmap->fsid) {
    dout(0) << __func__ << " drop message on fsid " << m->fsid
            << " != " << mon->monmap->fsid << " for " << *m << dendl;
    _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
    return true;
  }

  // Pool creation has its own capability / existence checks.
  if (m->op == POOL_OP_CREATE)
    return preprocess_pool_op_create(op);

  // Every remaining op targets an existing pool id; an unknown id is acked
  // with success (0) so that a retried op whose pool is already gone stays
  // idempotent rather than erroring.
  if (!osdmap.get_pg_pool(m->pool)) {
    dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
    _pool_op_reply(op, 0, osdmap.get_epoch());
    return true;
  }

  // check if the snap and snapname exist
  bool snap_exists = false;
  const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
  if (p->snap_exists(m->name.c_str()))
    snap_exists = true;

  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // Pool (named) snaps are mutually exclusive with unmanaged snaps, and
    // cache tiers cannot take pool snaps.
    if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (snap_exists) {
      // snap already exists: idempotent success, no map change needed
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // Unmanaged snaps are rejected on a pool already using named snaps.
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_SNAP:
    // Named-snap deletion is invalid on an unmanaged-snaps pool.
    if (p->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (!snap_exists) {
      // already gone: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (p->is_removed_snap(m->snapid)) {
      // snapid already recorded as removed: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE:
    // NOTE(review): this acks success while a pool with this *name* still
    // exists, and only falls through to the delete path when the name no
    // longer resolves — presumably an idempotency guard for retried ops
    // (the pool-id existence was already verified above); confirm the
    // intended semantics against the client's use of m->name here.
    if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_AUID_CHANGE:
    // always requires a map update; handled in prepare_pool_op()
    return false;
  default:
    // unknown op code is a programming error, not a client error
    ceph_abort();
    break;
  }

  return false;
}
10542
10543bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
10544{
10545 op->mark_osdmon_event(__func__);
10546 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
10547 MonSession *session = m->get_session();
10548 if (!session) {
10549 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
10550 return true;
10551 }
10552 if (!session->is_capable("osd", MON_CAP_W)) {
10553 dout(5) << "attempt to create new pool without sufficient auid privileges!"
10554 << "message: " << *m << std::endl
10555 << "caps: " << session->caps << dendl;
10556 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
10557 return true;
10558 }
10559
10560 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
10561 if (pool >= 0) {
10562 _pool_op_reply(op, 0, osdmap.get_epoch());
10563 return true;
10564 }
10565
10566 return false;
10567}
10568
/**
 * Apply a pool operation (snap create/delete, auid change, pool
 * create/delete) to the pending incremental map.
 *
 * CREATE and DELETE are dispatched to their dedicated handlers; all
 * other ops are validated against the committed pool first, then
 * re-validated against the projected (pending) pool before mutating it.
 *
 * Returns true when a proposal was queued (reply sent after the paxos
 * round), false when the op was answered immediately with no map change.
 */
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
  dout(10) << "prepare_pool_op " << *m << dendl;
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  // target pool must exist in the committed map
  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  // First pass: validate against the *committed* pool state and answer
  // idempotent or invalid requests without proposing anything.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // pool snaps on cache tiers are not allowed
    if (pool->is_tier()) {
      ret = -EINVAL;
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;
    } // else, fall through
  case POOL_OP_DELETE_SNAP:
    if (!pool->is_unmanaged_snaps_mode()) {
      bool snap_exists = pool->snap_exists(m->name.c_str());
      // creating an existing snap / deleting a missing snap is a no-op
      if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
        || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
	ret = 0;
      } else {
	break;
      }
    } else {
      // pool-style snaps are incompatible with unmanaged-snaps mode
      ret = -EINVAL;
    }
    _pool_op_reply(op, ret, osdmap.get_epoch());
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // we won't allow removal of an unmanaged snapshot from a pool
    // not in unmanaged snaps mode.
    if (!pool->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
      return false;
    }
    /* fall-thru */
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // but we will allow creating an unmanaged snapshot on any pool
    // as long as it is not in 'pool' snaps mode.
    if (pool->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return false;
    }
  }

  // projected pool info: start from the pending copy if one exists,
  // otherwise from the committed pool.
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // pool snaps vs unmanaged snaps are mutually exclusive; re-check
  // against the *projected* pool, whose mode may differ from the
  // committed one.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // Second pass: actually mutate the projected pool.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
	pp.remove_snap(s);
	changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      // snapid is an output parameter of add_unmanaged_snap; the new id
      // is returned to the client in the reply payload.
      uint64_t snapid;
      pp.add_unmanaged_snap(snapid);
      ::encode(snapid, reply_data);
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (!pp.is_removed_snap(m->snapid)) {
      pp.remove_unmanaged_snap(m->snapid);
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    if (pp.auid != m->auid) {
      pp.auid = m->auid;
      changed = true;
    }
    break;

  default:
    ceph_abort();
    break;
  }

  // only queue the modified pool into the incremental if something
  // actually changed
  if (changed) {
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
10712
10713bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
10714{
10715 op->mark_osdmon_event(__func__);
10716 int err = prepare_new_pool(op);
10717 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
10718 return true;
10719}
10720
10721int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
10722 ostream *ss)
10723{
10724 const string& poolstr = osdmap.get_pool_name(pool_id);
10725
10726 // If the Pool is in use by CephFS, refuse to delete it
10727 FSMap const &pending_fsmap = mon->mdsmon()->get_pending();
10728 if (pending_fsmap.pool_in_use(pool_id)) {
10729 *ss << "pool '" << poolstr << "' is in use by CephFS";
10730 return -EBUSY;
10731 }
10732
10733 if (pool.tier_of >= 0) {
10734 *ss << "pool '" << poolstr << "' is a tier of '"
10735 << osdmap.get_pool_name(pool.tier_of) << "'";
10736 return -EBUSY;
10737 }
10738 if (!pool.tiers.empty()) {
10739 *ss << "pool '" << poolstr << "' has tiers";
10740 for(auto tier : pool.tiers) {
10741 *ss << " " << osdmap.get_pool_name(tier);
10742 }
10743 return -EBUSY;
10744 }
10745
10746 if (!g_conf->mon_allow_pool_delete) {
10747 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
10748 return -EPERM;
10749 }
10750
10751 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
10752 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
10753 return -EPERM;
10754 }
10755
10756 *ss << "pool '" << poolstr << "' removed";
10757 return 0;
10758}
10759
/**
 * Check if it is safe to add a tier to a base pool
 *
 * @return
 *  True if the operation should proceed, false if we should abort here
 *  (abort doesn't necessarily mean error, could be idempotency)
 */
10767bool OSDMonitor::_check_become_tier(
10768 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
10769 const int64_t base_pool_id, const pg_pool_t *base_pool,
10770 int *err,
10771 ostream *ss) const
10772{
10773 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
10774 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
10775
10776 const FSMap &pending_fsmap = mon->mdsmon()->get_pending();
10777 if (pending_fsmap.pool_in_use(tier_pool_id)) {
10778 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
10779 *err = -EBUSY;
10780 return false;
10781 }
10782
10783 if (base_pool->tiers.count(tier_pool_id)) {
10784 assert(tier_pool->tier_of == base_pool_id);
10785 *err = 0;
10786 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
10787 << base_pool_name << "'";
10788 return false;
10789 }
10790
10791 if (base_pool->is_tier()) {
10792 *ss << "pool '" << base_pool_name << "' is already a tier of '"
10793 << osdmap.get_pool_name(base_pool->tier_of) << "', "
10794 << "multiple tiers are not yet supported.";
10795 *err = -EINVAL;
10796 return false;
10797 }
10798
10799 if (tier_pool->has_tiers()) {
10800 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
10801 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
10802 it != tier_pool->tiers.end(); ++it)
10803 *ss << "'" << osdmap.get_pool_name(*it) << "',";
10804 *ss << " multiple tiers are not yet supported.";
10805 *err = -EINVAL;
10806 return false;
10807 }
10808
10809 if (tier_pool->is_tier()) {
10810 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
10811 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
10812 *err = -EINVAL;
10813 return false;
10814 }
10815
10816 *err = 0;
10817 return true;
10818}
10819
10820
/**
 * Check if it is safe to remove a tier from this base pool
 *
 * @return
 *  True if the operation should proceed, false if we should abort here
 *  (abort doesn't necessarily mean error, could be idempotency)
 */
10828bool OSDMonitor::_check_remove_tier(
10829 const int64_t base_pool_id, const pg_pool_t *base_pool,
10830 const pg_pool_t *tier_pool,
10831 int *err, ostream *ss) const
10832{
10833 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
10834
10835 // Apply CephFS-specific checks
10836 const FSMap &pending_fsmap = mon->mdsmon()->get_pending();
10837 if (pending_fsmap.pool_in_use(base_pool_id)) {
10838 if (base_pool->type != pg_pool_t::TYPE_REPLICATED) {
10839 // If the underlying pool is erasure coded, we can't permit the
10840 // removal of the replicated tier that CephFS relies on to access it
10841 *ss << "pool '" << base_pool_name << "' is in use by CephFS via its tier";
10842 *err = -EBUSY;
10843 return false;
10844 }
10845
10846 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
10847 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
10848 "tier is still in use as a writeback cache. Change the cache "
10849 "mode and flush the cache before removing it";
10850 *err = -EBUSY;
10851 return false;
10852 }
10853 }
10854
10855 *err = 0;
10856 return true;
10857}
10858
10859int OSDMonitor::_prepare_remove_pool(
10860 int64_t pool, ostream *ss, bool no_fake)
10861{
224ce89b 10862 dout(10) << __func__ << " " << pool << dendl;
7c673cae
FG
10863 const pg_pool_t *p = osdmap.get_pg_pool(pool);
10864 int r = _check_remove_pool(pool, *p, ss);
10865 if (r < 0)
10866 return r;
10867
10868 auto new_pool = pending_inc.new_pools.find(pool);
10869 if (new_pool != pending_inc.new_pools.end()) {
10870 // if there is a problem with the pending info, wait and retry
10871 // this op.
10872 const auto& p = new_pool->second;
10873 int r = _check_remove_pool(pool, p, ss);
10874 if (r < 0)
10875 return -EAGAIN;
10876 }
10877
10878 if (pending_inc.old_pools.count(pool)) {
224ce89b 10879 dout(10) << __func__ << " " << pool << " already pending removal"
7c673cae
FG
10880 << dendl;
10881 return 0;
10882 }
10883
10884 if (g_conf->mon_fake_pool_delete && !no_fake) {
10885 string old_name = osdmap.get_pool_name(pool);
10886 string new_name = old_name + "." + stringify(pool) + ".DELETED";
10887 dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
10888 << old_name << " -> " << new_name << dendl;
10889 pending_inc.new_pool_names[pool] = new_name;
10890 return 0;
10891 }
10892
10893 // remove
10894 pending_inc.old_pools.insert(pool);
10895
224ce89b 10896 // remove any pg_temp mappings for this pool
7c673cae
FG
10897 for (auto p = osdmap.pg_temp->begin();
10898 p != osdmap.pg_temp->end();
10899 ++p) {
10900 if (p->first.pool() == (uint64_t)pool) {
224ce89b 10901 dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
7c673cae
FG
10902 << p->first << dendl;
10903 pending_inc.new_pg_temp[p->first].clear();
10904 }
10905 }
224ce89b 10906 // remove any primary_temp mappings for this pool
7c673cae
FG
10907 for (auto p = osdmap.primary_temp->begin();
10908 p != osdmap.primary_temp->end();
10909 ++p) {
10910 if (p->first.pool() == (uint64_t)pool) {
224ce89b 10911 dout(10) << __func__ << " " << pool
7c673cae
FG
10912 << " removing obsolete primary_temp" << p->first << dendl;
10913 pending_inc.new_primary_temp[p->first] = -1;
10914 }
10915 }
224ce89b
WB
10916 // remove any pg_upmap mappings for this pool
10917 for (auto& p : osdmap.pg_upmap) {
10918 if (p.first.pool() == (uint64_t)pool) {
10919 dout(10) << __func__ << " " << pool
10920 << " removing obsolete pg_upmap "
10921 << p.first << dendl;
10922 pending_inc.old_pg_upmap.insert(p.first);
10923 }
10924 }
10925 // remove any pg_upmap_items mappings for this pool
10926 for (auto& p : osdmap.pg_upmap_items) {
10927 if (p.first.pool() == (uint64_t)pool) {
10928 dout(10) << __func__ << " " << pool
10929 << " removing obsolete pg_upmap_items " << p.first
10930 << dendl;
10931 pending_inc.old_pg_upmap_items.insert(p.first);
10932 }
10933 }
7c673cae
FG
10934 return 0;
10935}
10936
10937int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
10938{
10939 dout(10) << "_prepare_rename_pool " << pool << dendl;
10940 if (pending_inc.old_pools.count(pool)) {
10941 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
10942 return -ENOENT;
10943 }
10944 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
10945 p != pending_inc.new_pool_names.end();
10946 ++p) {
10947 if (p->second == newname && p->first != pool) {
10948 return -EEXIST;
10949 }
10950 }
10951
10952 pending_inc.new_pool_names[pool] = newname;
10953 return 0;
10954}
10955
10956bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
10957{
10958 op->mark_osdmon_event(__func__);
10959 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
10960 ostringstream ss;
10961 int ret = _prepare_remove_pool(m->pool, &ss, false);
10962 if (ret == -EAGAIN) {
10963 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10964 return true;
10965 }
10966 if (ret < 0)
10967 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
10968 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
10969 pending_inc.epoch));
10970 return true;
10971}
10972
10973void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
10974 int ret, epoch_t epoch, bufferlist *blp)
10975{
10976 op->mark_osdmon_event(__func__);
10977 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
10978 dout(20) << "_pool_op_reply " << ret << dendl;
10979 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
10980 ret, epoch, get_last_committed(), blp);
10981 mon->send_reply(op, reply);
10982}