]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/OSDMonitor.cc
import 14.2.4 nautilus point release
[ceph.git] / ceph / src / mon / OSDMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19 #include <algorithm>
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
22 #include <locale>
23 #include <sstream>
24
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/ConfigKeyService.h"
31
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
34
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
38
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDFull.h"
43 #include "messages/MOSDMap.h"
44 #include "messages/MMonGetOSDMap.h"
45 #include "messages/MOSDBoot.h"
46 #include "messages/MOSDAlive.h"
47 #include "messages/MPoolOp.h"
48 #include "messages/MPoolOpReply.h"
49 #include "messages/MOSDPGCreate.h"
50 #include "messages/MOSDPGCreate2.h"
51 #include "messages/MOSDPGCreated.h"
52 #include "messages/MOSDPGTemp.h"
53 #include "messages/MOSDPGReadyToMerge.h"
54 #include "messages/MMonCommand.h"
55 #include "messages/MRemoveSnaps.h"
56 #include "messages/MOSDScrub.h"
57 #include "messages/MRoute.h"
58
59 #include "common/TextTable.h"
60 #include "common/Timer.h"
61 #include "common/ceph_argparse.h"
62 #include "common/perf_counters.h"
63 #include "common/strtol.h"
64 #include "common/numa.h"
65
66 #include "common/config.h"
67 #include "common/errno.h"
68
69 #include "erasure-code/ErasureCodePlugin.h"
70 #include "compressor/Compressor.h"
71 #include "common/Checksummer.h"
72
73 #include "include/compat.h"
74 #include "include/ceph_assert.h"
75 #include "include/stringify.h"
76 #include "include/util.h"
77 #include "common/cmdparse.h"
78 #include "include/str_list.h"
79 #include "include/str_map.h"
80 #include "include/scope_guard.h"
81
82 #include "auth/cephx/CephxKeyServer.h"
83 #include "osd/OSDCap.h"
84
85 #include "json_spirit/json_spirit_reader.h"
86
87 #include <boost/algorithm/string/predicate.hpp>
88
89 #define dout_subsys ceph_subsys_mon
90 static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
91 static const string OSD_METADATA_PREFIX("osd_metadata");
92 static const string OSD_SNAP_PREFIX("osd_snap");
93
94 namespace {
95
96 const uint32_t MAX_POOL_APPLICATIONS = 4;
97 const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
98 const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
99
100 bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
101 // Note: this doesn't include support for the application tag match
102 if ((grant.spec.allow & OSD_CAP_W) != 0) {
103 auto& match = grant.match;
104 if (match.is_match_all()) {
105 return true;
106 } else if (pool_name != nullptr &&
107 !match.pool_namespace.pool_name.empty() &&
108 match.pool_namespace.pool_name == *pool_name) {
109 return true;
110 }
111 }
112 return false;
113 }
114
// Decide whether 'entity_name' may issue unmanaged-snapshot pool ops.
// Two independent paths can grant permission:
//  1. the caller's mon caps allow the "osd pool op unmanaged-snap"
//     command -- restricted to the named pool when pool_name is non-null,
//     otherwise requiring an unrestricted cap (the pool does not exist);
//  2. the entity's OSD caps (fetched from the auth db) give it write
//     access to the pool, or to all pools, per is_osd_writable() above.
// Returns false if the OSD cap data cannot be located, decoded or parsed.
bool is_unmanaged_snap_op_permitted(CephContext* cct,
                                    const KeyServer& key_server,
                                    const EntityName& entity_name,
                                    const MonCap& mon_caps,
                                    const entity_addr_t& peer_socket_addr,
                                    const std::string* pool_name)
{
  typedef std::map<std::string, std::string> CommandArgs;

  // path 1: explicit mon-cap command match
  if (mon_caps.is_capable(
        cct, CEPH_ENTITY_TYPE_MON,
        entity_name, "osd",
        "osd pool op unmanaged-snap",
        (pool_name == nullptr ?
          CommandArgs{} /* pool DNE, require unrestricted cap */ :
          CommandArgs{{"poolname", *pool_name}}),
        false, true, false,
        peer_socket_addr)) {
    return true;
  }

  // path 2: inspect the entity's OSD caps from the auth database
  AuthCapsInfo caps_info;
  if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
                                   caps_info)) {
    dout(10) << "unable to locate OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  string caps_str;
  if (caps_info.caps.length() > 0) {
    auto p = caps_info.caps.cbegin();
    try {
      decode(caps_str, p);
    } catch (const buffer::error &err) {
      derr << "corrupt OSD cap data for " << entity_name << " in auth db"
           << dendl;
      return false;
    }
  }

  OSDCap osd_cap;
  if (!osd_cap.parse(caps_str, nullptr)) {
    dout(10) << "unable to parse OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  // if the entity has write permissions in one or all pools, permit
  // usage of unmanaged-snapshots
  if (osd_cap.allow_all()) {
    return true;
  }

  for (auto& grant : osd_cap.grants) {
    if (grant.profile.is_valid()) {
      // profile caps (e.g. "profile rbd") expand into a list of concrete
      // grants; any writable one suffices
      for (auto& profile_grant : grant.profile_grants) {
        if (is_osd_writable(profile_grant, pool_name)) {
          return true;
        }
      }
    } else if (is_osd_writable(grant, pool_name)) {
      return true;
    }
  }

  return false;
}
183
184 } // anonymous namespace
185
// Record that pg 'ps' of this pool reported 'last_epoch_clean', and keep
// the per-pool invariants up to date:
//  - epoch_by_pg[ps] only ever moves forward (stale reports are dropped)
//  - 'floor' tracks the minimum value stored in epoch_by_pg
//  - 'next_missing' is the first ps slot that has never reported (0)
void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean)
{
  if (epoch_by_pg.size() <= ps) {
    // grow lazily; 0 marks "no report yet"
    epoch_by_pg.resize(ps + 1, 0);
  }
  const auto old_lec = epoch_by_pg[ps];
  if (old_lec >= last_epoch_clean) {
    // stale lec
    return;
  }
  epoch_by_pg[ps] = last_epoch_clean;
  if (last_epoch_clean < floor) {
    floor = last_epoch_clean;
  } else if (last_epoch_clean > floor) {
    if (old_lec == floor) {
      // probably should increase floor?
      // the slot that defined the floor moved forward: rescan the whole
      // vector for the new minimum (unreported slots, still 0, participate)
      auto new_floor = std::min_element(std::begin(epoch_by_pg),
                                        std::end(epoch_by_pg));
      floor = *new_floor;
    }
  }
  if (ps != next_missing) {
    return;
  }
  // this report filled the first missing slot; advance next_missing past
  // the contiguous run of slots that have now reported
  for (; next_missing < epoch_by_pg.size(); next_missing++) {
    if (epoch_by_pg[next_missing] == 0) {
      break;
    }
  }
}
216
// Forget all last-epoch-clean tracking for a deleted pool.
void LastEpochClean::remove_pool(uint64_t pool)
{
  report_by_pool.erase(pool);
}
221
222 void LastEpochClean::report(const pg_t& pg, epoch_t last_epoch_clean)
223 {
224 auto& lec = report_by_pool[pg.pool()];
225 return lec.report(pg.ps(), last_epoch_clean);
226 }
227
228 epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
229 {
230 auto floor = latest.get_epoch();
231 for (auto& pool : latest.get_pools()) {
232 auto reported = report_by_pool.find(pool.first);
233 if (reported == report_by_pool.end()) {
234 return 0;
235 }
236 if (reported->second.next_missing < pool.second.get_pg_num()) {
237 return 0;
238 }
239 if (reported->second.floor < floor) {
240 floor = reported->second.floor;
241 }
242 }
243 return floor;
244 }
245
246
// Completion callback for an osdmap mapping job: when the mapping computed
// against 'epoch' finishes successfully (r >= 0), fold its results into
// the creating-pgs state and notify pg-create subscribers.  A negative r
// (job aborted) does nothing.
class C_UpdateCreatingPGs : public Context {
public:
  OSDMonitor *osdmon;
  utime_t start;  // when the job was queued; used for the duration log line
  epoch_t epoch;  // epoch the mapping was computed against
  C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
    osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
  void finish(int r) override {
    if (r >= 0) {
      utime_t end = ceph_clock_now();
      dout(10) << "osdmap epoch " << epoch << " mapping took "
               << (end - start) << " seconds" << dendl;
      osdmon->update_creating_pgs();
      osdmon->check_pg_creates_subs();
    }
  }
};
264
#undef dout_prefix
#define dout_prefix _prefix(_dout, mon, osdmap)
// Log-line prefix for this file: "mon.<name>@<rank>(<state>).osd e<epoch> ".
static ostream& _prefix(std::ostream *_dout, Monitor *mon, const OSDMap& osdmap) {
  return *_dout << "mon." << mon->name << "@" << mon->rank
                << "(" << mon->get_state_name()
                << ").osd e" << osdmap.get_epoch() << " ";
}
272
// Construct the OSD paxos service.  The incremental/full caches hold
// recently encoded maps (capacity mon_osd_cache_size) so they need not be
// re-read from the store; the parallel mapper shares the monitor's CPU
// thread pool.
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor *mn,
  Paxos *p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf()->mon_osd_cache_size),
   full_osd_cache(g_conf()->mon_osd_cache_size),
   has_osdmap_manifest(false),
   mapper(mn->cct, &mn->cpu_tp)
{}
285
// True when a CRUSH map change is already staged in the pending incremental.
bool OSDMonitor::_have_pending_crush()
{
  return pending_inc.crush.length() > 0;
}
290
// The CRUSH map of the last committed osdmap (ignores any pending change).
CrushWrapper &OSDMonitor::_get_stable_crush()
{
  return *osdmap.crush;
}
295
296 void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush)
297 {
298 bufferlist bl;
299 if (pending_inc.crush.length())
300 bl = pending_inc.crush;
301 else
302 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
303
304 auto p = bl.cbegin();
305 newcrush.decode(p);
306 }
307
// Build osdmap epoch 1 for a brand-new cluster and stage it in pending_inc
// as a full map.  Starts from an operator-provided "mkfs" osdmap in the
// store if present, otherwise a simple default map; then applies default
// flags, full-ratio settings and release requirements (subject to the
// mon_debug_no_require_* overrides).
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon->monmap->fsid << dendl;

  OSDMap newmap;

  bufferlist bl;
  mon->store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    // an initial map was injected at mkfs time; use it but force our fsid
    newmap.decode(bl);
    newmap.set_fsid(mon->monmap->fsid);
  } else {
    newmap.build_simple(cct, 0, mon->monmap->fsid, 0);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  newmap.flags |=
    CEPH_OSDMAP_RECOVERY_DELETES |
    CEPH_OSDMAP_PURGED_SNAPDIRS |
    CEPH_OSDMAP_PGLOG_HARDLIMIT;
  // ratios may be configured as percentages (> 1.0); normalize to fractions
  newmap.full_ratio = g_conf()->mon_osd_full_ratio;
  if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
  newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
  if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
  newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
  if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;

  // new cluster should require latest by default
  if (g_conf().get_val<bool>("mon_debug_no_require_nautilus")) {
    if (g_conf()->mon_debug_no_require_mimic) {
      derr << __func__ << " mon_debug_no_require_mimic=true and nautilus=true" << dendl;
      newmap.require_osd_release = CEPH_RELEASE_LUMINOUS;
    } else {
      derr << __func__ << " mon_debug_no_require_nautilus=true" << dendl;
      newmap.require_osd_release = CEPH_RELEASE_MIMIC;
    }
  } else {
    newmap.require_osd_release = CEPH_RELEASE_NAUTILUS;
    int r = ceph_release_from_name(
      g_conf()->mon_osd_initial_require_min_compat_client.c_str());
    if (r <= 0) {
      ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
    }
    newmap.require_min_compat_client = r;
  }

  // encode into pending incremental
  uint64_t features = newmap.get_encoding_features();
  newmap.encode(pending_inc.fullmap,
                features | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
366
367 void OSDMonitor::get_store_prefixes(std::set<string>& s) const
368 {
369 s.insert(service_name);
370 s.insert(OSD_PG_CREATING_PREFIX);
371 s.insert(OSD_METADATA_PREFIX);
372 s.insert(OSD_SNAP_PREFIX);
373 }
374
// Catch the in-memory osdmap up with what paxos has committed: locate and
// load the newest stashed full map if it is ahead of us, then apply each
// committed incremental in order, writing out full maps (and verifying
// CRCs against the leader's canonical encoding when provided).  Finishes
// by refreshing derived state: down->out tracking, map subscriptions,
// messenger feature requirements, and (on peons) a fresh mapping job.
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  // we really don't care if the version has been updated, because we may
  // have trimmed without having increased the last committed; yet, we may
  // need to update the in-memory manifest.
  load_osdmap_manifest();

  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  ceph_assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
           << ", my e " << osdmap.epoch << dendl;

  // a still-running mapping job was computed against the old epoch; cancel it
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
              << mapping_job.get() << " did not complete, "
              << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  load_health();

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'. This is only done when we are building the
   * full maps from the incremental versions. But don't panic! We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
             << " [" << fc << ", " << lc << "]" << dendl;

    // scan newest-to-oldest for the first stored full_<v> key
    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon->store->exists(get_service_name(), full_key)) {
        dout(10) << __func__ << " found latest full map v " << v << dendl;
        latest_full = v;
        break;
      }
    }

    ceph_assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon->store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
             << latest_full << dendl;
  }

  // jump straight to the newest stashed full map if it is ahead of us
  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    ceph_assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap = OSDMap();
    osdmap.decode(latest_bl);
  }

  bufferlist bl;
  if (!mon->store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
    auto p = bl.cbegin();
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    creating_pgs.decode(p);
    dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
            << creating_pgs.last_scan_epoch
            << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
  } else {
    dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
            << dendl;
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    ceph_assert(err == 0);
    ceph_assert(inc_bl.length());

    dout(7) << "update_from_paxos  applying incremental " << osdmap.epoch+1
            << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    ceph_assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs.  Encode the full
    // map with the same features as the incremental.  If we don't
    // know, use the quorum features.  If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon->get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      ceph_assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
        // This will happen if the mons were running mixed versions in
        // the past or some other circumstance made the full encoded
        // maps divergent.  Reloading here will bring us back into
        // sync with the primary for this and all future maps.  OSDs
        // will also be brought back into sync when they discover the
        // crc mismatch and request a full map from a mon.
        derr << __func__ << " full map CRC mismatch, resetting to canonical"
             << dendl;

        // note: each dout(20)..dendl pair below opens its own scope, so the
        // two JSONFormatter locals do not collide
        dout(20) << __func__ << " my (bad) full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        full_bl.hexdump(*_dout);
        *_dout << dendl;

        osdmap = OSDMap();
        osdmap.decode(orig_full_bl);

        dout(20) << __func__ << " canonical full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        orig_full_bl.hexdump(*_dout);
        *_dout << dendl;
      }
    } else {
      // no canonical copy from the primary: persist the one we just encoded
      ceph_assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    if (osdmap.epoch == 1) {
      // the mkfs seed map is no longer needed once epoch 1 is committed
      t->erase("mkfs", "osdmap");
    }

    // flush periodically so a long catch-up doesn't build one huge txn
    if (tx_size > g_conf()->mon_sync_max_payload_size*2) {
      mon->store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    for (const auto &osd_state : inc.new_state) {
      if (osd_state.second & CEPH_OSD_UP) {
        // could be marked up *or* down, but we're too lazy to check which
        last_osd_report.erase(osd_state.first);
      }
      if (osd_state.second & CEPH_OSD_EXISTS) {
        // could be created *or* destroyed, but we can safely drop it
        osd_epochs.erase(osd_state.first);
      }
    }
  }

  if (t) {
    mon->store->apply_transaction(t);
  }

  // reconcile the down->out pending map with the new osdmap state
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
        dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
        down_pending_out[o] = ceph_clock_now();
      }
    } else {
      if (found != down_pending_out.end()) {
        dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
        down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();

  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon->is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
}
602
// Kick off an asynchronous job that computes pg -> osd mappings for the
// current osdmap (its results feed pg_temp priming and creating-pg
// updates).  Any previous job is aborted first; a cluster with no pools
// skips the job entirely.
void OSDMonitor::start_mapping()
{
  // initiate mapping job
  if (mapping_job) {
    dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
             << dendl;
    mapping_job->abort();
  }
  if (!osdmap.get_pools().empty()) {
    // completion callback updates creating pgs against this same epoch
    auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
    mapping_job = mapping.start_update(osdmap, mapper,
                                       g_conf()->mon_osd_mapping_pgs_per_chunk);
    dout(10) << __func__ << " started mapping job " << mapping_job.get()
             << " at " << fin->start << dendl;
    mapping_job->set_finish_event(fin);
  } else {
    dout(10) << __func__ << " no pools, no mapping job" << dendl;
    mapping_job = nullptr;
  }
}
623
624 void OSDMonitor::update_msgr_features()
625 {
626 set<int> types;
627 types.insert((int)entity_name_t::TYPE_OSD);
628 types.insert((int)entity_name_t::TYPE_CLIENT);
629 types.insert((int)entity_name_t::TYPE_MDS);
630 types.insert((int)entity_name_t::TYPE_MON);
631 for (set<int>::iterator q = types.begin(); q != types.end(); ++q) {
632 uint64_t mask;
633 uint64_t features = osdmap.get_features(*q, &mask);
634 if ((mon->messenger->get_policy(*q).features_required & mask) != features) {
635 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
636 ceph::net::Policy p = mon->messenger->get_policy(*q);
637 p.features_required = (p.features_required & ~mask) | features;
638 mon->messenger->set_policy(*q, p);
639 }
640 }
641 }
642
// Called when this paxos service becomes active.  The leader logs the map
// and performs the one-time pool priority conversion; a peon re-dispatches
// any failure reports that were queued while inactive.  Both roles then
// (re)start the pg mapping job.
void OSDMonitor::on_active()
{
  update_logger();

  if (mon->is_leader()) {
    mon->clog->debug() << "osdmap " << osdmap;
    if (!priority_convert) {
      // Only do this once at start-up
      convert_pool_priorities();
      priority_convert = true;
    }
  } else {
    // peon: replay queued failure ops through dispatch now that we're active
    list<MonOpRequestRef> ls;
    take_all_failures(ls);
    while (!ls.empty()) {
      MonOpRequestRef op = ls.front();
      op->mark_osdmon_event(__func__);
      dispatch(op);
      ls.pop_front();
    }
  }
  start_mapping();
}
666
// Called on service restart: drop the cached per-osd report times so
// they start fresh (presumably to avoid acting on stale timestamps from
// the previous term -- TODO confirm against last_osd_report's consumers).
void OSDMonitor::on_restart()
{
  last_osd_report.clear();
}
671
// Shut the service down: abort any in-flight mapping job and discard all
// queued failure reports and their waiters.
void OSDMonitor::on_shutdown()
{
  dout(10) << __func__ << dendl;
  if (mapping_job) {
    dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
             << dendl;
    mapping_job->abort();
  }

  // discard failure info, waiters
  list<MonOpRequestRef> ls;
  take_all_failures(ls);
  ls.clear();
}
686
// Refresh the cluster-level perf counters (osd counts and epoch) from the
// current osdmap.
void OSDMonitor::update_logger()
{
  dout(10) << "update_logger" << dendl;

  mon->cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
  mon->cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
  mon->cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
  mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
}
696
// Initialize pending_inc as the next epoch's empty incremental, then stage
// two kinds of repair work to land with the next map:
//  - re-seed any full/backfillfull/nearfull ratio that is unset (<= 0)
//    from the configured defaults, normalizing percent values > 1.0;
//  - if the CRUSH map still uses legacy "ruleset" IDs, resolve each pool's
//    ruleset to a concrete rule and renumber all rules to match their
//    array positions.
void OSDMonitor::create_pending()
{
  pending_inc = OSDMap::Incremental(osdmap.epoch+1);
  pending_inc.fsid = mon->monmap->fsid;
  pending_metadata.clear();
  pending_metadata_rm.clear();

  dout(10) << "create_pending e " << pending_inc.epoch << dendl;

  // safety checks (this shouldn't really happen)
  {
    if (osdmap.backfillfull_ratio <= 0) {
      pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
      if (pending_inc.new_backfillfull_ratio > 1.0)
        pending_inc.new_backfillfull_ratio /= 100;  // configured as a percent
      dout(1) << __func__ << " setting backfillfull_ratio = "
              << pending_inc.new_backfillfull_ratio << dendl;
    }
    if (osdmap.full_ratio <= 0) {
      pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio;
      if (pending_inc.new_full_ratio > 1.0)
        pending_inc.new_full_ratio /= 100;
      dout(1) << __func__ << " setting full_ratio = "
              << pending_inc.new_full_ratio << dendl;
    }
    if (osdmap.nearfull_ratio <= 0) {
      pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
      if (pending_inc.new_nearfull_ratio > 1.0)
        pending_inc.new_nearfull_ratio /= 100;
      dout(1) << __func__ << " setting nearfull_ratio = "
              << pending_inc.new_nearfull_ratio << dendl;
    }
  }

  // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
  // structure.
  if (osdmap.crush->has_legacy_rule_ids()) {
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    // First, for all pools, work out which rule they really used
    // by resolving ruleset to rule.
    for (const auto &i : osdmap.get_pools()) {
      const auto pool_id = i.first;
      const auto &pool = i.second;
      int new_rule_id = newcrush.find_rule(pool.crush_rule,
                                           pool.type, pool.size);

      dout(1) << __func__ << " rewriting pool "
              << osdmap.get_pool_name(pool_id) << " crush ruleset "
              << pool.crush_rule << " -> rule id " << new_rule_id << dendl;
      if (pending_inc.new_pools.count(pool_id) == 0) {
        // start from the committed pool so only crush_rule changes
        pending_inc.new_pools[pool_id] = pool;
      }
      pending_inc.new_pools[pool_id].crush_rule = new_rule_id;
    }

    // Now, go ahead and renumber all the rules so that their
    // rule_id field corresponds to their position in the array
    auto old_to_new = newcrush.renumber_rules();
    dout(1) << __func__ << " Rewrote " << old_to_new << " crush IDs:" << dendl;
    for (const auto &i : old_to_new) {
      dout(1) << __func__ << " " << i.first << " -> " << i.second << dendl;
    }
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
  }
}
765
// Produce the next creating_pgs state for the pending incremental 'inc'
// (with 'nextmap' being the osdmap after 'inc' is applied):
//  1. scan for newly created pools and drop pools that were deleted,
//  2. retire pgs that OSDs have reported as created,
//  3. filter out pgs that do not exist in nextmap,
//  4. drain the per-pool creation queue into concrete pg entries, bounded
//     by mon_osd_max_creating_pgs.
// Works on a copy of creating_pgs taken under creating_pgs_lock; the
// caller commits the returned value.
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
                               const OSDMap& nextmap)
{
  dout(10) << __func__ << dendl;
  creating_pgs_t pending_creatings;
  {
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    unsigned queued = 0;
    // scan both the committed pools and the pools this incremental adds
    queued += scan_for_creating_pgs(osdmap.get_pools(),
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    queued += scan_for_creating_pgs(inc.new_pools,
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
               << " pg removed because containing pool deleted: "
               << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
             << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // filter out any pgs that shouldn't exist.
  {
    auto i = pending_creatings.pgs.begin();
    while (i != pending_creatings.pgs.end()) {
      if (!nextmap.pg_exists(i->first)) {
        dout(10) << __func__ << " removing pg " << i->first
                 << " which should not exist" << dendl;
        i = pending_creatings.pgs.erase(i);
      } else {
        ++i;
      }
    }
  }

  // process queue: turn queued [start, end) ps ranges into pg entries,
  // keeping the number of in-flight creating pgs under 'max'
  unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  while (pending_creatings.pgs.size() < max &&
         !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
             << " created " << p->second.created
             << " modified " << p->second.modified
             << " [" << p->second.start << "-" << p->second.end << ")"
             << dendl;
    int64_t n = std::min<int64_t>(max - pending_creatings.pgs.size(),
                                  p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(pgid, make_pair(inc.epoch,
                                                    p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
               << " now [" << p->second.start << "-" << p->second.end << ")"
               << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
           << " pools" << dendl;
  dout(10) << __func__
           << " " << (pending_creatings.pgs.size() - total)
           << "/" << pending_creatings.pgs.size()
           << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
867
// Pre-populate pg_temp entries for pgs whose mapping will change in the
// pending map.  Depending on how widespread the change is, either remap
// everything ("all": new crush map, newly-up osds, weight increases, or a
// large estimated pg count) via the parallel mapper with a time budget,
// or walk only the pgs on the specifically affected osds, checking the
// mon_osd_prime_pg_temp_max_time budget every 1000 pgs.
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  if (pending_inc.crush.length()) {
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  // osds flapping in new_state while currently up are "interesting"
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    if ((p->second & CEPH_OSD_UP) &&
        osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  for (map<int32_t,uint32_t>::iterator p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
               << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;

  if (!all) {
    // rough estimate: assume every interesting osd hosts about as many
    // pgs as the first one; fall back to "all" if that is too large
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
        g_conf()->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds >= "
               << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total "
               << mapping.get_num_pgs() << " pgs, all"
               << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds" << dendl;
    }
  }

  // build the post-incremental map the priming decisions are made against
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    // remap every pg in parallel, but give up after the configured time
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk, {});
    if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
               << g_conf()->mon_osd_prime_pg_temp_max_time
               << ", stopping" << dendl;
      job.abort();
    }
  } else {
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf()->mon_osd_prime_pg_temp_max_time;
    const int chunk = 1000;  // check the clock only every 'chunk' pgs
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
        if (!did_pgs.insert(pgid).second) {
          continue;  // already considered via another osd
        }
        prime_pg_temp(next, pgid);
        if (--n <= 0) {
          n = chunk;
          if (ceph_clock_now() > stop) {
            dout(10) << __func__ << " consumed more than "
                     << g_conf()->mon_osd_prime_pg_temp_max_time
                     << " seconds, stopping"
                     << dendl;
            return;
          }
        }
      }
    }
  }
}
970
// Consider one pg for pg_temp priming against the pending map 'next': if
// its acting set is about to change, stage the *current* acting set as a
// pg_temp entry in pending_inc so the pg stays serviceable while peering
// and backfill catch up.  An empty vector clears the pg_temp entry when
// next_up == next_acting (no temp mapping needed).
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  // TODO: remove this creating_pgs direct access?
  if (creating_pgs.pgs.count(pgid)) {
    return;  // still being created; nothing to preserve
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
                            &next_acting, &next_acting_primary);
  if (acting == next_acting &&
      !(up != acting && next_up == next_acting))
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  if (next_up == next_acting) {
    acting.clear();
    dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
             << dendl;
  }

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
           << " -> " << next_up << "/" << next_acting
           << ", priming " << acting
           << dendl;
  {
    std::lock_guard l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
1018
/**
 * Finalize pending_inc and write the next osdmap epoch — the incremental,
 * the full map, and the associated creating-pgs / snap / metadata / health
 * keys — into the given transaction.
 *
 * @note receiving a transaction in this function gives a fair amount of
 * freedom to the service implementation if it does need it. It shouldn't.
 */
void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
{
  dout(10) << "encode_pending e " << pending_inc.epoch
           << dendl;

  // Possibly prune old full osdmaps from the store, in the same
  // transaction as this epoch.
  if (do_prune(t)) {
    dout(1) << __func__ << " osdmap full prune encoded e"
            << pending_inc.epoch << dendl;
  }

  // finalize up pending_inc
  pending_inc.modified = ceph_clock_now();

  // mirror snap removals from base pools onto their cache tiers
  int r = pending_inc.propagate_snaps_to_tiers(cct, osdmap);
  ceph_assert(r == 0);

  // Prime pg_temp entries, but only when the background mapping job both
  // completed and matches the osdmap epoch we are building on.
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
              << mapping_job.get() << " did not complete, "
              << mapping_job->shards << " left" << dendl;
      mapping_job->abort();
    } else if (mapping.get_epoch() < osdmap.get_epoch()) {
      dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
              << mapping_job.get() << " is prior epoch "
              << mapping.get_epoch() << dendl;
    } else {
      if (g_conf()->mon_osd_prime_pg_temp) {
        maybe_prime_pg_temp();
      }
    }
  } else if (g_conf()->mon_osd_prime_pg_temp) {
    dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
            << dendl;
  }
  mapping_job.reset();

  // ensure we don't have blank new_state updates. these are interpreted as
  // CEPH_OSD_UP (and almost certainly not what we want!).
  auto p = pending_inc.new_state.begin();
  while (p != pending_inc.new_state.end()) {
    if (p->second == 0) {
      dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
      p = pending_inc.new_state.erase(p);
    } else {
      if (p->second & CEPH_OSD_UP) {
        pending_inc.new_last_up_change = pending_inc.modified;
      }
      ++p;
    }
  }
  // record timestamps of the most recent up/in state transitions
  if (!pending_inc.new_up_client.empty()) {
    pending_inc.new_last_up_change = pending_inc.modified;
  }
  for (auto& i : pending_inc.new_weight) {
    if (i.first > osdmap.max_osd) {
      if (i.second) {
        // new osd is already marked in
        pending_inc.new_last_in_change = pending_inc.modified;
      }
    } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
      // existing osd marked in or out
      pending_inc.new_last_in_change = pending_inc.modified;
    }
  }

  {
    // Build a scratch copy of what the next map will look like, so we can
    // sanity-check and adjust pending_inc against it.
    OSDMap tmp;
    tmp.deepish_copy_from(osdmap);
    tmp.apply_incremental(pending_inc);

    // clean pg_temp mappings
    OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);

    // clean inappropriate pg_upmap/pg_upmap_items (if any)
    {
      // check every upmapped pg for now
      // until we could reliably identify certain cases to ignore,
      // which is obviously the hard part TBD..
      vector<pg_t> pgs_to_check;
      tmp.get_upmap_pgs(&pgs_to_check);
      if (pgs_to_check.size() < g_conf()->mon_clean_pg_upmaps_per_chunk * 2) {
        // not enough pgs, do it inline
        tmp.clean_pg_upmaps(cct, &pending_inc);
      } else {
        // farm the check out to the parallel mapper in chunks
        CleanUpmapJob job(cct, tmp, pending_inc);
        mapper.queue(&job, g_conf()->mon_clean_pg_upmaps_per_chunk, pgs_to_check);
        job.wait();
      }
    }

    // update creating pgs first so that we can remove the created pgid and
    // process the pool flag removal below in the same osdmap epoch.
    auto pending_creatings = update_pending_pgs(pending_inc, tmp);
    bufferlist creatings_bl;
    encode(pending_creatings, creatings_bl);
    t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);

    // remove any old (or incompat) POOL_CREATING flags
    for (auto& i : tmp.get_pools()) {
      if (tmp.require_osd_release < CEPH_RELEASE_NAUTILUS) {
        // pre-nautilus OSDMaps shouldn't get this flag.
        if (pending_inc.new_pools.count(i.first)) {
          pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
        }
      }
      if (i.second.has_flag(pg_pool_t::FLAG_CREATING) &&
          !pending_creatings.still_creating_pool(i.first)) {
        dout(10) << __func__ << " done creating pool " << i.first
                 << ", clearing CREATING flag" << dendl;
        if (pending_inc.new_pools.count(i.first) == 0) {
          // load original pool info before editing flags
          pending_inc.new_pools[i.first] = i.second;
        }
        pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
      }
    }

    // remove any legacy osdmap nearfull/full flags
    {
      if (tmp.test_flag(CEPH_OSDMAP_FULL | CEPH_OSDMAP_NEARFULL)) {
        dout(10) << __func__ << " clearing legacy osdmap nearfull/full flag"
                 << dendl;
        remove_flag(CEPH_OSDMAP_NEARFULL);
        remove_flag(CEPH_OSDMAP_FULL);
      }
    }
    // collect which pools are currently affected by
    // the near/backfill/full osd(s),
    // and set per-pool near/backfill/full flag instead
    set<int64_t> full_pool_ids;
    set<int64_t> backfillfull_pool_ids;
    set<int64_t> nearfull_pool_ids;
    tmp.get_full_pools(cct,
                       &full_pool_ids,
                       &backfillfull_pool_ids,
                       &nearfull_pool_ids);
    if (full_pool_ids.empty() ||
        backfillfull_pool_ids.empty() ||
        nearfull_pool_ids.empty()) {
      // normal case - no nearfull, backfillfull or full osds
      // try cancel any improper nearfull/backfillfull/full pool
      // flags first
      for (auto &pool: tmp.get_pools()) {
        auto p = pool.first;
        if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
            nearfull_pool_ids.empty()) {
          dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
                   << "'s nearfull flag" << dendl;
          if (pending_inc.new_pools.count(p) == 0) {
            // load original pool info first!
            pending_inc.new_pools[p] = pool.second;
          }
          pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
        }
        if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
            backfillfull_pool_ids.empty()) {
          dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
                   << "'s backfillfull flag" << dendl;
          if (pending_inc.new_pools.count(p) == 0) {
            pending_inc.new_pools[p] = pool.second;
          }
          pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
        }
        if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
            full_pool_ids.empty()) {
          if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
            // set by EQUOTA, skipping
            continue;
          }
          dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
                   << "'s full flag" << dendl;
          if (pending_inc.new_pools.count(p) == 0) {
            pending_inc.new_pools[p] = pool.second;
          }
          pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
        }
      }
    }
    if (!full_pool_ids.empty()) {
      dout(10) << __func__ << " marking pool(s) " << full_pool_ids
               << " as full" << dendl;
      for (auto &p: full_pool_ids) {
        if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
          continue;
        }
        if (pending_inc.new_pools.count(p) == 0) {
          pending_inc.new_pools[p] = tmp.pools[p];
        }
        // FULL supersedes BACKFILLFULL/NEARFULL
        pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
        pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
        pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
      }
      // cancel FLAG_FULL for pools which are no longer full too
      for (auto &pool: tmp.get_pools()) {
        auto p = pool.first;
        if (full_pool_ids.count(p)) {
          // skip pools we have just marked as full above
          continue;
        }
        if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
            tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
          // don't touch if currently is not full
          // or is running out of quota (and hence considered as full)
          continue;
        }
        dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
                 << "'s full flag" << dendl;
        if (pending_inc.new_pools.count(p) == 0) {
          pending_inc.new_pools[p] = pool.second;
        }
        pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
      }
    }
    if (!backfillfull_pool_ids.empty()) {
      for (auto &p: backfillfull_pool_ids) {
        if (full_pool_ids.count(p)) {
          // skip pools we have already considered as full above
          continue;
        }
        if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
          // make sure FLAG_FULL is truly set, so we are safe not
          // to set a extra (redundant) FLAG_BACKFILLFULL flag
          ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
          continue;
        }
        if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
          // don't bother if pool is already marked as backfillfull
          continue;
        }
        dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
                 << "'s as backfillfull" << dendl;
        if (pending_inc.new_pools.count(p) == 0) {
          pending_inc.new_pools[p] = tmp.pools[p];
        }
        pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
        pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
      }
      // cancel FLAG_BACKFILLFULL for pools
      // which are no longer backfillfull too
      for (auto &pool: tmp.get_pools()) {
        auto p = pool.first;
        if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
          // skip pools we have just marked as backfillfull/full above
          continue;
        }
        if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
          // and don't touch if currently is not backfillfull
          continue;
        }
        dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
                 << "'s backfillfull flag" << dendl;
        if (pending_inc.new_pools.count(p) == 0) {
          pending_inc.new_pools[p] = pool.second;
        }
        pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
      }
    }
    if (!nearfull_pool_ids.empty()) {
      for (auto &p: nearfull_pool_ids) {
        if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
          continue;
        }
        if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
          // make sure FLAG_FULL is truly set, so we are safe not
          // to set a extra (redundant) FLAG_NEARFULL flag
          ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
          continue;
        }
        if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
          // don't bother if pool is already marked as nearfull
          continue;
        }
        dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
                 << "'s as nearfull" << dendl;
        if (pending_inc.new_pools.count(p) == 0) {
          pending_inc.new_pools[p] = tmp.pools[p];
        }
        pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
      }
      // cancel FLAG_NEARFULL for pools
      // which are no longer nearfull too
      for (auto &pool: tmp.get_pools()) {
        auto p = pool.first;
        if (full_pool_ids.count(p) ||
            backfillfull_pool_ids.count(p) ||
            nearfull_pool_ids.count(p)) {
          // skip pools we have just marked as
          // nearfull/backfillfull/full above
          continue;
        }
        if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
          // and don't touch if currently is not nearfull
          continue;
        }
        dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
                 << "'s nearfull flag" << dendl;
        if (pending_inc.new_pools.count(p) == 0) {
          pending_inc.new_pools[p] = pool.second;
        }
        pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
      }
    }

    // min_compat_client?
    if (tmp.require_min_compat_client == 0) {
      auto mv = tmp.get_min_compat_client();
      dout(1) << __func__ << " setting require_min_compat_client to currently "
              << "required " << ceph_release_name(mv) << dendl;
      mon->clog->info() << "setting require_min_compat_client to currently "
                        << "required " << ceph_release_name(mv);
      pending_inc.new_require_min_compat_client = mv;
    }

    // upgrade to mimic?
    if (osdmap.require_osd_release < CEPH_RELEASE_MIMIC &&
        tmp.require_osd_release >= CEPH_RELEASE_MIMIC) {
      dout(10) << __func__ << " first mimic+ epoch" << dendl;
      // record this epoch as the deletion for all legacy removed_snaps
      for (auto& p : tmp.get_pools()) {
        // update every pool
        if (pending_inc.new_pools.count(p.first) == 0) {
          pending_inc.new_pools[p.first] = p.second;
        }
        auto& pi = pending_inc.new_pools[p.first];
        if (pi.snap_seq == 0) {
          // no snaps on this pool
          continue;
        }
        // backfill the snap-mode flag for pools that predate it
        if ((pi.flags & (pg_pool_t::FLAG_SELFMANAGED_SNAPS |
                         pg_pool_t::FLAG_POOL_SNAPS)) == 0) {
          if (!pi.removed_snaps.empty()) {
            pi.flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
          } else {
            pi.flags |= pg_pool_t::FLAG_POOL_SNAPS;
          }
        }

        // Make all previously removed snaps appear to be removed in this
        // epoch.  this populates removed_snaps_queue.  The OSD will subtract
        // off its purged_snaps, as before, and this set will shrink over the
        // following epochs as the purged snaps are reported back through the
        // mgr.
        OSDMap::snap_interval_set_t removed;
        if (!p.second.removed_snaps.empty()) {
          // different flavor of interval_set :(
          for (auto q = p.second.removed_snaps.begin();
               q != p.second.removed_snaps.end();
               ++q) {
            removed.insert(q.get_start(), q.get_len());
          }
        } else {
          // pool-snap mode: everything <= snap_seq that is not a live
          // snapshot has been removed
          for (snapid_t s = 1; s <= pi.get_snap_seq(); s = s + 1) {
            if (pi.snaps.count(s) == 0) {
              removed.insert(s);
            }
          }
        }
        pending_inc.new_removed_snaps[p.first].union_of(removed);

        dout(10) << __func__ << " converting pool " << p.first
                 << " with " << p.second.removed_snaps.size()
                 << " legacy removed_snaps" << dendl;
        string k = make_snap_epoch_key(p.first, pending_inc.epoch);
        bufferlist v;
        encode(p.second.removed_snaps, v);
        t->put(OSD_SNAP_PREFIX, k, v);
        for (auto q = p.second.removed_snaps.begin();
             q != p.second.removed_snaps.end();
             ++q) {
          bufferlist v;
          string k = make_snap_key_value(p.first, q.get_start(),
                                         q.get_len(), pending_inc.epoch, &v);
          t->put(OSD_SNAP_PREFIX, k, v);
        }
      }
    }
    // upgrade to nautilus?
    if (osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS &&
        tmp.require_osd_release >= CEPH_RELEASE_NAUTILUS) {
      dout(10) << __func__ << " first nautilus+ epoch" << dendl;
      // add creating flags?
      for (auto& i : tmp.get_pools()) {
        if (pending_creatings.still_creating_pool(i.first)) {
          dout(10) << __func__ << " adding CREATING flag to pool " << i.first
                   << dendl;
          if (pending_inc.new_pools.count(i.first) == 0) {
            pending_inc.new_pools[i.first] = i.second;
          }
          pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING;
        }
      }
      // adjust blacklist items to all be TYPE_ANY
      for (auto& i : tmp.blacklist) {
        auto a = i.first;
        a.set_type(entity_addr_t::TYPE_ANY);
        pending_inc.new_blacklist[a] = i.second;
        pending_inc.old_blacklist.push_back(i.first);
      }
    }
  }

  // tell me about it
  for (auto i = pending_inc.new_state.begin();
       i != pending_inc.new_state.end();
       ++i) {
    // an absent state mask means CEPH_OSD_UP (i.e. the osd went down)
    int s = i->second ? i->second : CEPH_OSD_UP;
    if (s & CEPH_OSD_UP)
      dout(2) << " osd." << i->first << " DOWN" << dendl;
    if (s & CEPH_OSD_EXISTS)
      dout(2) << " osd." << i->first << " DNE" << dendl;
  }
  for (auto i = pending_inc.new_up_client.begin();
       i != pending_inc.new_up_client.end();
       ++i) {
    //FIXME: insert cluster addresses too
    dout(2) << " osd." << i->first << " UP " << i->second << dendl;
  }
  for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
       i != pending_inc.new_weight.end();
       ++i) {
    if (i->second == CEPH_OSD_OUT) {
      dout(2) << " osd." << i->first << " OUT" << dendl;
    } else if (i->second == CEPH_OSD_IN) {
      dout(2) << " osd." << i->first << " IN" << dendl;
    } else {
      dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
    }
  }

  // features for osdmap and its incremental
  uint64_t features;

  // encode full map and determine its crc
  OSDMap tmp;
  {
    tmp.deepish_copy_from(osdmap);
    tmp.apply_incremental(pending_inc);

    // determine appropriate features
    features = tmp.get_encoding_features();
    dout(10) << __func__ << " encoding full map with "
             << ceph_release_name(tmp.require_osd_release)
             << " features " << features << dendl;

    // the features should be a subset of the mon quorum's features!
    ceph_assert((features & ~mon->get_quorum_con_features()) == 0);

    bufferlist fullbl;
    encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
    pending_inc.full_crc = tmp.get_crc();

    // include full map in the txn.  note that old monitors will
    // overwrite this.  new ones will now skip the local full map
    // encode and reload from this.
    put_version_full(t, pending_inc.epoch, fullbl);
  }

  // encode
  ceph_assert(get_last_committed() + 1 == pending_inc.epoch);
  bufferlist bl;
  encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);

  dout(20) << " full_crc " << tmp.get_crc()
           << " inc_crc " << pending_inc.inc_crc << dendl;

  /* put everything in the transaction */
  put_version(t, pending_inc.epoch, bl);
  put_last_committed(t, pending_inc.epoch);

  // metadata, too!
  for (map<int,bufferlist>::iterator p = pending_metadata.begin();
       p != pending_metadata.end();
       ++p)
    t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
  for (set<int>::iterator p = pending_metadata_rm.begin();
       p != pending_metadata_rm.end();
       ++p)
    t->erase(OSD_METADATA_PREFIX, stringify(*p));
  pending_metadata.clear();
  pending_metadata_rm.clear();

  // removed_snaps
  if (tmp.require_osd_release >= CEPH_RELEASE_MIMIC) {
    for (auto& i : pending_inc.new_removed_snaps) {
      {
        // all snaps removed this epoch
        string k = make_snap_epoch_key(i.first, pending_inc.epoch);
        bufferlist v;
        encode(i.second, v);
        t->put(OSD_SNAP_PREFIX, k, v);
      }
      // one key per removed interval
      for (auto q = i.second.begin();
           q != i.second.end();
           ++q) {
        bufferlist v;
        string k = make_snap_key_value(i.first, q.get_start(),
                                       q.get_len(), pending_inc.epoch, &v);
        t->put(OSD_SNAP_PREFIX, k, v);
      }
    }
    for (auto& i : pending_inc.new_purged_snaps) {
      // one key per purged interval
      for (auto q = i.second.begin();
           q != i.second.end();
           ++q) {
        bufferlist v;
        string k = make_snap_purged_key_value(i.first, q.get_start(),
                                              q.get_len(), pending_inc.epoch,
                                              &v);
        t->put(OSD_SNAP_PREFIX, k, v);
      }
    }
  }

  // health
  health_check_map_t next;
  tmp.check_health(&next);
  encode_health(next, t);
}
1540
1541 int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
1542 {
1543 bufferlist bl;
1544 int r = mon->store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
1545 if (r < 0)
1546 return r;
1547 try {
1548 auto p = bl.cbegin();
1549 decode(m, p);
1550 }
1551 catch (buffer::error& e) {
1552 if (err)
1553 *err << "osd." << osd << " metadata is corrupt";
1554 return -EIO;
1555 }
1556 return 0;
1557 }
1558
1559 void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
1560 {
1561 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
1562 if (osdmap.is_up(osd)) {
1563 map<string,string> meta;
1564 load_metadata(osd, meta, nullptr);
1565 auto p = meta.find(field);
1566 if (p == meta.end()) {
1567 (*out)["unknown"]++;
1568 } else {
1569 (*out)[p->second]++;
1570 }
1571 }
1572 }
1573 }
1574
1575 void OSDMonitor::count_metadata(const string& field, Formatter *f)
1576 {
1577 map<string,int> by_val;
1578 count_metadata(field, &by_val);
1579 f->open_object_section(field.c_str());
1580 for (auto& p : by_val) {
1581 f->dump_int(p.first.c_str(), p.second);
1582 }
1583 f->close_section();
1584 }
1585
1586 int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
1587 {
1588 map<string, string> metadata;
1589 int r = load_metadata(osd, metadata, nullptr);
1590 if (r < 0)
1591 return r;
1592
1593 auto it = metadata.find("osd_objectstore");
1594 if (it == metadata.end())
1595 return -ENOENT;
1596 *type = it->second;
1597 return 0;
1598 }
1599
1600 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
1601 const pg_pool_t &pool,
1602 ostream *err)
1603 {
1604 // just check a few pgs for efficiency - this can't give a guarantee anyway,
1605 // since filestore osds could always join the pool later
1606 set<int> checked_osds;
1607 for (unsigned ps = 0; ps < std::min(8u, pool.get_pg_num()); ++ps) {
1608 vector<int> up, acting;
1609 pg_t pgid(ps, pool_id);
1610 osdmap.pg_to_up_acting_osds(pgid, up, acting);
1611 for (int osd : up) {
1612 if (checked_osds.find(osd) != checked_osds.end())
1613 continue;
1614 string objectstore_type;
1615 int r = get_osd_objectstore_type(osd, &objectstore_type);
1616 // allow with missing metadata, e.g. due to an osd never booting yet
1617 if (r < 0 || objectstore_type == "bluestore") {
1618 checked_osds.insert(osd);
1619 continue;
1620 }
1621 *err << "osd." << osd << " uses " << objectstore_type;
1622 return false;
1623 }
1624 }
1625 return true;
1626 }
1627
1628 int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
1629 {
1630 map<string,string> m;
1631 if (int r = load_metadata(osd, m, err))
1632 return r;
1633 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
1634 f->dump_string(p->first.c_str(), p->second);
1635 return 0;
1636 }
1637
1638 void OSDMonitor::print_nodes(Formatter *f)
1639 {
1640 // group OSDs by their hosts
1641 map<string, list<int> > osds; // hostname => osd
1642 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
1643 map<string, string> m;
1644 if (load_metadata(osd, m, NULL)) {
1645 continue;
1646 }
1647 map<string, string>::iterator hostname = m.find("hostname");
1648 if (hostname == m.end()) {
1649 // not likely though
1650 continue;
1651 }
1652 osds[hostname->second].push_back(osd);
1653 }
1654
1655 dump_services(f, osds, "osd");
1656 }
1657
1658 void OSDMonitor::share_map_with_random_osd()
1659 {
1660 if (osdmap.get_num_up_osds() == 0) {
1661 dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
1662 return;
1663 }
1664
1665 MonSession *s = mon->session_map.get_random_osd_session(&osdmap);
1666 if (!s) {
1667 dout(10) << __func__ << " no up osd on our session map" << dendl;
1668 return;
1669 }
1670
1671 dout(10) << "committed, telling random " << s->name
1672 << " all about it" << dendl;
1673
1674 // get feature of the peer
1675 // use quorum_con_features, if it's an anonymous connection.
1676 uint64_t features = s->con_features ? s->con_features :
1677 mon->get_quorum_con_features();
1678 // whatev, they'll request more if they need it
1679 MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
1680 s->con->send_message(m);
1681 // NOTE: do *not* record osd has up to this epoch (as we do
1682 // elsewhere) as they may still need to request older values.
1683 }
1684
1685 version_t OSDMonitor::get_trim_to() const
1686 {
1687 if (mon->get_quorum().empty()) {
1688 dout(10) << __func__ << ": quorum not formed" << dendl;
1689 return 0;
1690 }
1691
1692 {
1693 std::lock_guard<std::mutex> l(creating_pgs_lock);
1694 if (!creating_pgs.pgs.empty()) {
1695 return 0;
1696 }
1697 }
1698
1699 if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) {
1700 dout(0) << __func__
1701 << " blocking osdmap trim"
1702 " ('mon_debug_block_osdmap_trim' set to 'true')"
1703 << dendl;
1704 return 0;
1705 }
1706
1707 {
1708 epoch_t floor = get_min_last_epoch_clean();
1709 dout(10) << " min_last_epoch_clean " << floor << dendl;
1710 if (g_conf()->mon_osd_force_trim_to > 0 &&
1711 g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) {
1712 floor = g_conf()->mon_osd_force_trim_to;
1713 dout(10) << " explicit mon_osd_force_trim_to = " << floor << dendl;
1714 }
1715 unsigned min = g_conf()->mon_min_osdmap_epochs;
1716 if (floor + min > get_last_committed()) {
1717 if (min < get_last_committed())
1718 floor = get_last_committed() - min;
1719 else
1720 floor = 0;
1721 }
1722 if (floor > get_first_committed())
1723 return floor;
1724 }
1725 return 0;
1726 }
1727
1728 epoch_t OSDMonitor::get_min_last_epoch_clean() const
1729 {
1730 auto floor = last_epoch_clean.get_lower_bound(osdmap);
1731 // also scan osd epochs
1732 // don't trim past the oldest reported osd epoch
1733 for (auto& osd_epoch : osd_epochs) {
1734 if (osd_epoch.second < floor) {
1735 floor = osd_epoch.second;
1736 }
1737 }
1738 return floor;
1739 }
1740
1741 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
1742 version_t first)
1743 {
1744 dout(10) << __func__ << " including full map for e " << first << dendl;
1745 bufferlist bl;
1746 get_version_full(first, bl);
1747 put_version_full(tx, first, bl);
1748
1749 if (has_osdmap_manifest &&
1750 first > osdmap_manifest.get_first_pinned()) {
1751 _prune_update_trimmed(tx, first);
1752 }
1753 }
1754
1755
1756 /* full osdmap prune
1757 *
1758 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
1759 */
1760
1761 void OSDMonitor::load_osdmap_manifest()
1762 {
1763 bool store_has_manifest =
1764 mon->store->exists(get_service_name(), "osdmap_manifest");
1765
1766 if (!store_has_manifest) {
1767 if (!has_osdmap_manifest) {
1768 return;
1769 }
1770
1771 dout(20) << __func__
1772 << " dropping osdmap manifest from memory." << dendl;
1773 osdmap_manifest = osdmap_manifest_t();
1774 has_osdmap_manifest = false;
1775 return;
1776 }
1777
1778 dout(20) << __func__
1779 << " osdmap manifest detected in store; reload." << dendl;
1780
1781 bufferlist manifest_bl;
1782 int r = get_value("osdmap_manifest", manifest_bl);
1783 if (r < 0) {
1784 derr << __func__ << " unable to read osdmap version manifest" << dendl;
1785 ceph_abort_msg("error reading manifest");
1786 }
1787 osdmap_manifest.decode(manifest_bl);
1788 has_osdmap_manifest = true;
1789
1790 dout(10) << __func__ << " store osdmap manifest pinned ("
1791 << osdmap_manifest.get_first_pinned()
1792 << " .. "
1793 << osdmap_manifest.get_last_pinned()
1794 << ")"
1795 << dendl;
1796 }
1797
1798 bool OSDMonitor::should_prune() const
1799 {
1800 version_t first = get_first_committed();
1801 version_t last = get_last_committed();
1802 version_t min_osdmap_epochs =
1803 g_conf().get_val<int64_t>("mon_min_osdmap_epochs");
1804 version_t prune_min =
1805 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
1806 version_t prune_interval =
1807 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
1808 version_t last_pinned = osdmap_manifest.get_last_pinned();
1809 version_t last_to_pin = last - min_osdmap_epochs;
1810
1811 // Make it or break it constraints.
1812 //
1813 // If any of these conditions fails, we will not prune, regardless of
1814 // whether we have an on-disk manifest with an on-going pruning state.
1815 //
1816 if ((last - first) <= min_osdmap_epochs) {
1817 // between the first and last committed epochs, we don't have
1818 // enough epochs to trim, much less to prune.
1819 dout(10) << __func__
1820 << " currently holding only " << (last - first)
1821 << " epochs (min osdmap epochs: " << min_osdmap_epochs
1822 << "); do not prune."
1823 << dendl;
1824 return false;
1825
1826 } else if ((last_to_pin - first) < prune_min) {
1827 // between the first committed epoch and the last epoch we would prune,
1828 // we simply don't have enough versions over the minimum to prune maps.
1829 dout(10) << __func__
1830 << " could only prune " << (last_to_pin - first)
1831 << " epochs (" << first << ".." << last_to_pin << "), which"
1832 " is less than the required minimum (" << prune_min << ")"
1833 << dendl;
1834 return false;
1835
1836 } else if (has_osdmap_manifest && last_pinned >= last_to_pin) {
1837 dout(10) << __func__
1838 << " we have pruned as far as we can; do not prune."
1839 << dendl;
1840 return false;
1841
1842 } else if (last_pinned + prune_interval > last_to_pin) {
1843 dout(10) << __func__
1844 << " not enough epochs to form an interval (last pinned: "
1845 << last_pinned << ", last to pin: "
1846 << last_to_pin << ", interval: " << prune_interval << ")"
1847 << dendl;
1848 return false;
1849 }
1850
1851 dout(15) << __func__
1852 << " should prune (" << last_pinned << ".." << last_to_pin << ")"
1853 << " lc (" << first << ".." << last << ")"
1854 << dendl;
1855 return true;
1856 }
1857
1858 void OSDMonitor::_prune_update_trimmed(
1859 MonitorDBStore::TransactionRef tx,
1860 version_t first)
1861 {
1862 dout(10) << __func__
1863 << " first " << first
1864 << " last_pinned " << osdmap_manifest.get_last_pinned()
1865 << " last_pinned " << osdmap_manifest.get_last_pinned()
1866 << dendl;
1867
1868 osdmap_manifest_t manifest = osdmap_manifest;
1869
1870 if (!manifest.is_pinned(first)) {
1871 manifest.pin(first);
1872 }
1873
1874 set<version_t>::iterator p_end = manifest.pinned.find(first);
1875 set<version_t>::iterator p = manifest.pinned.begin();
1876 manifest.pinned.erase(p, p_end);
1877 ceph_assert(manifest.get_first_pinned() == first);
1878
1879 if (manifest.get_last_pinned() == first+1 ||
1880 manifest.pinned.size() == 1) {
1881 // we reached the end of the line, as pinned maps go; clean up our
1882 // manifest, and let `should_prune()` decide whether we should prune
1883 // again.
1884 tx->erase(get_service_name(), "osdmap_manifest");
1885 return;
1886 }
1887
1888 bufferlist bl;
1889 manifest.encode(bl);
1890 tx->put(get_service_name(), "osdmap_manifest", bl);
1891 }
1892
1893 void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
1894 {
1895 dout(1) << __func__ << dendl;
1896
1897 version_t pin_first;
1898
1899 // verify constrainsts on stable in-memory state
1900 if (!has_osdmap_manifest) {
1901 // we must have never pruned, OR if we pruned the state must no longer
1902 // be relevant (i.e., the state must have been removed alongside with
1903 // the trim that *must* have removed past the last pinned map in a
1904 // previous prune).
1905 ceph_assert(osdmap_manifest.pinned.empty());
1906 ceph_assert(!mon->store->exists(get_service_name(), "osdmap_manifest"));
1907 pin_first = get_first_committed();
1908
1909 } else {
1910 // we must have pruned in the past AND its state is still relevant
1911 // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
1912 // and thus we still hold a manifest in the store).
1913 ceph_assert(!osdmap_manifest.pinned.empty());
1914 ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
1915 ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());
1916
1917 dout(10) << __func__
1918 << " first_pinned " << osdmap_manifest.get_first_pinned()
1919 << " last_pinned " << osdmap_manifest.get_last_pinned()
1920 << dendl;
1921
1922 pin_first = osdmap_manifest.get_last_pinned();
1923 }
1924
1925 manifest.pin(pin_first);
1926 }
1927
1928 bool OSDMonitor::_prune_sanitize_options() const
1929 {
1930 uint64_t prune_interval =
1931 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
1932 uint64_t prune_min =
1933 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
1934 uint64_t txsize =
1935 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
1936
1937 bool r = true;
1938
1939 if (prune_interval == 0) {
1940 derr << __func__
1941 << " prune is enabled BUT prune interval is zero; abort."
1942 << dendl;
1943 r = false;
1944 } else if (prune_interval == 1) {
1945 derr << __func__
1946 << " prune interval is equal to one, which essentially means"
1947 " no pruning; abort."
1948 << dendl;
1949 r = false;
1950 }
1951 if (prune_min == 0) {
1952 derr << __func__
1953 << " prune is enabled BUT prune min is zero; abort."
1954 << dendl;
1955 r = false;
1956 }
1957 if (prune_interval > prune_min) {
1958 derr << __func__
1959 << " impossible to ascertain proper prune interval because"
1960 << " it is greater than the minimum prune epochs"
1961 << " (min: " << prune_min << ", interval: " << prune_interval << ")"
1962 << dendl;
1963 r = false;
1964 }
1965
1966 if (txsize < prune_interval - 1) {
1967 derr << __func__
1968 << "'mon_osdmap_full_prune_txsize' (" << txsize
1969 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
1970 << "); abort." << dendl;
1971 r = false;
1972 }
1973 return r;
1974 }
1975
1976 bool OSDMonitor::is_prune_enabled() const {
1977 return g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
1978 }
1979
1980 bool OSDMonitor::is_prune_supported() const {
1981 return mon->get_required_mon_features().contains_any(
1982 ceph::features::mon::FEATURE_OSDMAP_PRUNE);
1983 }
1984
/** do_prune
 *
 * Remove (prune) full osdmap versions from the store while pinning a
 * regularly-spaced subset in the osdmap manifest, so the store does not
 * grow unbounded. Removals and the updated manifest are appended to @p tx;
 * nothing is applied here.
 *
 * @returns true if has side-effects; false otherwise.
 */
bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
{
  bool enabled = is_prune_enabled();

  dout(1) << __func__ << " osdmap full prune "
          << ( enabled ? "enabled" : "disabled")
          << dendl;

  // bail out early unless pruning is on, sanely configured, and needed
  if (!enabled || !_prune_sanitize_options() || !should_prune()) {
    return false;
  }

  // we are beyond the minimum prune versions, we need to remove maps because
  // otherwise the store will grow unbounded and we may end up having issues
  // with available disk space or store hangs.

  // we will not pin all versions. We will leave a buffer number of versions.
  // this allows us the monitor to trim maps without caring too much about
  // pinned maps, and then allow us to use another ceph-mon without these
  // capabilities, without having to repair the store.

  // work on a copy; the stable manifest is only replaced via the tx
  osdmap_manifest_t manifest = osdmap_manifest;

  version_t first = get_first_committed();
  version_t last = get_last_committed();

  // never prune into the newest mon_min_osdmap_epochs maps
  version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
  version_t last_pinned = manifest.get_last_pinned();
  uint64_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  uint64_t txsize =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");

  prune_init(manifest);

  // we need to get rid of some osdmaps

  dout(5) << __func__
          << " lc (" << first << " .. " << last << ")"
          << " last_pinned " << last_pinned
          << " interval " << prune_interval
          << " last_to_pin " << last_to_pin
          << dendl;

  // We will be erasing maps as we go.
  //
  // We will erase all maps between `last_pinned` and the `next_to_pin`.
  //
  // If `next_to_pin` happens to be greater than `last_to_pin`, then
  // we stop pruning. We could prune the maps between `next_to_pin` and
  // `last_to_pin`, but by not doing it we end up with neater pruned
  // intervals, aligned with `prune_interval`. Besides, this should not be a
  // problem as long as `prune_interval` is set to a sane value, instead of
  // hundreds or thousands of maps.

  auto map_exists = [this](version_t v) {
    string k = mon->store->combine_strings("full", v);
    return mon->store->exists(get_service_name(), k);
  };

  // 'interval' represents the number of maps from the last pinned
  // i.e., if we pinned version 1 and have an interval of 10, we're pinning
  // version 11 next; all intermediate versions will be removed.
  //
  // 'txsize' represents the maximum number of versions we'll be removing in
  // this iteration. If 'txsize' is large enough to perform multiple passes
  // pinning and removing maps, we will do so; if not, we'll do at least one
  // pass. We are quite relaxed about honouring 'txsize', but we'll always
  // ensure that we never go *over* the maximum.

  // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
  uint64_t removal_interval = prune_interval - 1;

  if (txsize < removal_interval) {
    dout(5) << __func__
            << " setting txsize to removal interval size ("
            << removal_interval << " versions"
            << dendl;
    txsize = removal_interval;
  }
  ceph_assert(removal_interval > 0);

  uint64_t num_pruned = 0;
  // each pass pins one new map and erases the versions between the previous
  // pin and it; stop once another full pass would exceed the txsize budget.
  while (num_pruned + removal_interval <= txsize) {
    last_pinned = manifest.get_last_pinned();

    if (last_pinned + prune_interval > last_to_pin) {
      break;
    }
    ceph_assert(last_pinned < last_to_pin);

    version_t next_pinned = last_pinned + prune_interval;
    ceph_assert(next_pinned <= last_to_pin);
    manifest.pin(next_pinned);

    dout(20) << __func__
             << " last_pinned " << last_pinned
             << " next_pinned " << next_pinned
             << " num_pruned " << num_pruned
             << " removal interval (" << (last_pinned+1)
             << ".." << (next_pinned-1) << ")"
             << " txsize " << txsize << dendl;

    // both endpoints of the interval must still exist in the store
    ceph_assert(map_exists(last_pinned));
    ceph_assert(map_exists(next_pinned));

    // erase every full map strictly between the two pins
    for (version_t v = last_pinned+1; v < next_pinned; ++v) {
      ceph_assert(!manifest.is_pinned(v));

      dout(20) << __func__ << " pruning full osdmap e" << v << dendl;
      string full_key = mon->store->combine_strings("full", v);
      tx->erase(get_service_name(), full_key);
      ++num_pruned;
    }
  }

  // sanity: we only get here when there was pruning work to do
  ceph_assert(num_pruned > 0);

  // persist the updated manifest in the same transaction as the removals
  bufferlist bl;
  manifest.encode(bl);
  tx->put(get_service_name(), "osdmap_manifest", bl);

  return true;
}
2113
2114
2115 // -------------
2116
2117 bool OSDMonitor::preprocess_query(MonOpRequestRef op)
2118 {
2119 op->mark_osdmon_event(__func__);
2120 Message *m = op->get_req();
2121 dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
2122
2123 switch (m->get_type()) {
2124 // READs
2125 case MSG_MON_COMMAND:
2126 try {
2127 return preprocess_command(op);
2128 } catch (const bad_cmd_get& e) {
2129 bufferlist bl;
2130 mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
2131 return true;
2132 }
2133 case CEPH_MSG_MON_GET_OSDMAP:
2134 return preprocess_get_osdmap(op);
2135
2136 // damp updates
2137 case MSG_OSD_MARK_ME_DOWN:
2138 return preprocess_mark_me_down(op);
2139 case MSG_OSD_FULL:
2140 return preprocess_full(op);
2141 case MSG_OSD_FAILURE:
2142 return preprocess_failure(op);
2143 case MSG_OSD_BOOT:
2144 return preprocess_boot(op);
2145 case MSG_OSD_ALIVE:
2146 return preprocess_alive(op);
2147 case MSG_OSD_PG_CREATED:
2148 return preprocess_pg_created(op);
2149 case MSG_OSD_PG_READY_TO_MERGE:
2150 return preprocess_pg_ready_to_merge(op);
2151 case MSG_OSD_PGTEMP:
2152 return preprocess_pgtemp(op);
2153 case MSG_OSD_BEACON:
2154 return preprocess_beacon(op);
2155
2156 case CEPH_MSG_POOLOP:
2157 return preprocess_pool_op(op);
2158
2159 case MSG_REMOVE_SNAPS:
2160 return preprocess_remove_snaps(op);
2161
2162 default:
2163 ceph_abort();
2164 return true;
2165 }
2166 }
2167
2168 bool OSDMonitor::prepare_update(MonOpRequestRef op)
2169 {
2170 op->mark_osdmon_event(__func__);
2171 Message *m = op->get_req();
2172 dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;
2173
2174 switch (m->get_type()) {
2175 // damp updates
2176 case MSG_OSD_MARK_ME_DOWN:
2177 return prepare_mark_me_down(op);
2178 case MSG_OSD_FULL:
2179 return prepare_full(op);
2180 case MSG_OSD_FAILURE:
2181 return prepare_failure(op);
2182 case MSG_OSD_BOOT:
2183 return prepare_boot(op);
2184 case MSG_OSD_ALIVE:
2185 return prepare_alive(op);
2186 case MSG_OSD_PG_CREATED:
2187 return prepare_pg_created(op);
2188 case MSG_OSD_PGTEMP:
2189 return prepare_pgtemp(op);
2190 case MSG_OSD_PG_READY_TO_MERGE:
2191 return prepare_pg_ready_to_merge(op);
2192 case MSG_OSD_BEACON:
2193 return prepare_beacon(op);
2194
2195 case MSG_MON_COMMAND:
2196 try {
2197 return prepare_command(op);
2198 } catch (const bad_cmd_get& e) {
2199 bufferlist bl;
2200 mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
2201 return true;
2202 }
2203
2204 case CEPH_MSG_POOLOP:
2205 return prepare_pool_op(op);
2206
2207 case MSG_REMOVE_SNAPS:
2208 return prepare_remove_snaps(op);
2209
2210
2211 default:
2212 ceph_abort();
2213 }
2214
2215 return false;
2216 }
2217
2218 bool OSDMonitor::should_propose(double& delay)
2219 {
2220 dout(10) << "should_propose" << dendl;
2221
2222 // if full map, propose immediately! any subsequent changes will be clobbered.
2223 if (pending_inc.fullmap.length())
2224 return true;
2225
2226 // adjust osd weights?
2227 if (!osd_weight.empty() &&
2228 osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
2229 dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
2230 osdmap.adjust_osd_weights(osd_weight, pending_inc);
2231 delay = 0.0;
2232 osd_weight.clear();
2233 return true;
2234 }
2235
2236 return PaxosService::should_propose(delay);
2237 }
2238
2239
2240
2241 // ---------------------------
2242 // READs
2243
// Answer an MMonGetOSDMap request by replying with the requested full and
// incremental map epochs, capped by both a map-count and a byte budget.
// Always handles the request (returns true).
bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MMonGetOSDMap *m = static_cast<MMonGetOSDMap*>(op->get_req());

  // encode for the requester's connection features when known,
  // otherwise fall back to the quorum's feature set
  uint64_t features = mon->get_quorum_con_features();
  if (op->get_session() && op->get_session()->con_features)
    features = op->get_session()->con_features;

  dout(10) << __func__ << " " << *m << dendl;
  MOSDMap *reply = new MOSDMap(mon->monmap->fsid, features);
  epoch_t first = get_first_committed();
  epoch_t last = osdmap.get_epoch();
  // budgets shared by both loops below: stop once either runs out
  int max = g_conf()->osd_map_message_max;
  ssize_t max_bytes = g_conf()->osd_map_message_max_bytes;
  // full maps first, clamped to what we actually have committed
  for (epoch_t e = std::max(first, m->get_full_first());
       e <= std::min(last, m->get_full_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->maps[e];
    int r = get_version_full(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  // then incrementals, drawing on the remaining budget
  for (epoch_t e = std::max(first, m->get_inc_first());
       e <= std::min(last, m->get_inc_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->incremental_maps[e];
    int r = get_version(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  reply->oldest_map = first;
  reply->newest_map = last;
  mon->send_reply(op, reply);
  return true;
}
2280
2281
2282 // ---------------------------
2283 // UPDATEs
2284
2285 // failure --
2286
2287 bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
2288 // check permissions
2289 MonSession *session = op->get_session();
2290 if (!session)
2291 return true;
2292 if (!session->is_capable("osd", MON_CAP_X)) {
2293 dout(0) << "got MOSDFailure from entity with insufficient caps "
2294 << session->caps << dendl;
2295 return true;
2296 }
2297 if (fsid != mon->monmap->fsid) {
2298 dout(0) << "check_source: on fsid " << fsid
2299 << " != " << mon->monmap->fsid << dendl;
2300 return true;
2301 }
2302 return false;
2303 }
2304
2305
// Filter an MOSDFailure report. Consumes (returns true) reports that are
// unauthorized, stale, duplicate or blocked by nodown/up-ratio policy,
// pushing map updates to lagging senders; returns false to pass a fresh,
// actionable report on to prepare_failure().
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
  // who is target_osd
  int badboy = m->get_target_osd();

  // check permissions
  if (check_source(op, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
        !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
        (osdmap.is_down(from) && m->if_osd_failed())) {
      // reporter is unknown, re-addressed, or itself down: ignore, but
      // bring it up to date with the current map
      dout(5) << "preprocess_failure from dead osd." << from
              << ", ignoring" << dendl;
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?
  if (osdmap.is_down(badboy)) {
    dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
            << " " << m->get_target_addrs()
            << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  // target's recorded addrs must match what the reporter saw
  if (osdmap.get_addrs(badboy) != m->get_target_addrs()) {
    dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
            << " " << m->get_target_addrs()
            << " != map's " << osdmap.get_addrs(badboy)
            << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    // the failure predates the target's latest boot: it is stale
    dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
            << " " << m->get_target_addrs()
            << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // nodown flag or up-ratio floor may forbid marking this osd down
  if (!can_mark_down(badboy)) {
    dout(5) << "preprocess_failure ignoring report of osd."
            << m->get_target_osd() << " " << m->get_target_addrs()
            << " from " << m->get_orig_source() << dendl;
    goto didit;
  }

  dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
           << " " << m->get_target_addrs()
           << ", from " << m->get_orig_source() << dendl;
  return false;

 didit:
  mon->no_reply(op);
  return true;
}
2377
// Completion context that acknowledges an MOSDMarkMeDown request by
// echoing a MOSDMarkMeDown message back to the requesting osd once the
// mark-down has been handled (or rejected in preprocess).
class C_AckMarkedDown : public C_MonOp {
  OSDMonitor *osdmon;
public:
  C_AckMarkedDown(
    OSDMonitor *osdmon,
    MonOpRequestRef op)
    : C_MonOp(op), osdmon(osdmon) {}

  void _finish(int) override {
    MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
    // reply mirrors the request's identity fields
    osdmon->mon->send_reply(
      op,
      new MOSDMarkMeDown(
        m->fsid,
        m->target_osd,
        m->target_addrs,
        m->get_epoch(),
        false)); // ACK itself does not request an ack
  }
  ~C_AckMarkedDown() override {
  }
};
2400
// Filter an osd's request to mark itself down. Returns true (acking if the
// sender asked for one) when the request needs no map change; returns
// false to forward it to prepare_mark_me_down().
bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid))
    goto reply;

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd())
    goto reply;

  // sender must still exist, be up, and match its recorded addrs;
  // otherwise just bring it up to date with the current map
  if (!osdmap.exists(from) ||
      osdmap.is_down(from) ||
      osdmap.get_addrs(from) != m->target_addrs) {
    dout(5) << "preprocess_mark_me_down from dead osd."
            << from << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    goto reply;
  }

  // no down might be set
  if (!can_mark_down(from))
    goto reply;

  dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
           << " " << m->target_addrs << dendl;
  return false;

 reply:
  // even rejected requests get an ack when one was requested,
  // so the osd does not block waiting for it
  if (m->request_ack) {
    Context *c(new C_AckMarkedDown(this, op));
    c->complete(0);
  }
  return true;
}
2439
2440 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
2441 {
2442 op->mark_osdmon_event(__func__);
2443 MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
2444 int target_osd = m->target_osd;
2445
2446 ceph_assert(osdmap.is_up(target_osd));
2447 ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);
2448
2449 mon->clog->info() << "osd." << target_osd << " marked itself down";
2450 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
2451 if (m->request_ack)
2452 wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
2453 return true;
2454 }
2455
2456 bool OSDMonitor::can_mark_down(int i)
2457 {
2458 if (osdmap.is_nodown(i)) {
2459 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
2460 << "will not mark it down" << dendl;
2461 return false;
2462 }
2463
2464 int num_osds = osdmap.get_num_osds();
2465 if (num_osds == 0) {
2466 dout(5) << __func__ << " no osds" << dendl;
2467 return false;
2468 }
2469 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
2470 float up_ratio = (float)up / (float)num_osds;
2471 if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
2472 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
2473 << g_conf()->mon_osd_min_up_ratio
2474 << ", will not mark osd." << i << " down" << dendl;
2475 return false;
2476 }
2477 return true;
2478 }
2479
2480 bool OSDMonitor::can_mark_up(int i)
2481 {
2482 if (osdmap.is_noup(i)) {
2483 dout(5) << __func__ << " osd." << i << " is marked as noup, "
2484 << "will not mark it up" << dendl;
2485 return false;
2486 }
2487
2488 return true;
2489 }
2490
2491 /**
2492 * @note the parameter @p i apparently only exists here so we can output the
2493 * osd's id on messages.
2494 */
2495 bool OSDMonitor::can_mark_out(int i)
2496 {
2497 if (osdmap.is_noout(i)) {
2498 dout(5) << __func__ << " osd." << i << " is marked as noout, "
2499 << "will not mark it out" << dendl;
2500 return false;
2501 }
2502
2503 int num_osds = osdmap.get_num_osds();
2504 if (num_osds == 0) {
2505 dout(5) << __func__ << " no osds" << dendl;
2506 return false;
2507 }
2508 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
2509 float in_ratio = (float)in / (float)num_osds;
2510 if (in_ratio < g_conf()->mon_osd_min_in_ratio) {
2511 if (i >= 0)
2512 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
2513 << g_conf()->mon_osd_min_in_ratio
2514 << ", will not mark osd." << i << " out" << dendl;
2515 else
2516 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
2517 << g_conf()->mon_osd_min_in_ratio
2518 << ", will not mark osds out" << dendl;
2519 return false;
2520 }
2521
2522 return true;
2523 }
2524
2525 bool OSDMonitor::can_mark_in(int i)
2526 {
2527 if (osdmap.is_noin(i)) {
2528 dout(5) << __func__ << " osd." << i << " is marked as noin, "
2529 << "will not mark it in" << dendl;
2530 return false;
2531 }
2532
2533 return true;
2534 }
2535
2536 bool OSDMonitor::check_failures(utime_t now)
2537 {
2538 bool found_failure = false;
2539 for (map<int,failure_info_t>::iterator p = failure_info.begin();
2540 p != failure_info.end();
2541 ++p) {
2542 if (can_mark_down(p->first)) {
2543 found_failure |= check_failure(now, p->first, p->second);
2544 }
2545 }
2546 return found_failure;
2547 }
2548
// Decide whether the accumulated failure reports for @p target_osd suffice
// to mark it down: the osd must have been failed for longer than a
// (laggy-history-adjusted) grace period AND be reported by enough distinct
// failure domains. Returns true when a mark-down is (now) pending.
bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
{
  // already pending failure?
  if (pending_inc.new_state.count(target_osd) &&
      pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
    dout(10) << " already pending failure" << dendl;
    return true;
  }

  set<string> reporters_by_subtree;
  auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
  utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
  utime_t max_failed_since = fi.get_failed_since();
  utime_t failed_for = now - max_failed_since;

  utime_t grace = orig_grace;
  double my_grace = 0, peer_grace = 0;
  double decay_k = 0;
  if (g_conf()->mon_osd_adjust_heartbeat_grace) {
    // exponential decay constant derived from the configured halflife
    double halflife = (double)g_conf()->mon_osd_laggy_halflife;
    decay_k = ::log(.5) / halflife;

    // scale grace period based on historical probability of 'lagginess'
    // (false positive failures due to slowness).
    const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
    double decay = exp((double)failed_for * decay_k);
    dout(20) << " halflife " << halflife << " decay_k " << decay_k
             << " failed_for " << failed_for << " decay " << decay << dendl;
    my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
    grace += my_grace;
  }

  // consider the peers reporting a failure a proxy for a potential
  // 'subcluster' over the overall cluster that is similarly
  // laggy. this is clearly not true in all cases, but will sometimes
  // help us localize the grace correction to a subset of the system
  // (say, a rack with a bad switch) that is unhappy.
  ceph_assert(fi.reporters.size());
  for (map<int,failure_reporter_t>::iterator p = fi.reporters.begin();
       p != fi.reporters.end();
       ++p) {
    // get the parent bucket whose type matches with "reporter_subtree_level".
    // fall back to OSD if the level doesn't exist.
    map<string, string> reporter_loc = osdmap.crush->get_full_location(p->first);
    map<string, string>::iterator iter = reporter_loc.find(reporter_subtree_level);
    if (iter == reporter_loc.end()) {
      reporters_by_subtree.insert("osd." + to_string(p->first));
    } else {
      reporters_by_subtree.insert(iter->second);
    }
    if (g_conf()->mon_osd_adjust_heartbeat_grace) {
      // accumulate this reporter's laggy-history contribution
      const osd_xinfo_t& xi = osdmap.get_xinfo(p->first);
      utime_t elapsed = now - xi.down_stamp;
      double decay = exp((double)elapsed * decay_k);
      peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
    }
  }

  if (g_conf()->mon_osd_adjust_heartbeat_grace) {
    // average the reporters' contribution into the grace
    peer_grace /= (double)fi.reporters.size();
    grace += peer_grace;
  }

  dout(10) << " osd." << target_osd << " has "
           << fi.reporters.size() << " reporters, "
           << grace << " grace (" << orig_grace << " + " << my_grace
           << " + " << peer_grace << "), max_failed_since " << max_failed_since
           << dendl;

  if (failed_for >= grace &&
      reporters_by_subtree.size() >= g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
    dout(1) << " we have enough reporters to mark osd." << target_osd
            << " down" << dendl;
    // new_state is applied as an XOR, so CEPH_OSD_UP marks the osd down
    pending_inc.new_state[target_osd] = CEPH_OSD_UP;

    mon->clog->info() << "osd." << target_osd << " failed ("
                      << osdmap.crush->get_full_location_ordered_string(
                        target_osd)
                      << ") ("
                      << (int)reporters_by_subtree.size()
                      << " reporters from different "
                      << reporter_subtree_level << " after "
                      << failed_for << " >= grace " << grace << ")";
    return true;
  }
  return false;
}
2636
2637 void OSDMonitor::force_failure(int target_osd, int by)
2638 {
2639 // already pending failure?
2640 if (pending_inc.new_state.count(target_osd) &&
2641 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
2642 dout(10) << " already pending failure" << dendl;
2643 return;
2644 }
2645
2646 dout(1) << " we're forcing failure of osd." << target_osd << dendl;
2647 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
2648
2649 mon->clog->info() << "osd." << target_osd << " failed ("
2650 << osdmap.crush->get_full_location_ordered_string(target_osd)
2651 << ") (connection refused reported by osd." << by << ")";
2652 return;
2653 }
2654
// Apply an MOSDFailure to pending state: either record a new failure
// report (possibly marking the target down, immediately for
// connection-refused reports), or cancel a previously filed report.
// Returns true when a map change is now pending.
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
  dout(1) << "prepare_failure osd." << m->get_target_osd()
          << " " << m->get_target_addrs()
          << " from " << m->get_orig_source()
          << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target_osd();
  int reporter = m->get_orig_source().num();
  // preprocess_failure() filtered out stale or mismatched targets
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());

  if (m->if_osd_failed()) {
    // calculate failure time
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      // e.g. connection refused: fail without waiting for more reporters
      mon->clog->debug() << "osd." << m->get_target_osd()
                         << " reported immediately failed by "
                         << m->get_orig_source();
      force_failure(target_osd, reporter);
      mon->no_reply(op);
      return true;
    }
    mon->clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
                       << m->get_orig_source();

    failure_info_t& fi = failure_info[target_osd];
    // a newer report from the same reporter supersedes its earlier op
    MonOpRequestRef old_op = fi.add_report(reporter, failed_since, op);
    if (old_op) {
      mon->no_reply(old_op);
    }

    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon->clog->debug() << "osd." << m->get_target_osd()
                       << " failure report canceled by "
                       << m->get_orig_source();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      MonOpRequestRef report_op = fi.cancel_report(reporter);
      if (report_op) {
        mon->no_reply(report_op);
      }
      // drop the whole entry once the last reporter withdraws
      if (fi.reporters.empty()) {
        dout(10) << " removing last failure_info for osd." << target_osd
                 << dendl;
        failure_info.erase(target_osd);
      } else {
        dout(10) << " failure_info for osd." << target_osd << " now "
                 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
    mon->no_reply(op);
  }

  return false;
}
2721
// Drop failure tracking for osds that are no longer up in the current map:
// each queued reporter op gets the latest map sent back and is released.
void OSDMonitor::process_failures()
{
  map<int,failure_info_t>::iterator p = failure_info.begin();
  while (p != failure_info.end()) {
    if (osdmap.is_up(p->first)) {
      // still up: keep accumulating reports for it
      ++p;
    } else {
      dout(10) << "process_failures osd." << p->first << dendl;
      list<MonOpRequestRef> ls;
      p->second.take_report_messages(ls);
      // post-increment keeps the iterator valid across the erase
      failure_info.erase(p++);

      while (!ls.empty()) {
        MonOpRequestRef o = ls.front();
        if (o) {
          o->mark_event(__func__);
          MOSDFailure *m = o->get_req<MOSDFailure>();
          // update the reporter, then release its op without a reply
          send_latest(o, m->get_epoch());
          mon->no_reply(o);
        }
        ls.pop_front();
      }
    }
  }
}
2747
2748 void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
2749 {
2750 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
2751
2752 for (map<int,failure_info_t>::iterator p = failure_info.begin();
2753 p != failure_info.end();
2754 ++p) {
2755 p->second.take_report_messages(ls);
2756 }
2757 failure_info.clear();
2758 }
2759
2760
2761 // boot --
2762
// Filter an MOSDBoot message. Returns true when the boot can be answered
// (or must be ignored) without a map change — bad caps/fsid/addr, missing
// feature bits, duplicate boot, fsid clash, stale message, or noup —
// and false to let prepare_boot() actually bring the osd up.
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
            << session->caps << dendl;
    goto ignore;
  }

  if (m->sb.cluster_fsid != mon->monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
            << " != " << mon->monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  ceph_assert(m->get_orig_source_inst().name.is_osd());

  // force all osds to have gone through luminous prior to upgrade to nautilus
  {
    // collect every mandatory feature bit the booting osd lacks
    vector<string> missing;
    if (!HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) {
      missing.push_back("CEPH_FEATURE_SERVER_LUMINOUS");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_JEWEL)) {
      missing.push_back("CEPH_FEATURE_SERVER_JEWEL");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) {
      missing.push_back("CEPH_FEATURE_SERVER_KRAKEN");
    }
    if (!HAVE_FEATURE(m->osd_features, OSD_RECOVERY_DELETES)) {
      missing.push_back("CEPH_FEATURE_OSD_RECOVERY_DELETES");
    }

    if (!missing.empty()) {
      using std::experimental::make_ostream_joiner;

      stringstream ss;
      copy(begin(missing), end(missing), make_ostream_joiner(ss, ";"));

      mon->clog->info() << "disallowing boot of OSD "
                        << m->get_orig_source_inst()
                        << " because the osd lacks " << ss.str();
      goto ignore;
    }
  }

  // make sure upgrades stop at nautilus
  if (HAVE_FEATURE(m->osd_features, SERVER_O) &&
      osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS) {
    mon->clog->info() << "disallowing boot of post-nautilus OSD "
                      << m->get_orig_source_inst()
                      << " because require_osd_release < nautilus";
    goto ignore;
  }

  // The release check here is required because for OSD_PGLOG_HARDLIMIT,
  // we are reusing a jewel feature bit that was retired in luminous.
  if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
      osdmap.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT) &&
      !(m->osd_features & CEPH_FEATURE_OSD_PGLOG_HARDLIMIT)) {
    mon->clog->info() << "disallowing boot of OSD "
                      << m->get_orig_source_inst()
                      << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
    goto ignore;
  }

  // already booted?
  if (osdmap.is_up(from) &&
      osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
      osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
    // yup.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source()
            << " " << m->get_orig_source_addrs()
            << " =~ " << osdmap.get_addrs(from) << dendl;
    // duplicate boot: just re-send the "booted" reply
    _booted(op, false);
    return true;
  }

  // the osd id must not be claimed by a different osd fsid
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
            << " clashes with existing osd: different fsid"
            << " (ours: " << osdmap.get_uuid(from)
            << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // message predating the osd's most recent up_from: stale, catch it up
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_addrs(from).legacy_equals(
        m->get_orig_source_addrs())) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
2885
// Apply an MOSDBoot to the pending incremental: mark the OSD up (or first
// mark a stale up instance down), and record its addresses, weight, uuid,
// metadata, last-clean interval and laggy statistics.  Always returns true;
// the change, if any, rides the next paxos proposal.
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
  dout(7) << __func__ << " from " << m->get_source()
	  << " sb " << m->sb
	  << " client_addrs" << m->get_connection()->get_peer_addrs()
	  << " cluster_addrs " << m->cluster_addrs
	  << " hb_back_addrs " << m->hb_back_addrs
	  << " hb_front_addrs " << m->hb_front_addrs
	  << dendl;

  ceph_assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
	    << osdmap.get_max_osd() << dendl;
    return false;
  }

  // effective state once any state-bit flips already queued in
  // pending_inc are applied (new_state entries are XOR deltas)
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up? mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down osd." << from << " "
	    << osdmap.get_addrs(from) << dendl;
    // preprocess should have caught these; if not, assert.
    ceph_assert(!osdmap.get_addrs(from).legacy_equals(
		  m->get_orig_source_addrs()) ||
		!osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
    ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
	(pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down.  setting the CEPH_OSD_UP bit in the XOR
      // delta flips an up osd to down.
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    // retry the boot once the mark-down commits
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
	    << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addrs();
    pending_inc.new_up_cluster[from] = m->cluster_addrs;
    pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
    pending_inc.new_hb_front_up[from] = m->hb_front_addrs;

    down_pending_out.erase(from);  // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
	     << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this; if not, assert.
      ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd? (sb.newest_map == 0 means it has never seen a map)
    // mark it lost as well -- presumably so the previous incarnation's
    // data is not waited on; TODO confirm against peering docs
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
	dout(10) << " fresh osd; marking lost_at too" << dendl;
	pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata: stage the osd's key/value metadata for commit
    bufferlist osd_metadata;
    encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
	(m->sb.mounted == info.last_clean_begin &&
	 m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
	       << "[" << info.last_clean_begin << "," << info.last_clean_end
	       << ") -> [" << begin << "-" << end << ")"
	       << dendl;
      pending_inc.new_last_clean_interval[from] =
	pair<epoch_t,epoch_t>(begin, end);
    }

    // update laggy statistics: boot_epoch == 0 is treated as a non-laggy
    // (clean) boot and decays both stats; otherwise the time spent down is
    // folded into the exponentially-weighted moving averages.
    osd_xinfo_t xi = osdmap.get_xinfo(from);
    if (m->boot_epoch == 0) {
      xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      if (xi.down_stamp.sec()) {
	int interval = ceph_clock_now().sec() -
	  xi.down_stamp.sec();
	// clamp the observed down interval if configured
	if (g_conf()->mon_osd_laggy_max_interval &&
	    (interval > g_conf()->mon_osd_laggy_max_interval)) {
	  interval = g_conf()->mon_osd_laggy_max_interval;
	}
	xi.laggy_interval =
	  interval * g_conf()->mon_osd_laggy_weight +
	  xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
	g_conf()->mon_osd_laggy_weight +
	xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?  (auto-in is gated by config and by can_mark_in/NOIN)
    if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
	 (oldstate & CEPH_OSD_AUTOOUT)) ||
	(g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
	(g_conf()->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
	if (osdmap.osd_xinfo[from].old_weight > 0) {
	  // restore the weight saved when the osd was auto-marked out
	  pending_inc.new_weight[from] = osdmap.osd_xinfo[from].old_weight;
	  xi.old_weight = 0;
	} else {
	  pending_inc.new_weight[from] = CEPH_OSD_IN;
	}
      } else {
	dout(7) << __func__ << " NOIN set, will not mark in "
		<< m->get_orig_source_addr() << dendl;
      }
    }

    pending_inc.new_xinfo[from] = xi;

    // wait; C_Booted replies with maps after the proposal commits
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
3040
3041 void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
3042 {
3043 op->mark_osdmon_event(__func__);
3044 MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
3045 dout(7) << "_booted " << m->get_orig_source_inst()
3046 << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
3047
3048 if (logit) {
3049 mon->clog->info() << m->get_source() << " " << m->get_orig_source_addrs()
3050 << " boot";
3051 }
3052
3053 send_latest(op, m->sb.current_epoch+1);
3054 }
3055
3056
3057 // -------------
3058 // full
3059
3060 bool OSDMonitor::preprocess_full(MonOpRequestRef op)
3061 {
3062 op->mark_osdmon_event(__func__);
3063 MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
3064 int from = m->get_orig_source().num();
3065 set<string> state;
3066 unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
3067
3068 // check permissions, ignore if failed
3069 MonSession *session = op->get_session();
3070 if (!session)
3071 goto ignore;
3072 if (!session->is_capable("osd", MON_CAP_X)) {
3073 dout(0) << "MOSDFull from entity with insufficient privileges:"
3074 << session->caps << dendl;
3075 goto ignore;
3076 }
3077
3078 // ignore a full message from the osd instance that already went down
3079 if (!osdmap.exists(from)) {
3080 dout(7) << __func__ << " ignoring full message from nonexistent "
3081 << m->get_orig_source_inst() << dendl;
3082 goto ignore;
3083 }
3084 if ((!osdmap.is_up(from) &&
3085 osdmap.get_most_recent_addrs(from).legacy_equals(
3086 m->get_orig_source_addrs())) ||
3087 (osdmap.is_up(from) &&
3088 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
3089 dout(7) << __func__ << " ignoring full message from down "
3090 << m->get_orig_source_inst() << dendl;
3091 goto ignore;
3092 }
3093
3094 OSDMap::calc_state_set(osdmap.get_state(from), state);
3095
3096 if ((osdmap.get_state(from) & mask) == m->state) {
3097 dout(7) << __func__ << " state already " << state << " for osd." << from
3098 << " " << m->get_orig_source_inst() << dendl;
3099 _reply_map(op, m->version);
3100 goto ignore;
3101 }
3102
3103 dout(10) << __func__ << " want state " << state << " for osd." << from
3104 << " " << m->get_orig_source_inst() << dendl;
3105 return false;
3106
3107 ignore:
3108 return true;
3109 }
3110
// Queue the fullness-state change requested by the OSD.  Entries in
// pending_inc.new_state are XOR deltas against the *committed* map, so we
// first compute the effective (committed + pending) state and then rewrite
// the delta for the fullness bits.  Always returns true.
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
  const int from = m->get_orig_source().num();

  // only these bits may be changed via MOSDFull
  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask;  // safety first

  // effective fullness state once pending flips are applied
  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    if (p != pending_inc.new_state.end()) {
      // drop any pending flips of the fullness bits before re-adding ours
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    // XOR delta from the committed state to the wanted state
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " -> " << want_state_set << dendl;
  } else {
    // an equivalent change is already pending; just wait for it to commit
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
3148
3149 // -------------
3150 // alive
3151
3152 bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
3153 {
3154 op->mark_osdmon_event(__func__);
3155 MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
3156 int from = m->get_orig_source().num();
3157
3158 // check permissions, ignore if failed
3159 MonSession *session = op->get_session();
3160 if (!session)
3161 goto ignore;
3162 if (!session->is_capable("osd", MON_CAP_X)) {
3163 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
3164 << session->caps << dendl;
3165 goto ignore;
3166 }
3167
3168 if (!osdmap.is_up(from) ||
3169 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
3170 dout(7) << "preprocess_alive ignoring alive message from down "
3171 << m->get_orig_source() << " " << m->get_orig_source_addrs()
3172 << dendl;
3173 goto ignore;
3174 }
3175
3176 if (osdmap.get_up_thru(from) >= m->want) {
3177 // yup.
3178 dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
3179 _reply_map(op, m->version);
3180 return true;
3181 }
3182
3183 dout(10) << "preprocess_alive want up_thru " << m->want
3184 << " from " << m->get_orig_source_inst() << dendl;
3185 return false;
3186
3187 ignore:
3188 return true;
3189 }
3190
// Advance the osd's up_thru in the pending incremental to the latest map
// the osd has (m->version), and reply with the map once it commits.
bool OSDMonitor::prepare_alive(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
  int from = m->get_orig_source().num();

  if (0) { // we probably don't care much about these
    mon->clog->debug() << m->get_orig_source_inst() << " alive";
  }

  dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
	  << " from " << m->get_orig_source_inst() << dendl;

  update_up_thru(from, m->version); // set to the latest map the OSD has
  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
3208
3209 void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
3210 {
3211 op->mark_osdmon_event(__func__);
3212 dout(7) << "_reply_map " << e
3213 << " from " << op->get_req()->get_orig_source_inst()
3214 << dendl;
3215 send_latest(op, e);
3216 }
3217
3218 // pg_created
3219 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
3220 {
3221 op->mark_osdmon_event(__func__);
3222 auto m = static_cast<MOSDPGCreated*>(op->get_req());
3223 dout(10) << __func__ << " " << *m << dendl;
3224 auto session = op->get_session();
3225 mon->no_reply(op);
3226 if (!session) {
3227 dout(10) << __func__ << ": no monitor session!" << dendl;
3228 return true;
3229 }
3230 if (!session->is_capable("osd", MON_CAP_X)) {
3231 derr << __func__ << " received from entity "
3232 << "with insufficient privileges " << session->caps << dendl;
3233 return true;
3234 }
3235 // always forward the "created!" to the leader
3236 return false;
3237 }
3238
3239 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
3240 {
3241 op->mark_osdmon_event(__func__);
3242 auto m = static_cast<MOSDPGCreated*>(op->get_req());
3243 dout(10) << __func__ << " " << *m << dendl;
3244 auto src = m->get_orig_source();
3245 auto from = src.num();
3246 if (!src.is_osd() ||
3247 !mon->osdmon()->osdmap.is_up(from) ||
3248 !mon->osdmon()->osdmap.get_addrs(from).legacy_equals(
3249 m->get_orig_source_addrs())) {
3250 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
3251 return false;
3252 }
3253 pending_created_pgs.push_back(m->pgid);
3254 return true;
3255 }
3256
// Validate an OSD's ready-to-merge report against the committed map.
// Only a report for the pool's current last pg (pg_num == ps + 1) with a
// merge still pending is passed on to prepare.  Returns true to drop.
bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = static_cast<MOSDPGReadyToMerge*>(op->get_req());
  dout(10) << __func__ << " " << *m << dendl;
  const pg_pool_t *pi;  // declared up front: gotos may not skip initializations
  auto session = op->get_session();
  if (!session) {
    dout(10) << __func__ << ": no monitor session!" << dendl;
    goto ignore;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
	 << "with insufficient privileges " << session->caps << dendl;
    goto ignore;
  }
  pi = osdmap.get_pg_pool(m->pgid.pool());
  if (!pi) {
    derr << __func__ << " pool for " << m->pgid << " dne" << dendl;
    goto ignore;
  }
  // pool already shrank past this pg -- the merge happened
  if (pi->get_pg_num() <= m->pgid.ps()) {
    dout(20) << " pg_num " << pi->get_pg_num() << " already < " << m->pgid << dendl;
    goto ignore;
  }
  // only the pool's current last pg may be the merge source
  if (pi->get_pg_num() != m->pgid.ps() + 1) {
    derr << " OSD trying to merge wrong pgid " << m->pgid << dendl;
    goto ignore;
  }
  // no merge of this pg is pending (any more)
  if (pi->get_pg_num_pending() > m->pgid.ps()) {
    dout(20) << " pg_num_pending " << pi->get_pg_num_pending() << " > " << m->pgid << dendl;
    goto ignore;
  }
  return false;

 ignore:
  mon->no_reply(op);
  return true;
}
3296
// Apply (or abort) a pg merge for the reported pgid.  Works on a copy of
// the pool (pending version if one exists, else committed), re-validates
// against concurrent pg_num changes, then either dec_pg_num()s the pool or
// backs off the pending merge.  Always returns true.
bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = static_cast<MOSDPGReadyToMerge*>(op->get_req());
  dout(10) << __func__ << " " << *m << dendl;
  pg_pool_t p;
  if (pending_inc.new_pools.count(m->pgid.pool()))
    p = pending_inc.new_pools[m->pgid.pool()];
  else
    p = *osdmap.get_pg_pool(m->pgid.pool());
  // the pool may have changed since preprocess; retry after the in-flight
  // proposal if pg_num[_pending] no longer matches this report
  if (p.get_pg_num() != m->pgid.ps() + 1 ||
      p.get_pg_num_pending() > m->pgid.ps()) {
    dout(10) << __func__
	     << " race with concurrent pg_num[_pending] update, will retry"
	     << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
    return true;
  }

  if (m->ready) {
    // commit the merge: shrink pg_num and record the merge metadata
    p.dec_pg_num(m->pgid,
		 pending_inc.epoch,
		 m->source_version,
		 m->target_version,
		 m->last_epoch_started,
		 m->last_epoch_clean);
    p.last_change = pending_inc.epoch;
  } else {
    // back off the merge attempt!
    p.set_pg_num_pending(p.get_pg_num());
  }

  // force pre-nautilus clients to resend their ops, since they
  // don't understand pg_num_pending changes from a new interval
  p.last_force_op_resend_prenautilus = pending_inc.epoch;

  pending_inc.new_pools[m->pgid.pool()] = p;

  // testing hook: occasionally bounce pg_num back up to exercise the
  // merge cancel path
  auto prob = g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability");
  if (m->ready &&
      prob > 0 &&
      prob > (double)(rand() % 1000)/1000.0) {
    derr << __func__ << " injecting pg merge pg_num bounce" << dendl;
    auto n = new MMonCommand(mon->monmap->get_fsid());
    n->set_connection(m->get_connection());
    n->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
	       osdmap.get_pool_name(m->pgid.pool()) +
	       "\", \"var\": \"pg_num_actual\", \"val\": \"" +
	       stringify(m->pgid.ps() + 1) + "\"}" };
    MonOpRequestRef nop = mon->op_tracker.create_request<MonOpRequest>(n);
    nop->set_type_service();
    wait_for_finished_proposal(op, new C_RetryMessage(this, nop));
  } else {
    wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  }
  return true;
}
3354
3355
3356 // -------------
3357 // pg_temp changes
3358
// Filter an MOSDPGTemp.  Returns false (pass to prepare) as soon as one
// mapping would actually change something; returns true to drop or to ack
// a no-op message.
bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
{
  MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
  dout(10) << "preprocess_pgtemp " << *m << dendl;
  mempool::osdmap::vector<int> empty;
  int from = m->get_orig_source().num();
  size_t ignore_cnt = 0;  // mappings that are stale/irrelevant

  // check caps
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  // only accept pg_temp from the osd's current up incarnation
  if (!osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
    dout(7) << "ignoring pgtemp message from down "
	    << m->get_orig_source() << " " << m->get_orig_source_addrs()
	    << dendl;
    goto ignore;
  }

  // a forced message bypasses all the filtering below
  if (m->forced) {
    return false;
  }

  for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    dout(20) << " " << p->first
	     << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
	     << " -> " << p->second << dendl;

    // does the pool exist?
    if (!osdmap.have_pg_pool(p->first.pool())) {
      /*
       * 1. If the osdmap does not have the pool, it means the pool has been
       *    removed in-between the osd sending this message and us handling it.
       * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
       *    not exist in the pending either, as the osds would not send a
       *    message about a pool they know nothing about (yet).
       * 3. However, if the pool does exist in the pending, then it must be a
       *    new pool, and not relevant to this message (see 1).
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool has been removed" << dendl;
      ignore_cnt++;
      continue;
    }

    int acting_primary = -1;
    osdmap.pg_to_up_acting_osds(
      p->first, nullptr, nullptr, nullptr, &acting_primary);
    if (acting_primary != from) {
      /* If the source isn't the primary based on the current osdmap, we know
       * that the interval changed and that we can discard this message.
       * Indeed, we must do so to avoid 16127 since we can't otherwise determine
       * which of two pg temp mappings on the same pg is more recent.
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": primary has changed" << dendl;
      ignore_cnt++;
      continue;
    }

    // removal?
    if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
			      osdmap.primary_temp->count(p->first)))
      return false;
    // change?
    // NOTE: we assume that this will clear pg_primary, so consider
    //       an existing pg_primary field to imply a change
    if (p->second.size() &&
	(osdmap.pg_temp->count(p->first) == 0 ||
	 osdmap.pg_temp->get(p->first) != p->second ||
	 osdmap.primary_temp->count(p->first)))
      return false;
  }

  // should we ignore all the pgs?
  if (ignore_cnt == m->pg_temp.size())
    goto ignore;

  // every surviving mapping is already in effect; ack without proposing
  dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
  _reply_map(op, m->map_epoch);
  return true;

 ignore:
  return true;
}
3451
3452 void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
3453 {
3454 epoch_t old_up_thru = osdmap.get_up_thru(from);
3455 auto ut = pending_inc.new_up_thru.find(from);
3456 if (ut != pending_inc.new_up_thru.end()) {
3457 old_up_thru = ut->second;
3458 }
3459 if (up_thru > old_up_thru) {
3460 // set up_thru too, so the osd doesn't have to ask again
3461 pending_inc.new_up_thru[from] = up_thru;
3462 }
3463 }
3464
// Stage the osd's pg_temp mappings into the pending incremental, skipping
// pools that are gone or pending removal, and clear primary_temp for each
// touched pg.  Also bumps the osd's up_thru.  Always returns true.
bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
  int from = m->get_orig_source().num();
  dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
  for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    uint64_t pool = p->first.pool();
    // skip pools queued for deletion in this same proposal
    if (pending_inc.old_pools.count(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool pending removal" << dendl;
      continue;
    }
    // skip pools already removed from the committed map
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool has been removed" << dendl;
      continue;
    }
    pending_inc.new_pg_temp[p->first] =
      mempool::osdmap::vector<int>(p->second.begin(), p->second.end());

    // unconditionally clear pg_primary (until this message can encode
    // a change for that, too.. at which point we need to also fix
    // preprocess_pg_temp)
    if (osdmap.primary_temp->count(p->first) ||
	pending_inc.new_primary_temp.count(p->first))
      pending_inc.new_primary_temp[p->first] = -1;
  }

  // set up_thru too, so the osd doesn't have to ask again
  update_up_thru(from, m->map_epoch);

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
  return true;
}
3500
3501
3502 // ---
3503
// Filter an MRemoveSnaps.  Returns false (pass to prepare) if at least one
// requested snap is not yet recorded as removed; true to drop the message.
bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
  dout(7) << "preprocess_remove_snaps " << *m << dendl;

  // check privilege, ignore if failed
  MonSession *session = op->get_session();
  mon->no_reply(op);
  if (!session)
    goto ignore;
  if (!session->caps.is_capable(
	cct,
	CEPH_ENTITY_TYPE_MON,
	session->entity_name,
	"osd", "osd pool rmsnap", {}, true, true, false,
	session->get_peer_socket_addr())) {
    dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
       q != m->snaps.end();
       ++q) {
    if (!osdmap.have_pg_pool(q->first)) {
      dout(10) << " ignoring removed_snaps " << q->second << " on non-existent pool " << q->first << dendl;
      continue;
    }
    const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
    for (vector<snapid_t>::iterator p = q->second.begin();
	 p != q->second.end();
	 ++p) {
      // a snap newer than snap_seq, or not yet in removed_snaps, still
      // needs to be applied -- hand off to prepare_remove_snaps()
      if (*p > pi->get_snap_seq() ||
	  !pi->removed_snaps.contains(*p))
	return false;
    }
  }

 ignore:
  return true;
}
3546
// Fold the requested snap removals into the pending pools: add each snap
// to removed_snaps, advance snap_seq if needed, bump snap_epoch, and record
// it in pending_inc.new_removed_snaps.  Always returns true.
bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
  dout(7) << "prepare_remove_snaps " << *m << dendl;

  for (map<int, vector<snapid_t> >::iterator p = m->snaps.begin();
       p != m->snaps.end();
       ++p) {

    if (!osdmap.have_pg_pool(p->first)) {
      dout(10) << " ignoring removed_snaps " << p->second << " on non-existent pool " << p->first << dendl;
      continue;
    }

    pg_pool_t& pi = osdmap.pools[p->first];
    for (vector<snapid_t>::iterator q = p->second.begin();
	 q != p->second.end();
	 ++q) {
      // skip snaps already removed in the committed map or in the pending
      // copy of this pool
      if (!pi.removed_snaps.contains(*q) &&
	  (!pending_inc.new_pools.count(p->first) ||
	   !pending_inc.new_pools[p->first].removed_snaps.contains(*q))) {
	// get (or create) the pending copy of the pool and mutate that
	pg_pool_t *newpi = pending_inc.get_new_pool(p->first, &pi);
	newpi->removed_snaps.insert(*q);
	newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
	dout(10) << " pool " << p->first << " removed_snaps added " << *q
		 << " (now " << newpi->removed_snaps << ")" << dendl;
	// keep snap_seq monotonically >= every removed snap
	if (*q > newpi->get_snap_seq()) {
	  dout(10) << " pool " << p->first << " snap_seq "
		   << newpi->get_snap_seq() << " -> " << *q << dendl;
	  newpi->set_snap_seq(*q);
	}
	newpi->set_snap_epoch(pending_inc.epoch);
	pending_inc.new_removed_snaps[p->first].insert(*q);
      }
    }
  }
  return true;
}
3586
3587 // osd beacon
3588 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
3589 {
3590 op->mark_osdmon_event(__func__);
3591 // check caps
3592 auto session = op->get_session();
3593 mon->no_reply(op);
3594 if (!session) {
3595 dout(10) << __func__ << " no monitor session!" << dendl;
3596 return true;
3597 }
3598 if (!session->is_capable("osd", MON_CAP_X)) {
3599 derr << __func__ << " received from entity "
3600 << "with insufficient privileges " << session->caps << dendl;
3601 return true;
3602 }
3603 // Always forward the beacon to the leader, even if they are the same as
3604 // the old one. The leader will mark as down osds that haven't sent
3605 // beacon for a few minutes.
3606 return false;
3607 }
3608
// Process an osd beacon on the leader: refresh last_osd_report and
// osd_epochs, and feed the per-pg last-epoch-clean reports.  Returns false
// in every path -- a beacon never requires a map proposal by itself.
bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const auto beacon = static_cast<MOSDBeacon*>(op->get_req());
  const auto src = beacon->get_orig_source();
  dout(10) << __func__ << " " << *beacon
	   << " from " << src << dendl;
  int from = src.num();

  // ignore beacons from anything but the osd's current up incarnation
  if (!src.is_osd() ||
      !osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs())) {
    if (src.is_osd() && !osdmap.is_up(from)) {
      // share some new maps with this guy in case it may not be
      // aware of its own deadness...
      send_latest(op, beacon->version+1);
    }
    dout(1) << " ignoring beacon from non-active osd." << from << dendl;
    return false;
  }

  // refresh liveness bookkeeping for this osd
  last_osd_report[from] = ceph_clock_now();
  osd_epochs[from] = beacon->version;

  for (const auto& pg : beacon->pgs) {
    last_epoch_clean.report(pg, beacon->min_last_epoch_clean);
  }
  return false;
}
3638
3639 // ---------------
3640 // map helpers
3641
3642 void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
3643 {
3644 op->mark_osdmon_event(__func__);
3645 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
3646 << " start " << start << dendl;
3647 if (start == 0)
3648 send_full(op);
3649 else
3650 send_incremental(op, start);
3651 }
3652
3653
3654 MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
3655 {
3656 MOSDMap *r = new MOSDMap(mon->monmap->fsid, features);
3657 get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
3658 r->oldest_map = get_first_committed();
3659 r->newest_map = osdmap.get_epoch();
3660 return r;
3661 }
3662
// Build an MOSDMap carrying incremental maps for [from, to], encoded for
// 'features'.  If an incremental is missing for some epoch, the full map
// for that epoch is included instead; aborts if neither exists.
MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
{
  dout(10) << "build_incremental [" << from << ".." << to << "] with features "
	   << std::hex << features << std::dec << dendl;
  MOSDMap *m = new MOSDMap(mon->monmap->fsid, features);
  m->oldest_map = get_first_committed();
  m->newest_map = osdmap.get_epoch();

  // walk backwards; 'e > 0' guards against epoch_t underflow when from == 0
  for (epoch_t e = to; e >= from && e > 0; e--) {
    bufferlist bl;
    int err = get_version(e, features, bl);
    if (err == 0) {
      ceph_assert(bl.length());
      // if (get_version(e, bl) > 0) {
      dout(20) << "build_incremental    inc " << e << " "
	       << bl.length() << " bytes" << dendl;
      m->incremental_maps[e] = bl;
    } else {
      ceph_assert(err == -ENOENT);
      ceph_assert(!bl.length());
      // no incremental stored for this epoch; fall back to the full map
      get_version_full(e, features, bl);
      if (bl.length() > 0) {
      //else if (get_version("full", e, bl) > 0) {
	dout(20) << "build_incremental   full " << e << " "
		 << bl.length() << " bytes" << dendl;
	m->maps[e] = bl;
      } else {
	ceph_abort();  // we should have all maps.
      }
    }
  }
  return m;
}
3696
3697 void OSDMonitor::send_full(MonOpRequestRef op)
3698 {
3699 op->mark_osdmon_event(__func__);
3700 dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
3701 mon->send_reply(op, build_latest_full(op->get_session()->con_features));
3702 }
3703
3704 void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
3705 {
3706 op->mark_osdmon_event(__func__);
3707
3708 MonSession *s = op->get_session();
3709 ceph_assert(s);
3710
3711 if (s->proxy_con) {
3712 // oh, we can tell the other mon to do it
3713 dout(10) << __func__ << " asking proxying mon to send_incremental from "
3714 << first << dendl;
3715 MRoute *r = new MRoute(s->proxy_tid, NULL);
3716 r->send_osdmap_first = first;
3717 s->proxy_con->send_message(r);
3718 op->mark_event("reply: send routed send_osdmap_first reply");
3719 } else {
3720 // do it ourselves
3721 send_incremental(first, s, false, op);
3722 }
3723 }
3724
// Send incremental maps [first .. current] to a session, tracking the
// session's osd_epoch so we never resend.  When 'req' is set we send a
// single reply batch and return; otherwise we push message-sized batches
// directly on the session's connection (all of them unless 'onetime').
void OSDMonitor::send_incremental(epoch_t first,
				  MonSession *session,
				  bool onetime,
				  MonOpRequestRef req)
{
  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
	  << " to " << session->name << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = session->con_features ? session->con_features :
    mon->get_quorum_con_features();

  // skip epochs the session is already known to have
  if (first <= session->osd_epoch) {
    dout(10) << __func__ << " " << session->name << " should already have epoch "
	     << session->osd_epoch << dendl;
    first = session->osd_epoch + 1;
  }

  // requested range starts before our oldest stored map: start from a full
  // map at the oldest committed epoch instead
  if (first < get_first_committed()) {
    MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
    m->oldest_map = get_first_committed();
    m->newest_map = osdmap.get_epoch();

    // share removed snaps during the gap
    get_removed_snaps_range(first, m->oldest_map, &m->gap_removed_snaps);

    first = get_first_committed();
    bufferlist bl;
    int err = get_version_full(first, features, bl);
    ceph_assert(err == 0);
    ceph_assert(bl.length());
    dout(20) << "send_incremental starting with base full "
	     << first << " " << bl.length() << " bytes" << dendl;
    m->maps[first] = bl;

    if (req) {
      // reply mode: one message only, record progress and stop here
      mon->send_reply(req, m);
      session->osd_epoch = first;
      return;
    } else {
      session->con->send_message(m);
      session->osd_epoch = first;
    }
    first++;
  }

  while (first <= osdmap.get_epoch()) {
    // cap each message at osd_map_message_max epochs
    epoch_t last = std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1,
				     osdmap.get_epoch());
    MOSDMap *m = build_incremental(first, last, features);

    if (req) {
      // send some maps.  it may not be all of them, but it will get them
      // started.
      mon->send_reply(req, m);
    } else {
      session->con->send_message(m);
      first = last + 1;
    }
    session->osd_epoch = last;
    if (onetime || req)
      break;
  }
}
3790
3791 void OSDMonitor::get_removed_snaps_range(
3792 epoch_t start, epoch_t end,
3793 mempool::osdmap::map<int64_t,OSDMap::snap_interval_set_t> *gap_removed_snaps)
3794 {
3795 // we only care about pools that exist now.
3796 for (auto& p : osdmap.get_pools()) {
3797 auto& t = (*gap_removed_snaps)[p.first];
3798 for (epoch_t epoch = start; epoch < end; ++epoch) {
3799 string k = make_snap_epoch_key(p.first, epoch);
3800 bufferlist v;
3801 mon->store->get(OSD_SNAP_PREFIX, k, v);
3802 if (v.length()) {
3803 auto q = v.cbegin();
3804 OSDMap::snap_interval_set_t snaps;
3805 decode(snaps, q);
3806 t.union_of(snaps);
3807 }
3808 }
3809 dout(10) << __func__ << " " << p.first << " " << t << dendl;
3810 }
3811 }
3812
// Convenience wrapper: fetch the incremental map for 'ver' encoded with
// the quorum's connection features.
int OSDMonitor::get_version(version_t ver, bufferlist& bl)
{
  return get_version(ver, mon->get_quorum_con_features(), bl);
}
3817
// Re-encode an incremental map (in place in 'bl') for the given feature
// set, also re-encoding any embedded full map or crush map it carries.
void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
{
  OSDMap::Incremental inc;
  auto q = bl.cbegin();
  inc.decode(q);
  // always encode with subset of osdmap's canonical features
  uint64_t f = features & inc.encode_features;
  dout(20) << __func__ << " " << inc.epoch << " with features " << f
	   << dendl;
  bl.clear();
  if (inc.fullmap.length()) {
    // embedded full map?
    OSDMap m;
    m.decode(inc.fullmap);
    inc.fullmap.clear();
    m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
  }
  if (inc.crush.length()) {
    // embedded crush map
    CrushWrapper c;
    auto p = inc.crush.cbegin();
    c.decode(p);
    inc.crush.clear();
    c.encode(inc.crush, f);
  }
  inc.encode(bl, f | CEPH_FEATURE_RESERVED);
}
3845
3846 void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
3847 {
3848 OSDMap m;
3849 auto q = bl.cbegin();
3850 m.decode(q);
3851 // always encode with subset of osdmap's canonical features
3852 uint64_t f = features & m.get_encoding_features();
3853 dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
3854 << dendl;
3855 bl.clear();
3856 m.encode(bl, f | CEPH_FEATURE_RESERVED);
3857 }
3858
// Fetch the incremental map for 'ver' encoded for 'features', consulting
// (and populating) a cache keyed by the significant feature bits.
int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
{
  uint64_t significant_features = OSDMap::get_significant_features(features);
  if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
    return 0;
  }
  int ret = PaxosService::get_version(ver, bl);
  if (ret < 0) {
    return ret;
  }
  // NOTE: this check is imprecise; the OSDMap encoding features may
  // be a subset of the latest mon quorum features, but worst case we
  // reencode once and then cache the (identical) result under both
  // feature masks.
  if (significant_features !=
      OSDMap::get_significant_features(mon->get_quorum_con_features())) {
    reencode_incremental_map(bl, features);
  }
  inc_osd_cache.add({ver, significant_features}, bl);
  return 0;
}
3880
3881 int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc)
3882 {
3883 bufferlist inc_bl;
3884 int err = get_version(ver, inc_bl);
3885 ceph_assert(err == 0);
3886 ceph_assert(inc_bl.length());
3887
3888 auto p = inc_bl.cbegin();
3889 inc.decode(p);
3890 dout(10) << __func__ << " "
3891 << " epoch " << inc.epoch
3892 << " inc_crc " << inc.inc_crc
3893 << " full_crc " << inc.full_crc
3894 << " encode_features " << inc.encode_features << dendl;
3895 return 0;
3896 }
3897
// Rebuild the full osdmap for epoch 'ver' when the stored full map has
// been pruned: start from the closest pinned full map at or below 'ver'
// (or a newer cached full map), then replay incrementals up to 'ver'.
// The rebuilt map is encoded into 'bl'.  Returns 0 on success, or -ENOENT
// if no pinned map at or below 'ver' exists in the manifest.
int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl)
{
  dout(10) << __func__ << " ver " << ver << dendl;

  version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver);
  if (closest_pinned == 0) {
    return -ENOENT;
  }
  if (closest_pinned > ver) {
    // should not happen; dump the pinned set before the assert fires
    dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl;
  }
  ceph_assert(closest_pinned <= ver);

  dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl;

  // get osdmap incremental maps and apply on top of this one.
  // scan downward from ver-1 for a cached full map newer than the pinned
  // one so we have fewer incrementals to replay.
  bufferlist osdm_bl;
  bool has_cached_osdmap = false;
  for (version_t v = ver-1; v >= closest_pinned; --v) {
    if (full_osd_cache.lookup({v, mon->get_quorum_con_features()},
                                &osdm_bl)) {
      dout(10) << __func__ << " found map in cache ver " << v << dendl;
      closest_pinned = v;
      has_cached_osdmap = true;
      break;
    }
  }

  if (!has_cached_osdmap) {
    int err = PaxosService::get_version_full(closest_pinned, osdm_bl);
    if (err != 0) {
      derr << __func__ << " closest pinned map ver " << closest_pinned
           << " not available! error: " << cpp_strerror(err) << dendl;
    }
    ceph_assert(err == 0);
  }

  ceph_assert(osdm_bl.length());

  OSDMap osdm;
  osdm.decode(osdm_bl);

  dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned
           << " e" << osdm.epoch
           << " crc " << osdm.get_crc()
           << " -- applying incremental maps." << dendl;

  // remember the last incremental's encode features so the final full map
  // is encoded consistently with it.
  uint64_t encode_features = 0;
  for (version_t v = closest_pinned + 1; v <= ver; ++v) {
    dout(20) << __func__ << " applying inc epoch " << v << dendl;

    OSDMap::Incremental inc;
    int err = get_inc(v, inc);
    ceph_assert(err == 0);

    encode_features = inc.encode_features;

    err = osdm.apply_incremental(inc);
    ceph_assert(err == 0);

    // this block performs paranoid checks on map retrieval
    if (g_conf().get_val<bool>("mon_debug_extra_checks") &&
        inc.full_crc != 0) {

      uint64_t f = encode_features;
      if (!f) {
        f = (mon->quorum_con_features ? mon->quorum_con_features : -1);
      }

      // encode osdmap to force calculating crcs
      bufferlist tbl;
      osdm.encode(tbl, f | CEPH_FEATURE_RESERVED);
      // decode osdmap to compare crcs with what's expected by incremental
      OSDMap tosdm;
      tosdm.decode(tbl);

      if (tosdm.get_crc() != inc.full_crc) {
        derr << __func__
             << " osdmap crc mismatch! (osdmap crc " << tosdm.get_crc()
             << ", expected " << inc.full_crc << ")" << dendl;
        ceph_abort_msg("osdmap crc mismatch");
      }
    }

    // note: we cannot add the recently computed map to the cache, as is,
    // because we have not encoded the map into a bl.
  }

  if (!encode_features) {
    dout(10) << __func__
             << " last incremental map didn't have features;"
             << " defaulting to quorum's or all" << dendl;
    encode_features =
      (mon->quorum_con_features ? mon->quorum_con_features : -1);
  }
  osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED);

  return 0;
}
3997
3998 int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
3999 {
4000 return get_version_full(ver, mon->get_quorum_con_features(), bl);
4001 }
4002
// Return (in 'bl') the encoded full map for epoch 'ver', re-encoded if
// necessary so it fits within 'features'.  If the full map has been
// pruned from the store (-ENOENT), rebuild it from the pinned-map
// manifest.  Results are cached per (epoch, significant-feature-mask)
// pair in full_osd_cache.
int OSDMonitor::get_version_full(version_t ver, uint64_t features,
				 bufferlist& bl)
{
  uint64_t significant_features = OSDMap::get_significant_features(features);
  // fast path: already cached for this feature mask
  if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
    return 0;
  }
  int ret = PaxosService::get_version_full(ver, bl);
  if (ret == -ENOENT) {
    // build map?  the full map may have been pruned; reconstruct it from
    // the closest pinned map plus incrementals.
    ret = get_full_from_pinned_map(ver, bl);
  }
  if (ret < 0) {
    return ret;
  }
  // NOTE: this check is imprecise; the OSDMap encoding features may
  // be a subset of the latest mon quorum features, but worst case we
  // reencode once and then cache the (identical) result under both
  // feature masks.
  if (significant_features !=
      OSDMap::get_significant_features(mon->get_quorum_con_features())) {
    reencode_full_map(bl, features);
  }
  full_osd_cache.add({ver, significant_features}, bl);
  return 0;
}
4029
4030 epoch_t OSDMonitor::blacklist(const entity_addrvec_t& av, utime_t until)
4031 {
4032 dout(10) << "blacklist " << av << " until " << until << dendl;
4033 for (auto a : av.v) {
4034 if (osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS) {
4035 a.set_type(entity_addr_t::TYPE_ANY);
4036 } else {
4037 a.set_type(entity_addr_t::TYPE_LEGACY);
4038 }
4039 pending_inc.new_blacklist[a] = until;
4040 }
4041 return pending_inc.epoch;
4042 }
4043
4044 epoch_t OSDMonitor::blacklist(entity_addr_t a, utime_t until)
4045 {
4046 if (osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS) {
4047 a.set_type(entity_addr_t::TYPE_ANY);
4048 } else {
4049 a.set_type(entity_addr_t::TYPE_LEGACY);
4050 }
4051 dout(10) << "blacklist " << a << " until " << until << dendl;
4052 pending_inc.new_blacklist[a] = until;
4053 return pending_inc.epoch;
4054 }
4055
4056
// Walk all "osdmap" subscriptions and send each subscriber any maps it is
// missing.  No-op until we have a committed osdmap.
void OSDMonitor::check_osdmap_subs()
{
  dout(10) << __func__ << dendl;
  if (!osdmap.get_epoch()) {
    return;
  }
  auto osdmap_subs = mon->session_map.subs.find("osdmap");
  if (osdmap_subs == mon->session_map.subs.end()) {
    return;
  }
  auto p = osdmap_subs->second->begin();
  while (!p.end()) {
    auto sub = *p;
    // advance the iterator before servicing the sub: check_osdmap_sub()
    // may remove a onetime sub from this list, which would otherwise
    // invalidate 'p'.
    ++p;
    check_osdmap_sub(sub);
  }
}
4074
// Service a single "osdmap" subscription: if the subscriber is behind the
// current epoch, send it either incrementals from sub->next onward or
// (for next == 0) the latest full map.  Onetime subs are then removed;
// ongoing subs are advanced past the current epoch.
void OSDMonitor::check_osdmap_sub(Subscription *sub)
{
  dout(10) << __func__ << " " << sub << " next " << sub->next
	   << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
  if (sub->next <= osdmap.get_epoch()) {
    if (sub->next >= 1)
      send_incremental(sub->next, sub->session, sub->incremental_onetime);
    else
      // next == 0 means "just give me the latest"
      sub->session->con->send_message(build_latest_full(sub->session->con_features));
    if (sub->onetime)
      mon->session_map.remove_sub(sub);
    else
      sub->next = osdmap.get_epoch() + 1;
  }
}
4090
// Walk all "osd_pg_creates" subscriptions and push pending pg-create
// messages to the subscribed (up) OSDs.  No-op if no OSDs are up.
void OSDMonitor::check_pg_creates_subs()
{
  if (!osdmap.get_num_up_osds()) {
    return;
  }
  // stateful subscriptions are a prerequisite for this mechanism
  ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
  mon->with_session_map([this](const MonSessionMap& session_map) {
      auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
      if (pg_creates_subs == session_map.subs.end()) {
	return;
      }
      for (auto sub : *pg_creates_subs->second) {
	check_pg_creates_sub(sub);
      }
    });
}
4107
// Service a single "osd_pg_creates" subscription: send any queued
// pg-create messages to the subscribing OSD and advance sub->next to the
// epoch it is now current through.
void OSDMonitor::check_pg_creates_sub(Subscription *sub)
{
  dout(20) << __func__ << " .. " << sub->session->name << dendl;
  ceph_assert(sub->type == "osd_pg_creates");
  // only send these if the OSD is up. we will check_subs() when they do
  // come up so they will get the creates then.
  if (sub->session->name.is_osd() &&
      mon->osdmon()->osdmap.is_up(sub->session->name.num())) {
    sub->next = send_pg_creates(sub->session->name.num(),
				sub->session->con.get(),
				sub->next);
  }
}
4121
// Stage enabling an application on a pool in the pending incremental,
// optionally recording one key/value of application metadata.  Requires
// paxos to be plugged and the service writeable (caller's responsibility),
// and a luminous+ cluster.
//
// Note: std::map::insert() is used, so an already-present app_name entry
// (including its existing key/value metadata) is left untouched.
void OSDMonitor::do_application_enable(int64_t pool_id,
                                       const std::string &app_name,
				       const std::string &app_key,
				       const std::string &app_value)
{
  ceph_assert(paxos->is_plugged() && is_writeable());

  dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
           << dendl;

  ceph_assert(osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS);

  auto pp = osdmap.get_pg_pool(pool_id);
  ceph_assert(pp != nullptr);

  // start from any already-staged pending copy of the pool, else from the
  // committed map, so we do not clobber other pending changes.
  pg_pool_t p = *pp;
  if (pending_inc.new_pools.count(pool_id)) {
    p = pending_inc.new_pools[pool_id];
  }

  if (app_key.empty()) {
    p.application_metadata.insert({app_name, {}});
  } else {
    p.application_metadata.insert({app_name, {{app_key, app_value}}});
  }
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool_id] = p;
}
4150
4151 void OSDMonitor::do_set_pool_opt(int64_t pool_id,
4152 pool_opts_t::key_t opt,
4153 pool_opts_t::value_t val)
4154 {
4155 auto p = pending_inc.new_pools.try_emplace(
4156 pool_id, *osdmap.get_pg_pool(pool_id));
4157 p.first->second.opts.set(opt, val);
4158 }
4159
// Scan 'pools' for pools whose PGs still need to be created and queue
// them on 'creating_pgs'.  Pools already created, unchanged since the
// last scan, with a missing crush rule, or queued for removal are
// skipped.  Returns the number of pools queued.
unsigned OSDMonitor::scan_for_creating_pgs(
  const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
  const mempool::osdmap::set<int64_t>& removed_pools,
  utime_t modified,
  creating_pgs_t* creating_pgs) const
{
  unsigned queued = 0;
  for (auto& p : pools) {
    int64_t poolid = p.first;
    if (creating_pgs->created_pools.count(poolid)) {
      dout(10) << __func__ << " already created " << poolid << dendl;
      continue;
    }
    const pg_pool_t& pool = p.second;
    // pools whose crush rule cannot be resolved cannot be mapped, so
    // creating their PGs would be futile; skip them.
    int ruleno = osdmap.crush->find_rule(pool.get_crush_rule(),
					 pool.get_type(), pool.get_size());
    if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
      continue;

    const auto last_scan_epoch = creating_pgs->last_scan_epoch;
    const auto created = pool.get_last_change();
    if (last_scan_epoch && created <= last_scan_epoch) {
      dout(10) << __func__ << " no change in pool " << poolid
	       << " " << pool << dendl;
      continue;
    }
    if (removed_pools.count(poolid)) {
      dout(10) << __func__ << " pool is being removed: " << poolid
	       << " " << pool << dendl;
      continue;
    }
    dout(10) << __func__ << " queueing pool create for " << poolid
	     << " " << pool << dendl;
    creating_pgs->create_pool(poolid, pool.get_pg_num(),
			      created, modified);
    queued++;
  }
  return queued;
}
4199
// Rebuild creating_pgs_by_osd_epoch from creating_pgs: map each creating
// PG to its current acting primary, keyed by the epoch the create message
// should reference.  The epoch is bumped to the current mapping epoch
// whenever the PG's target OSD changed (or the PG is new), so the OSD
// sees a create message it has not already processed.
void OSDMonitor::update_creating_pgs()
{
  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
	   << creating_pgs.queue.size() << " pools in queue" << dendl;
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (const auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    if (!osdmap.pg_exists(pgid)) {
      dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
	       << dendl;
      continue;
    }
    auto mapped = pg.second.first;
    dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
    spg_t spgid(pgid);
    mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
	if (pgs.second.count(spgid)) {
	  if (last_acting_primary == acting_primary) {
	    // same target as before: keep the epoch we already sent for
	    mapped = pgs.first;
	  } else {
	    dout(20) << __func__ << " " << pgid << " "
		     << " acting_primary:" << last_acting_primary
		     << " -> " << acting_primary << dendl;
	    // note epoch if the target of the create message changed.
	    mapped = mapping.get_epoch();
	  }
	  break;
	} else {
	  // newly creating
	  // NOTE(review): this branch runs for every epoch-set that does
	  // not contain spgid, repeatedly resetting 'mapped'; apparently
	  // harmless since a later match overwrites it — confirm intent.
	  mapped = mapping.get_epoch();
	}
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
	     << " to create " << pgid << "@" << mapped << dendl;
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid);
  }
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
4247
// Send pending pg-create messages for osd.'osd' over 'con', starting from
// epoch 'next'.  Uses MOSDPGCreate2 for nautilus+ OSDs and the legacy
// MOSDPGCreate otherwise.  Returns the epoch the subscriber is current
// through (last sent + 1), or 'next' unchanged if nothing was sent.
//
// NOTE(review): the dout(30) below streams creating_pgs_by_osd_epoch
// before creating_pgs_lock is taken; presumably safe under the monitor's
// dispatch model — confirm if this can race with update_creating_pgs().
epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
{
  dout(30) << __func__ << " osd." << osd << " next=" << next
	   << " " << creating_pgs_by_osd_epoch << dendl;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
    dout(20) << __func__
	     << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
    // the subscribers will be updated when the mapping is completed anyway
    return next;
  }
  auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
  if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
    return next;
  ceph_assert(!creating_pgs_by_epoch->second.empty());

  MOSDPGCreate *oldm = nullptr; // for pre-mimic OSD compat
  MOSDPGCreate2 *m = nullptr;

  bool old = osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS;

  epoch_t last = 0;
  // only epochs >= 'next' are of interest; earlier ones were already sent
  for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
       epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
    auto epoch = epoch_pgs->first;
    auto& pgs = epoch_pgs->second;
    dout(20) << __func__ << " osd." << osd << " from " << next
             << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
    last = epoch;
    for (auto& pg : pgs) {
      // Need the create time from the monitor using its clock to set
      // last_scrub_stamp upon pg creation.
      auto create = creating_pgs.pgs.find(pg.pgid);
      ceph_assert(create != creating_pgs.pgs.end());
      if (old) {
	if (!oldm) {
	  oldm = new MOSDPGCreate(creating_pgs_epoch);
	}
	oldm->mkpg.emplace(pg.pgid,
			   pg_create_t{create->second.first, pg.pgid, 0});
	oldm->ctimes.emplace(pg.pgid, create->second.second);
      } else {
	if (!m) {
	  m = new MOSDPGCreate2(creating_pgs_epoch);
	}
	m->pgs.emplace(pg, create->second);
      }
      dout(20) << __func__ << " will create " << pg
	       << " at " << create->second.first << dendl;
    }
  }
  if (m) {
    con->send_message(m);
  } else if (oldm) {
    con->send_message(oldm);
  } else {
    dout(20) << __func__ << " osd." << osd << " from " << next
             << " has nothing to send" << dendl;
    return next;
  }

  // sub is current through last + 1
  return last + 1;
}
4312
4313 // TICK
4314
4315
// Periodic maintenance, driven by the monitor's tick timer.  On every
// monitor: refresh the osdmap manifest.  On the leader only: mark
// unresponsive OSDs down, auto-mark long-down OSDs out, expire blacklist
// entries, prune purged snaps and maps, update pool status, and propose
// a new map epoch if anything changed.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // always update osdmap manifest, regardless of being the leader.
  load_osdmap_manifest();

  // everything below mutates pending state, which only the leader owns
  if (!mon->is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  if (handle_osd_timeouts(now, last_osd_report)) {
    do_propose = true;
  }

  // mark osds down?
  if (check_failures(now)) {
    do_propose = true;
  }

  // Force a proposal if we need to prune; pruning is performed on
  // ``encode_pending()``, hence why we need to regularly trigger a proposal
  // even if there's nothing going on.
  if (is_prune_enabled() && should_prune()) {
    do_propose = true;
  }

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    string down_out_subtree_limit = g_conf().get_val<string>(
      "mon_osd_down_out_subtree_limit");
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      utime_t down = now;
      down -= i->second;  // how long osd.o has been down
      // advance the iterator now: the erase(o) at the bottom of the loop
      // removes the element we just looked at, which would otherwise
      // invalidate 'i'.
      ++i;

      if (osdmap.is_down(o) &&
	  osdmap.is_in(o) &&
	  can_mark_out(o)) {
	utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
	utime_t grace = orig_grace;
	double my_grace = 0.0;

	if (g_conf()->mon_osd_adjust_down_out_interval) {
	  // scale grace period the same way we do the heartbeat grace.
	  const osd_xinfo_t& xi = osdmap.get_xinfo(o);
	  double halflife = (double)g_conf()->mon_osd_laggy_halflife;
	  double decay_k = ::log(.5) / halflife;
	  double decay = exp((double)down * decay_k);
	  dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
		   << " down for " << down << " decay " << decay << dendl;
	  my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
	  grace += my_grace;
	}

	// is this an entire large subtree down?
	if (down_out_subtree_limit.length()) {
	  int type = osdmap.crush->get_type_id(down_out_subtree_limit);
	  if (type > 0) {
	    if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
	      dout(10) << "tick entire containing " << down_out_subtree_limit
		       << " subtree for osd." << o
		       << " is down; resetting timer" << dendl;
	      // reset timer, too.
	      down_pending_out[o] = now;
	      continue;
	    }
	  }
	}

	bool down_out = !osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace;
	bool destroyed_out = osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_destroyed_out_interval > 0 &&
	  // this is not precise enough as we did not make a note when this osd
	  // was marked as destroyed, but let's not bother with that
	  // complexity for now.
	  down.sec() >= g_conf()->mon_osd_destroyed_out_interval;
	if (down_out || destroyed_out) {
	  dout(10) << "tick marking osd." << o << " OUT after " << down
		   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
	  pending_inc.new_weight[o] = CEPH_OSD_OUT;

	  // set the AUTOOUT bit.
	  if (pending_inc.new_state.count(o) == 0)
	    pending_inc.new_state[o] = 0;
	  pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

	  // remember previous weight
	  if (pending_inc.new_xinfo.count(o) == 0)
	    pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

	  do_propose = true;

	  mon->clog->info() << "Marking osd." << o << " out (has been down for "
                            << int(down.sec()) << " seconds)";
	} else
	  continue;  // not out yet; keep it in down_pending_out
      }

      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blacklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
       p != osdmap.blacklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blacklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blacklist.push_back(p->first);
      do_propose = true;
    }
  }

  if (try_prune_purged_snaps()) {
    do_propose = true;
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty())  // also propose if we adjusted pg_temp
    propose_pending();
}
4458
// Mark down any up OSD that has not reported (beaconed) within
// mon_osd_report_timeout.  'last_osd_report' tracks the last time each
// OSD was heard from; entries for nonexistent OSDs are pruned.  Returns
// true if at least one OSD was newly marked down (caller should propose).
bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
				     std::map<int,utime_t> &last_osd_report)
{
  utime_t timeo(g_conf()->mon_osd_report_timeout, 0);
  if (now - mon->get_leader_since() < timeo) {
    // We haven't been the leader for long enough to consider OSD timeouts
    return false;
  }

  int max_osd = osdmap.get_max_osd();
  bool new_down = false;

  for (int i=0; i < max_osd; ++i) {
    dout(30) << __func__ << ": checking up on osd " << i << dendl;
    if (!osdmap.exists(i)) {
      last_osd_report.erase(i); // if any
      continue;
    }
    if (!osdmap.is_up(i))
      continue;
    const std::map<int,utime_t>::const_iterator t = last_osd_report.find(i);
    if (t == last_osd_report.end()) {
      // it wasn't in the map; start the timer.
      last_osd_report[i] = now;
    } else if (can_mark_down(i)) {
      utime_t diff = now - t->second;
      if (diff > timeo) {
	mon->clog->info() << "osd." << i << " marked down after no beacon for "
			  << diff << " seconds";
	derr << "no beacon from osd." << i << " since " << t->second
	     << ", " << diff << " seconds ago.  marking down" << dendl;
	// NOTE(review): setting CEPH_OSD_UP in new_state marks the osd
	// DOWN — incremental state bits appear to be XORed into the
	// current state on apply; confirm against OSDMap::Incremental.
	pending_inc.new_state[i] = CEPH_OSD_UP;
	new_down = true;
      }
    }
  }
  return new_down;
}
4497
4498 static void dump_cpu_list(Formatter *f, const char *name,
4499 const string& strlist)
4500 {
4501 cpu_set_t cpu_set;
4502 size_t cpu_set_size;
4503 if (parse_cpu_set_list(strlist.c_str(), &cpu_set_size, &cpu_set) < 0) {
4504 return;
4505 }
4506 set<int> cpus = cpu_set_to_set(cpu_set_size, &cpu_set);
4507 f->open_array_section(name);
4508 for (auto cpu : cpus) {
4509 f->dump_int("cpu", cpu);
4510 }
4511 f->close_section();
4512 }
4513
// Dump monitor-side OSD service state for debugging/inspection: the full
// osdmap, per-OSD metadata, committed version range, the crush map, and
// (when present) the osdmap manifest.
void OSDMonitor::dump_info(Formatter *f)
{
  f->open_object_section("osdmap");
  osdmap.dump(f);
  f->close_section();

  f->open_array_section("osd_metadata");
  for (int i=0; i<osdmap.get_max_osd(); ++i) {
    if (osdmap.exists(i)) {
      f->open_object_section("osd");
      f->dump_unsigned("id", i);
      // errors are ignored here (nullptr err stream): missing metadata
      // for one osd should not abort the dump
      dump_osd_metadata(i, f, NULL);
      f->close_section();
    }
  }
  f->close_section();

  f->dump_unsigned("osdmap_first_committed", get_first_committed());
  f->dump_unsigned("osdmap_last_committed", get_last_committed());

  f->open_object_section("crushmap");
  osdmap.crush->dump(f);
  f->close_section();

  if (has_osdmap_manifest) {
    f->open_object_section("osdmap_manifest");
    osdmap_manifest.dump(f);
    f->close_section();
  }
}
4544
namespace {
  // All properties understood by "osd pool get"; used to validate and
  // iterate requested pool attributes in preprocess_command().
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
    PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
    PG_AUTOSCALE_BIAS };

  // Return the elements of 'first' that are not present in 'second'
  // (plain set subtraction).
  std::set<osd_pool_get_choices>
  subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			     const std::set<osd_pool_get_choices>& second)
  {
    std::set<osd_pool_get_choices> result;
    for (const auto choice : first) {
      if (second.count(choice) == 0) {
	result.insert(choice);
      }
    }
    return result;
  }
}
4578
4579
4580 bool OSDMonitor::preprocess_command(MonOpRequestRef op)
4581 {
4582 op->mark_osdmon_event(__func__);
4583 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
4584 int r = 0;
4585 bufferlist rdata;
4586 stringstream ss, ds;
4587
4588 cmdmap_t cmdmap;
4589 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
4590 string rs = ss.str();
4591 mon->reply_command(op, -EINVAL, rs, get_last_committed());
4592 return true;
4593 }
4594
4595 MonSession *session = op->get_session();
4596 if (!session) {
4597 derr << __func__ << " no session" << dendl;
4598 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
4599 return true;
4600 }
4601
4602 string prefix;
4603 cmd_getval(cct, cmdmap, "prefix", prefix);
4604
4605 string format;
4606 cmd_getval(cct, cmdmap, "format", format, string("plain"));
4607 boost::scoped_ptr<Formatter> f(Formatter::create(format));
4608
4609 if (prefix == "osd stat") {
4610 osdmap.print_summary(f.get(), ds, "", true);
4611 if (f)
4612 f->flush(rdata);
4613 else
4614 rdata.append(ds);
4615 }
4616 else if (prefix == "osd dump" ||
4617 prefix == "osd tree" ||
4618 prefix == "osd tree-from" ||
4619 prefix == "osd ls" ||
4620 prefix == "osd getmap" ||
4621 prefix == "osd getcrushmap" ||
4622 prefix == "osd ls-tree") {
4623 string val;
4624
4625 epoch_t epoch = 0;
4626 int64_t epochnum;
4627 cmd_getval(cct, cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
4628 epoch = epochnum;
4629
4630 bufferlist osdmap_bl;
4631 int err = get_version_full(epoch, osdmap_bl);
4632 if (err == -ENOENT) {
4633 r = -ENOENT;
4634 ss << "there is no map for epoch " << epoch;
4635 goto reply;
4636 }
4637 ceph_assert(err == 0);
4638 ceph_assert(osdmap_bl.length());
4639
4640 OSDMap *p;
4641 if (epoch == osdmap.get_epoch()) {
4642 p = &osdmap;
4643 } else {
4644 p = new OSDMap;
4645 p->decode(osdmap_bl);
4646 }
4647
4648 auto sg = make_scope_guard([&] {
4649 if (p != &osdmap) {
4650 delete p;
4651 }
4652 });
4653
4654 if (prefix == "osd dump") {
4655 stringstream ds;
4656 if (f) {
4657 f->open_object_section("osdmap");
4658 p->dump(f.get());
4659 f->close_section();
4660 f->flush(ds);
4661 } else {
4662 p->print(ds);
4663 }
4664 rdata.append(ds);
4665 if (!f)
4666 ds << " ";
4667 } else if (prefix == "osd ls") {
4668 if (f) {
4669 f->open_array_section("osds");
4670 for (int i = 0; i < osdmap.get_max_osd(); i++) {
4671 if (osdmap.exists(i)) {
4672 f->dump_int("osd", i);
4673 }
4674 }
4675 f->close_section();
4676 f->flush(ds);
4677 } else {
4678 bool first = true;
4679 for (int i = 0; i < osdmap.get_max_osd(); i++) {
4680 if (osdmap.exists(i)) {
4681 if (!first)
4682 ds << "\n";
4683 first = false;
4684 ds << i;
4685 }
4686 }
4687 }
4688 rdata.append(ds);
4689 } else if (prefix == "osd tree" || prefix == "osd tree-from") {
4690 string bucket;
4691 if (prefix == "osd tree-from") {
4692 cmd_getval(cct, cmdmap, "bucket", bucket);
4693 if (!osdmap.crush->name_exists(bucket)) {
4694 ss << "bucket '" << bucket << "' does not exist";
4695 r = -ENOENT;
4696 goto reply;
4697 }
4698 int id = osdmap.crush->get_item_id(bucket);
4699 if (id >= 0) {
4700 ss << "\"" << bucket << "\" is not a bucket";
4701 r = -EINVAL;
4702 goto reply;
4703 }
4704 }
4705
4706 vector<string> states;
4707 cmd_getval(cct, cmdmap, "states", states);
4708 unsigned filter = 0;
4709 for (auto& s : states) {
4710 if (s == "up") {
4711 filter |= OSDMap::DUMP_UP;
4712 } else if (s == "down") {
4713 filter |= OSDMap::DUMP_DOWN;
4714 } else if (s == "in") {
4715 filter |= OSDMap::DUMP_IN;
4716 } else if (s == "out") {
4717 filter |= OSDMap::DUMP_OUT;
4718 } else if (s == "destroyed") {
4719 filter |= OSDMap::DUMP_DESTROYED;
4720 } else {
4721 ss << "unrecognized state '" << s << "'";
4722 r = -EINVAL;
4723 goto reply;
4724 }
4725 }
4726 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
4727 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
4728 ss << "cannot specify both 'in' and 'out'";
4729 r = -EINVAL;
4730 goto reply;
4731 }
4732 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
4733 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
4734 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
4735 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
4736 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
4737 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
4738 ss << "can specify only one of 'up', 'down' and 'destroyed'";
4739 r = -EINVAL;
4740 goto reply;
4741 }
4742 if (f) {
4743 f->open_object_section("tree");
4744 p->print_tree(f.get(), NULL, filter, bucket);
4745 f->close_section();
4746 f->flush(ds);
4747 } else {
4748 p->print_tree(NULL, &ds, filter, bucket);
4749 }
4750 rdata.append(ds);
4751 } else if (prefix == "osd getmap") {
4752 rdata.append(osdmap_bl);
4753 ss << "got osdmap epoch " << p->get_epoch();
4754 } else if (prefix == "osd getcrushmap") {
4755 p->crush->encode(rdata, mon->get_quorum_con_features());
4756 ss << p->get_crush_version();
4757 } else if (prefix == "osd ls-tree") {
4758 string bucket_name;
4759 cmd_getval(cct, cmdmap, "name", bucket_name);
4760 set<int> osds;
4761 r = p->get_osds_by_bucket_name(bucket_name, &osds);
4762 if (r == -ENOENT) {
4763 ss << "\"" << bucket_name << "\" does not exist";
4764 goto reply;
4765 } else if (r < 0) {
4766 ss << "can not parse bucket name:\"" << bucket_name << "\"";
4767 goto reply;
4768 }
4769
4770 if (f) {
4771 f->open_array_section("osds");
4772 for (auto &i : osds) {
4773 if (osdmap.exists(i)) {
4774 f->dump_int("osd", i);
4775 }
4776 }
4777 f->close_section();
4778 f->flush(ds);
4779 } else {
4780 bool first = true;
4781 for (auto &i : osds) {
4782 if (osdmap.exists(i)) {
4783 if (!first)
4784 ds << "\n";
4785 first = false;
4786 ds << i;
4787 }
4788 }
4789 }
4790
4791 rdata.append(ds);
4792 }
4793 } else if (prefix == "osd getmaxosd") {
4794 if (f) {
4795 f->open_object_section("getmaxosd");
4796 f->dump_unsigned("epoch", osdmap.get_epoch());
4797 f->dump_int("max_osd", osdmap.get_max_osd());
4798 f->close_section();
4799 f->flush(rdata);
4800 } else {
4801 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
4802 rdata.append(ds);
4803 }
4804 } else if (prefix == "osd utilization") {
4805 string out;
4806 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
4807 if (f)
4808 f->flush(rdata);
4809 else
4810 rdata.append(out);
4811 r = 0;
4812 goto reply;
4813 } else if (prefix == "osd find") {
4814 int64_t osd;
4815 if (!cmd_getval(cct, cmdmap, "id", osd)) {
4816 ss << "unable to parse osd id value '"
4817 << cmd_vartype_stringify(cmdmap["id"]) << "'";
4818 r = -EINVAL;
4819 goto reply;
4820 }
4821 if (!osdmap.exists(osd)) {
4822 ss << "osd." << osd << " does not exist";
4823 r = -ENOENT;
4824 goto reply;
4825 }
4826 string format;
4827 cmd_getval(cct, cmdmap, "format", format);
4828 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4829 f->open_object_section("osd_location");
4830 f->dump_int("osd", osd);
4831 f->dump_object("addrs", osdmap.get_addrs(osd));
4832 f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
4833
4834 // try to identify host, pod/container name, etc.
4835 map<string,string> m;
4836 load_metadata(osd, m, nullptr);
4837 if (auto p = m.find("hostname"); p != m.end()) {
4838 f->dump_string("host", p->second);
4839 }
4840 for (auto& k : {
4841 "pod_name", "pod_namespace", // set by rook
4842 "container_name" // set by ceph-ansible
4843 }) {
4844 if (auto p = m.find(k); p != m.end()) {
4845 f->dump_string(k, p->second);
4846 }
4847 }
4848
4849 // crush is helpful too
4850 f->open_object_section("crush_location");
4851 map<string,string> loc = osdmap.crush->get_full_location(osd);
4852 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
4853 f->dump_string(p->first.c_str(), p->second);
4854 f->close_section();
4855 f->close_section();
4856 f->flush(rdata);
} else if (prefix == "osd metadata") {
  // Dump collected daemon metadata, either for a single OSD (when an
  // "id" argument is supplied) or for every OSD in the map.
  int64_t osd = -1;
  if (cmd_vartype_stringify(cmdmap["id"]).size() &&
      !cmd_getval(cct, cmdmap, "id", osd)) {
    ss << "unable to parse osd id value '"
       << cmd_vartype_stringify(cmdmap["id"]) << "'";
    r = -EINVAL;
    goto reply;
  }
  if (osd >= 0 && !osdmap.exists(osd)) {
    ss << "osd." << osd << " does not exist";
    r = -ENOENT;
    goto reply;
  }
  string format;
  cmd_getval(cct, cmdmap, "format", format);
  // note: this local formatter shadows the function-level 'f'
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
  if (osd >= 0) {
    // single OSD: a metadata error is returned to the caller
    f->open_object_section("osd_metadata");
    f->dump_unsigned("id", osd);
    r = dump_osd_metadata(osd, f.get(), &ss);
    if (r < 0)
      goto reply;
    f->close_section();
  } else {
    // all OSDs: missing/invalid metadata for a single daemon is logged
    // and skipped so the remaining daemons are still reported
    r = 0;
    f->open_array_section("osd_metadata");
    for (int i=0; i<osdmap.get_max_osd(); ++i) {
      if (osdmap.exists(i)) {
        f->open_object_section("osd");
        f->dump_unsigned("id", i);
        r = dump_osd_metadata(i, f.get(), NULL);
        if (r == -EINVAL || r == -ENOENT) {
          // Drop error, continue to get other daemons' metadata
          dout(4) << "No metadata for osd." << i << dendl;
          r = 0;
        } else if (r < 0) {
          // Unexpected error
          goto reply;
        }
        f->close_section();
      }
    }
    f->close_section();
  }
  f->flush(rdata);
} else if (prefix == "osd versions") {
  // Histogram of the "ceph_version" metadata value across all OSDs.
  if (!f)
    f.reset(Formatter::create("json-pretty"));
  count_metadata("ceph_version", f.get());
  f->flush(rdata);
  r = 0;
} else if (prefix == "osd count-metadata") {
  // Histogram of an arbitrary metadata key ("property") across all OSDs.
  if (!f)
    f.reset(Formatter::create("json-pretty"));
  string field;
  cmd_getval(cct, cmdmap, "property", field);
  count_metadata(field, f.get());
  f->flush(rdata);
  r = 0;
} else if (prefix == "osd numa-status") {
  // Report per-OSD NUMA placement (network node, storage node, CPU
  // affinity) as a table by default or structured output via 'f'.
  TextTable tbl;
  if (f) {
    f->open_array_section("osds");
  } else {
    tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT);
    tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT);
    tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT);
    tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT);
    tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT);
    tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT);
  }
  for (int i=0; i<osdmap.get_max_osd(); ++i) {
    if (osdmap.exists(i)) {
      map<string,string> m;
      ostringstream err;
      if (load_metadata(i, m, &err) < 0) {
        // no stored metadata for this osd; skip it
        continue;
      }
      string host;
      auto p = m.find("hostname");
      if (p != m.end()) {
        host = p->second;
      }
      if (f) {
        f->open_object_section("osd");
        f->dump_int("osd", i);
        f->dump_string("host", host);
        // single-valued numa fields
        for (auto n : { "network_numa_node", "objectstore_numa_node",
                        "numa_node" }) {
          p = m.find(n);
          if (p != m.end()) {
            f->dump_int(n, atoi(p->second.c_str()));
          }
        }
        // comma-separated multi-node fields
        for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) {
          p = m.find(n);
          if (p != m.end()) {
            list<string> ls = get_str_list(p->second, ",");
            f->open_array_section(n);
            for (auto node : ls) {
              f->dump_int("node", atoi(node.c_str()));
            }
            f->close_section();
          }
        }
        for (auto n : { "numa_node_cpus" }) {
          p = m.find(n);
          if (p != m.end()) {
            dump_cpu_list(f.get(), n, p->second);
          }
        }
        f->close_section();
      } else {
        tbl << i;
        tbl << host;
        p = m.find("network_numa_nodes");
        if (p != m.end()) {
          tbl << p->second;
        } else {
          tbl << "-";
        }
        p = m.find("objectstore_numa_nodes");
        if (p != m.end()) {
          tbl << p->second;
        } else {
          tbl << "-";
        }
        // AFFINITY and CPUS are only shown when both values are known
        p = m.find("numa_node");
        auto q = m.find("numa_node_cpus");
        if (p != m.end() && q != m.end()) {
          tbl << p->second;
          tbl << q->second;
        } else {
          tbl << "-";
          tbl << "-";
        }
        tbl << TextTable::endrow;
      }
    }
  }
  if (f) {
    f->close_section();
    f->flush(rdata);
  } else {
    rdata.append(stringify(tbl));
  }
} else if (prefix == "osd map") {
  // Compute placement of a (pool, object[, namespace]) triple: raw
  // pgid, mapped pgid, and the up/acting OSD sets with primaries.
  string poolstr, objstr, namespacestr;
  cmd_getval(cct, cmdmap, "pool", poolstr);
  cmd_getval(cct, cmdmap, "object", objstr);
  cmd_getval(cct, cmdmap, "nspace", namespacestr);

  int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
  if (pool < 0) {
    ss << "pool " << poolstr << " does not exist";
    r = -ENOENT;
    goto reply;
  }
  object_locator_t oloc(pool, namespacestr);
  object_t oid(objstr);
  pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
  pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
  vector<int> up, acting;
  int up_p, acting_p;
  osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);

  // user-facing object name includes the namespace prefix, if any
  string fullobjname;
  if (!namespacestr.empty())
    fullobjname = namespacestr + string("/") + oid.name;
  else
    fullobjname = oid.name;
  if (f) {
    f->open_object_section("osd_map");
    f->dump_unsigned("epoch", osdmap.get_epoch());
    f->dump_string("pool", poolstr);
    f->dump_int("pool_id", pool);
    f->dump_stream("objname") << fullobjname;
    f->dump_stream("raw_pgid") << pgid;
    f->dump_stream("pgid") << mpgid;
    f->open_array_section("up");
    for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
      f->dump_int("osd", *p);
    f->close_section();
    f->dump_int("up_primary", up_p);
    f->open_array_section("acting");
    for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
      f->dump_int("osd", *p);
    f->close_section();
    f->dump_int("acting_primary", acting_p);
    f->close_section(); // osd_map
    f->flush(rdata);
  } else {
    ds << "osdmap e" << osdmap.get_epoch()
       << " pool '" << poolstr << "' (" << pool << ")"
       << " object '" << fullobjname << "' ->"
       << " pg " << pgid << " (" << mpgid << ")"
       << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
       << pg_vector_string(acting) << ", p" << acting_p << ")";
    rdata.append(ds);
  }

} else if (prefix == "pg map") {
  // Map an explicit pgid to its up/acting OSD sets.
  pg_t pgid;
  string pgidstr;
  cmd_getval(cct, cmdmap, "pgid", pgidstr);
  if (!pgid.parse(pgidstr.c_str())) {
    ss << "invalid pgid '" << pgidstr << "'";
    r = -EINVAL;
    goto reply;
  }
  vector<int> up, acting;
  if (!osdmap.have_pg_pool(pgid.pool())) {
    ss << "pg '" << pgidstr << "' does not exist";
    r = -ENOENT;
    goto reply;
  }
  pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
  // NOTE(review): the lookup uses the raw pgid, not mpgid, whereas the
  // "osd map" branch above maps mpgid — confirm the asymmetry is
  // intentional.
  osdmap.pg_to_up_acting_osds(pgid, up, acting);
  if (f) {
    f->open_object_section("pg_map");
    f->dump_unsigned("epoch", osdmap.get_epoch());
    f->dump_stream("raw_pgid") << pgid;
    f->dump_stream("pgid") << mpgid;
    f->open_array_section("up");
    for (auto osd : up) {
      f->dump_int("up_osd", osd);
    }
    f->close_section();
    f->open_array_section("acting");
    for (auto osd : acting) {
      f->dump_int("acting_osd", osd);
    }
    f->close_section();
    f->close_section();
    f->flush(rdata);
  } else {
    ds << "osdmap e" << osdmap.get_epoch()
       << " pg " << pgid << " (" << mpgid << ")"
       << " -> up " << up << " acting " << acting;
    rdata.append(ds);
  }
  goto reply;

} else if (prefix == "osd lspools") {
  // List pool ids and names; one "<id> <name>" per line in plain text.
  if (f)
    f->open_array_section("pools");
  for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
       p != osdmap.pools.end();
       ++p) {
    if (f) {
      f->open_object_section("pool");
      f->dump_int("poolnum", p->first);
      f->dump_string("poolname", osdmap.pool_name[p->first]);
      f->close_section();
    } else {
      ds << p->first << ' ' << osdmap.pool_name[p->first];
      // no trailing newline after the last entry
      if (next(p) != osdmap.pools.end()) {
        ds << '\n';
      }
    }
  }
  if (f) {
    f->close_section();
    f->flush(ds);
  }
  rdata.append(ds);
} else if (prefix == "osd blacklist ls") {
  // List blacklisted client addresses with their expiry times.
  if (f)
    f->open_array_section("blacklist");

  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
       p != osdmap.blacklist.end();
       ++p) {
    if (f) {
      f->open_object_section("entry");
      f->dump_string("addr", p->first.get_legacy_str());
      f->dump_stream("until") << p->second;
      f->close_section();
    } else {
      // note: this local ss shadows the outer status stream
      stringstream ss;
      string s;
      ss << p->first << " " << p->second;
      getline(ss, s);
      s += "\n";
      rdata.append(s);
    }
  }
  if (f) {
    f->close_section();
    f->flush(rdata);
  }
  // summary line goes to the (outer) status stream in both modes
  ss << "listed " << osdmap.blacklist.size() << " entries";

} else if (prefix == "osd pool ls") {
  // List pool names; with "detail", dump full pool definitions.
  string detail;
  cmd_getval(cct, cmdmap, "detail", detail);
  if (!f && detail == "detail") {
    // plain-text detail has its own pretty-printer
    ostringstream ss;
    osdmap.print_pools(ss);
    rdata.append(ss.str());
  } else {
    if (f)
      f->open_array_section("pools");
    for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
         it != osdmap.get_pools().end();
         ++it) {
      if (f) {
        if (detail == "detail") {
          f->open_object_section("pool");
          f->dump_string("pool_name", osdmap.get_pool_name(it->first));
          it->second.dump(f.get());
          f->close_section();
        } else {
          f->dump_string("pool_name", osdmap.get_pool_name(it->first));
        }
      } else {
        rdata.append(osdmap.get_pool_name(it->first) + "\n");
      }
    }
    if (f) {
      f->close_section();
      f->flush(rdata);
    }
  }

} else if (prefix == "osd crush get-tunable") {
  // Report the value of a single CRUSH tunable.  Only
  // "straw_calc_version" is readable through this command today.
  string tunable;
  cmd_getval(cct, cmdmap, "tunable", tunable);
  ostringstream rss;
  if (f)
    f->open_object_section("tunable");
  if (tunable == "straw_calc_version") {
    if (f)
      f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
    else
      rss << osdmap.crush->get_straw_calc_version() << "\n";
  } else {
    // Previously this returned EINVAL with an empty message; name the
    // tunable that was not recognized so the caller can tell why.
    ss << "unrecognized tunable '" << tunable << "'";
    r = -EINVAL;
    goto reply;
  }
  if (f) {
    f->close_section();
    f->flush(rdata);
  } else {
    rdata.append(rss.str());
  }
  r = 0;

} else if (prefix == "osd pool get") {
  // Report one pool property ("var") or every applicable property
  // ("all") for the named pool.
  string poolstr;
  cmd_getval(cct, cmdmap, "pool", poolstr);
  int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
  if (pool < 0) {
    ss << "unrecognized pool '" << poolstr << "'";
    r = -ENOENT;
    goto reply;
  }

  const pg_pool_t *p = osdmap.get_pg_pool(pool);
  string var;
  cmd_getval(cct, cmdmap, "var", var);

  // map from user-visible property name to the internal enum value
  typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
  const choices_map_t ALL_CHOICES = {
    {"size", SIZE},
    {"min_size", MIN_SIZE},
    {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
    {"crush_rule", CRUSH_RULE}, {"hashpspool", HASHPSPOOL},
    {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
    {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
    {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
    {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
    {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
    {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
    {"use_gmt_hitset", USE_GMT_HITSET},
    {"target_max_objects", TARGET_MAX_OBJECTS},
    {"target_max_bytes", TARGET_MAX_BYTES},
    {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
    {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
    {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
    {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
    {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
    {"erasure_code_profile", ERASURE_CODE_PROFILE},
    {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
    {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
    {"fast_read", FAST_READ},
    {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
    {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
    {"scrub_min_interval", SCRUB_MIN_INTERVAL},
    {"scrub_max_interval", SCRUB_MAX_INTERVAL},
    {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
    {"recovery_priority", RECOVERY_PRIORITY},
    {"recovery_op_priority", RECOVERY_OP_PRIORITY},
    {"scrub_priority", SCRUB_PRIORITY},
    {"compression_mode", COMPRESSION_MODE},
    {"compression_algorithm", COMPRESSION_ALGORITHM},
    {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
    {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
    {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
    {"csum_type", CSUM_TYPE},
    {"csum_max_block", CSUM_MAX_BLOCK},
    {"csum_min_block", CSUM_MIN_BLOCK},
    {"fingerprint_algorithm", FINGERPRINT_ALGORITHM},
    {"pg_autoscale_mode", PG_AUTOSCALE_MODE},
    {"pg_num_min", PG_NUM_MIN},
    {"target_size_bytes", TARGET_SIZE_BYTES},
    {"target_size_ratio", TARGET_SIZE_RATIO},
    {"pg_autoscale_bias", PG_AUTOSCALE_BIAS},
  };

  typedef std::set<osd_pool_get_choices> choices_set_t;

  // properties that only apply to cache-tier pools ...
  const choices_set_t ONLY_TIER_CHOICES = {
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
  };
  // ... and only to erasure-coded pools
  const choices_set_t ONLY_ERASURE_CHOICES = {
    EC_OVERWRITES, ERASURE_CODE_PROFILE
  };

  choices_set_t selected_choices;
  if (var == "all") {
    // start with everything, then strip what does not apply to this pool
    for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
        it != ALL_CHOICES.end(); ++it) {
      selected_choices.insert(it->second);
    }

    if(!p->is_tier()) {
      selected_choices = subtract_second_from_first(selected_choices,
                                                    ONLY_TIER_CHOICES);
    }

    if(!p->is_erasure()) {
      selected_choices = subtract_second_from_first(selected_choices,
                                                    ONLY_ERASURE_CHOICES);
    }
} else /* var != "all" */ {
  // Single property: validate the name before dereferencing the lookup
  // result.  The command schema is expected to restrict "var", but
  // dereferencing end() here would be undefined behavior if an
  // unrecognized name ever slipped through.
  choices_map_t::const_iterator found = ALL_CHOICES.find(var);
  if (found == ALL_CHOICES.end()) {
    ss << "invalid variable '" << var << "'";
    r = -EINVAL;
    goto reply;
  }
  osd_pool_get_choices selected = found->second;

  // tier-only properties are rejected on non-tier pools
  if (!p->is_tier() &&
      ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
    ss << "pool '" << poolstr
       << "' is not a tier pool: variable not applicable";
    r = -EACCES;
    goto reply;
  }

  // erasure-only properties are rejected on non-erasure pools
  if (!p->is_erasure() &&
      ONLY_ERASURE_CHOICES.find(selected)
      != ONLY_ERASURE_CHOICES.end()) {
    ss << "pool '" << poolstr
       << "' is not a erasure pool: variable not applicable";
    r = -EACCES;
    goto reply;
  }

  // an optional pool option that is simply unset is reported as ENOENT
  if (pool_opts_t::is_opt_name(var) &&
      !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
    ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
    r = -ENOENT;
    goto reply;
  }

  selected_choices.insert(selected);
}
5329
if (f) {
  // structured output: one section per pool, one entry per selected
  // property
  f->open_object_section("pool");
  f->dump_string("pool", poolstr);
  f->dump_int("pool_id", pool);
  for(choices_set_t::const_iterator it = selected_choices.begin();
      it != selected_choices.end(); ++it) {
    // reverse-map the enum back to its user-visible name for dump keys
    choices_map_t::const_iterator i;
    for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
      if (i->second == *it) {
        break;
      }
    }
    ceph_assert(i != ALL_CHOICES.end());
    switch(*it) {
      case PG_NUM:
        f->dump_int("pg_num", p->get_pg_num());
        break;
      case PGP_NUM:
        f->dump_int("pgp_num", p->get_pgp_num());
        break;
      case SIZE:
        f->dump_int("size", p->get_size());
        break;
      case MIN_SIZE:
        f->dump_int("min_size", p->get_min_size());
        break;
      case CRUSH_RULE:
        // fall back to the numeric id if the rule has no name
        if (osdmap.crush->rule_exists(p->get_crush_rule())) {
          f->dump_string("crush_rule", osdmap.crush->get_rule_name(
                           p->get_crush_rule()));
        } else {
          f->dump_string("crush_rule", stringify(p->get_crush_rule()));
        }
        break;
      case EC_OVERWRITES:
        f->dump_bool("allow_ec_overwrites",
                     p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
        break;
      case PG_AUTOSCALE_MODE:
        f->dump_string("pg_autoscale_mode",
                       pg_pool_t::get_pg_autoscale_mode_name(
                         p->pg_autoscale_mode));
        break;
      // boolean pool flags share one dump path keyed by the name
      case HASHPSPOOL:
      case NODELETE:
      case NOPGCHANGE:
      case NOSIZECHANGE:
      case WRITE_FADVISE_DONTNEED:
      case NOSCRUB:
      case NODEEP_SCRUB:
        f->dump_bool(i->first.c_str(),
                     p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
        break;
      case HIT_SET_PERIOD:
        f->dump_int("hit_set_period", p->hit_set_period);
        break;
      case HIT_SET_COUNT:
        f->dump_int("hit_set_count", p->hit_set_count);
        break;
      case HIT_SET_TYPE:
        f->dump_string("hit_set_type",
                       HitSet::get_type_name(p->hit_set_params.get_type()));
        break;
      case HIT_SET_FPP:
        {
          // fpp only exists for bloom hit sets; when explicitly
          // requested on another type, that is an error (for "all" it
          // is silently skipped)
          if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
            BloomHitSet::Params *bloomp =
              static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
            f->dump_float("hit_set_fpp", bloomp->get_fpp());
          } else if(var != "all") {
            f->close_section();
            ss << "hit set is not of type Bloom; " <<
              "invalid to get a false positive rate!";
            r = -EINVAL;
            goto reply;
          }
        }
        break;
      case USE_GMT_HITSET:
        f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
        break;
      case TARGET_MAX_OBJECTS:
        f->dump_unsigned("target_max_objects", p->target_max_objects);
        break;
      case TARGET_MAX_BYTES:
        f->dump_unsigned("target_max_bytes", p->target_max_bytes);
        break;
      // ratios are stored in micro units; dump both representations
      case CACHE_TARGET_DIRTY_RATIO:
        f->dump_unsigned("cache_target_dirty_ratio_micro",
                         p->cache_target_dirty_ratio_micro);
        f->dump_float("cache_target_dirty_ratio",
                      ((float)p->cache_target_dirty_ratio_micro/1000000));
        break;
      case CACHE_TARGET_DIRTY_HIGH_RATIO:
        f->dump_unsigned("cache_target_dirty_high_ratio_micro",
                         p->cache_target_dirty_high_ratio_micro);
        f->dump_float("cache_target_dirty_high_ratio",
                      ((float)p->cache_target_dirty_high_ratio_micro/1000000));
        break;
      case CACHE_TARGET_FULL_RATIO:
        f->dump_unsigned("cache_target_full_ratio_micro",
                         p->cache_target_full_ratio_micro);
        f->dump_float("cache_target_full_ratio",
                      ((float)p->cache_target_full_ratio_micro/1000000));
        break;
      case CACHE_MIN_FLUSH_AGE:
        f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
        break;
      case CACHE_MIN_EVICT_AGE:
        f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
        break;
      case ERASURE_CODE_PROFILE:
        f->dump_string("erasure_code_profile", p->erasure_code_profile);
        break;
      case MIN_READ_RECENCY_FOR_PROMOTE:
        f->dump_int("min_read_recency_for_promote",
                    p->min_read_recency_for_promote);
        break;
      case MIN_WRITE_RECENCY_FOR_PROMOTE:
        f->dump_int("min_write_recency_for_promote",
                    p->min_write_recency_for_promote);
        break;
      case FAST_READ:
        f->dump_int("fast_read", p->fast_read);
        break;
      case HIT_SET_GRADE_DECAY_RATE:
        f->dump_int("hit_set_grade_decay_rate",
                    p->hit_set_grade_decay_rate);
        break;
      case HIT_SET_SEARCH_LAST_N:
        f->dump_int("hit_set_search_last_n",
                    p->hit_set_search_last_n);
        break;
      // optional pool options: dumped only when actually set
      case SCRUB_MIN_INTERVAL:
      case SCRUB_MAX_INTERVAL:
      case DEEP_SCRUB_INTERVAL:
      case RECOVERY_PRIORITY:
      case RECOVERY_OP_PRIORITY:
      case SCRUB_PRIORITY:
      case COMPRESSION_MODE:
      case COMPRESSION_ALGORITHM:
      case COMPRESSION_REQUIRED_RATIO:
      case COMPRESSION_MAX_BLOB_SIZE:
      case COMPRESSION_MIN_BLOB_SIZE:
      case CSUM_TYPE:
      case CSUM_MAX_BLOCK:
      case CSUM_MIN_BLOCK:
      case FINGERPRINT_ALGORITHM:
      case PG_NUM_MIN:
      case TARGET_SIZE_BYTES:
      case TARGET_SIZE_RATIO:
      case PG_AUTOSCALE_BIAS:
        pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
        if (p->opts.is_set(key)) {
          if(*it == CSUM_TYPE) {
            // csum_type is stored as an int but shown symbolically
            int64_t val;
            p->opts.get(pool_opts_t::CSUM_TYPE, &val);
            f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
          } else {
            p->opts.dump(i->first, f.get());
          }
        }
        break;
    }
  }
  f->close_section();
  f->flush(rdata);
} else /* !f */ {
  // plain-text output: one "name: value" line per selected property,
  // appended to rdata via the status stream
  for(choices_set_t::const_iterator it = selected_choices.begin();
      it != selected_choices.end(); ++it) {
    choices_map_t::const_iterator i;
    switch(*it) {
      case PG_NUM:
        ss << "pg_num: " << p->get_pg_num() << "\n";
        break;
      case PGP_NUM:
        ss << "pgp_num: " << p->get_pgp_num() << "\n";
        break;
      case SIZE:
        ss << "size: " << p->get_size() << "\n";
        break;
      case MIN_SIZE:
        ss << "min_size: " << p->get_min_size() << "\n";
        break;
      case CRUSH_RULE:
        // fall back to the numeric id if the rule has no name
        if (osdmap.crush->rule_exists(p->get_crush_rule())) {
          ss << "crush_rule: " << osdmap.crush->get_rule_name(
            p->get_crush_rule()) << "\n";
        } else {
          ss << "crush_rule: " << p->get_crush_rule() << "\n";
        }
        break;
      case PG_AUTOSCALE_MODE:
        ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
          p->pg_autoscale_mode) <<"\n";
        break;
      case HIT_SET_PERIOD:
        ss << "hit_set_period: " << p->hit_set_period << "\n";
        break;
      case HIT_SET_COUNT:
        ss << "hit_set_count: " << p->hit_set_count << "\n";
        break;
      case HIT_SET_TYPE:
        ss << "hit_set_type: " <<
          HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
        break;
      case HIT_SET_FPP:
        {
          // fpp only exists for bloom hit sets; explicit requests on
          // another type are an error ("all" silently skips it)
          if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
            BloomHitSet::Params *bloomp =
              static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
            ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
          } else if(var != "all") {
            ss << "hit set is not of type Bloom; " <<
              "invalid to get a false positive rate!";
            r = -EINVAL;
            goto reply;
          }
        }
        break;
      case USE_GMT_HITSET:
        ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
        break;
      case TARGET_MAX_OBJECTS:
        ss << "target_max_objects: " << p->target_max_objects << "\n";
        break;
      case TARGET_MAX_BYTES:
        ss << "target_max_bytes: " << p->target_max_bytes << "\n";
        break;
      // micro-unit ratios are shown as plain fractions here
      case CACHE_TARGET_DIRTY_RATIO:
        ss << "cache_target_dirty_ratio: "
           << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
        break;
      case CACHE_TARGET_DIRTY_HIGH_RATIO:
        ss << "cache_target_dirty_high_ratio: "
           << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
        break;
      case CACHE_TARGET_FULL_RATIO:
        ss << "cache_target_full_ratio: "
           << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
        break;
      case CACHE_MIN_FLUSH_AGE:
        ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
        break;
      case CACHE_MIN_EVICT_AGE:
        ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
        break;
      case ERASURE_CODE_PROFILE:
        ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
        break;
      case MIN_READ_RECENCY_FOR_PROMOTE:
        ss << "min_read_recency_for_promote: " <<
          p->min_read_recency_for_promote << "\n";
        break;
      case HIT_SET_GRADE_DECAY_RATE:
        ss << "hit_set_grade_decay_rate: " <<
          p->hit_set_grade_decay_rate << "\n";
        break;
      case HIT_SET_SEARCH_LAST_N:
        ss << "hit_set_search_last_n: " <<
          p->hit_set_search_last_n << "\n";
        break;
      case EC_OVERWRITES:
        ss << "allow_ec_overwrites: " <<
          (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
          "\n";
        break;
      // boolean pool flags: reverse-map the enum to its name first
      case HASHPSPOOL:
      case NODELETE:
      case NOPGCHANGE:
      case NOSIZECHANGE:
      case WRITE_FADVISE_DONTNEED:
      case NOSCRUB:
      case NODEEP_SCRUB:
        for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
          if (i->second == *it)
            break;
        }
        ceph_assert(i != ALL_CHOICES.end());
        ss << i->first << ": " <<
          (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
           "true" : "false") << "\n";
        break;
      case MIN_WRITE_RECENCY_FOR_PROMOTE:
        ss << "min_write_recency_for_promote: " <<
          p->min_write_recency_for_promote << "\n";
        break;
      case FAST_READ:
        ss << "fast_read: " << p->fast_read << "\n";
        break;
      // optional pool options: printed only when actually set
      case SCRUB_MIN_INTERVAL:
      case SCRUB_MAX_INTERVAL:
      case DEEP_SCRUB_INTERVAL:
      case RECOVERY_PRIORITY:
      case RECOVERY_OP_PRIORITY:
      case SCRUB_PRIORITY:
      case COMPRESSION_MODE:
      case COMPRESSION_ALGORITHM:
      case COMPRESSION_REQUIRED_RATIO:
      case COMPRESSION_MAX_BLOB_SIZE:
      case COMPRESSION_MIN_BLOB_SIZE:
      case CSUM_TYPE:
      case CSUM_MAX_BLOCK:
      case CSUM_MIN_BLOCK:
      case FINGERPRINT_ALGORITHM:
      case PG_NUM_MIN:
      case TARGET_SIZE_BYTES:
      case TARGET_SIZE_RATIO:
      case PG_AUTOSCALE_BIAS:
        for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
          if (i->second == *it)
            break;
        }
        ceph_assert(i != ALL_CHOICES.end());
        {
          pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
          if (p->opts.is_set(key)) {
            if(key == pool_opts_t::CSUM_TYPE) {
              // csum_type is stored as an int but shown symbolically
              int64_t val;
              p->opts.get(key, &val);
              ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
            } else {
              ss << i->first << ": " << p->opts.get(key) << "\n";
            }
          }
        }
        break;
    }
    // flush each line to rdata and reuse the status stream
    rdata.append(ss.str());
    ss.str("");
  }
}
r = 0;
} else if (prefix == "osd pool get-quota") {
  // Report a pool's object and byte quotas; 0 means unlimited and is
  // shown as "N/A" in the plain-text form.
  string pool_name;
  cmd_getval(cct, cmdmap, "pool", pool_name);

  int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
  if (poolid < 0) {
    ceph_assert(poolid == -ENOENT);
    ss << "unrecognized pool '" << pool_name << "'";
    r = -ENOENT;
    goto reply;
  }
  const pg_pool_t *p = osdmap.get_pg_pool(poolid);

  if (f) {
    f->open_object_section("pool_quotas");
    f->dump_string("pool_name", pool_name);
    f->dump_unsigned("pool_id", poolid);
    f->dump_unsigned("quota_max_objects", p->quota_max_objects);
    f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
    f->close_section();
    f->flush(rdata);
  } else {
    stringstream rs;
    rs << "quotas for pool '" << pool_name << "':\n"
       << " max objects: ";
    if (p->quota_max_objects == 0)
      rs << "N/A";
    else
      rs << si_u_t(p->quota_max_objects) << " objects";
    rs << "\n"
       << " max bytes : ";
    if (p->quota_max_bytes == 0)
      rs << "N/A";
    else
      rs << byte_u_t(p->quota_max_bytes);
    rdata.append(rs.str());
  }
  rdata.append("\n");
  r = 0;
} else if (prefix == "osd crush rule list" ||
           prefix == "osd crush rule ls") {
  // List crush rule names; both command spellings are equivalent.
  if (f) {
    f->open_array_section("rules");
    osdmap.crush->list_rules(f.get());
    f->close_section();
    f->flush(rdata);
  } else {
    // note: this local ss shadows the outer status stream
    ostringstream ss;
    osdmap.crush->list_rules(&ss);
    rdata.append(ss.str());
  }
} else if (prefix == "osd crush rule ls-by-class") {
  // List the names of crush rules that reference a device class.
  string class_name;
  cmd_getval(cct, cmdmap, "class", class_name);
  if (class_name.empty()) {
    ss << "no class specified";
    r = -EINVAL;
    goto reply;
  }
  set<int> rules;
  r = osdmap.crush->get_rules_by_class(class_name, &rules);
  if (r < 0) {
    ss << "failed to get rules by class '" << class_name << "'";
    goto reply;
  }
  if (f) {
    f->open_array_section("rules");
    for (auto &rule: rules) {
      f->dump_string("name", osdmap.crush->get_rule_name(rule));
    }
    f->close_section();
    f->flush(rdata);
  } else {
    ostringstream rs;
    for (auto &rule: rules) {
      rs << osdmap.crush->get_rule_name(rule) << "\n";
    }
    rdata.append(rs.str());
  }
} else if (prefix == "osd crush rule dump") {
  // Dump one named crush rule, or all rules when no name is given.
  string name;
  cmd_getval(cct, cmdmap, "name", name);
  string format;
  cmd_getval(cct, cmdmap, "format", format);
  // note: this local formatter shadows the function-level 'f'
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
  if (name == "") {
    f->open_array_section("rules");
    osdmap.crush->dump_rules(f.get());
    f->close_section();
  } else {
    int ruleno = osdmap.crush->get_rule_id(name);
    if (ruleno < 0) {
      ss << "unknown crush rule '" << name << "'";
      r = ruleno;
      goto reply;
    }
    osdmap.crush->dump_rule(ruleno, f.get());
  }
  ostringstream rs;
  f->flush(rs);
  rs << "\n";
  rdata.append(rs.str());
} else if (prefix == "osd crush dump") {
  // Dump the entire crush map (always formatted; default json-pretty).
  string format;
  cmd_getval(cct, cmdmap, "format", format);
  // note: this local formatter shadows the function-level 'f'
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
  f->open_object_section("crush_map");
  osdmap.crush->dump(f.get());
  f->close_section();
  ostringstream rs;
  f->flush(rs);
  rs << "\n";
  rdata.append(rs.str());
} else if (prefix == "osd crush show-tunables") {
  // Dump the current crush tunables (always formatted).
  string format;
  cmd_getval(cct, cmdmap, "format", format);
  // note: this local formatter shadows the function-level 'f'
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
  f->open_object_section("crush_map_tunables");
  osdmap.crush->dump_tunables(f.get());
  f->close_section();
  ostringstream rs;
  f->flush(rs);
  rs << "\n";
  rdata.append(rs.str());
} else if (prefix == "osd crush tree") {
  // Render the crush hierarchy; --show-shadow includes shadow (device
  // class) trees.
  string shadow;
  cmd_getval(cct, cmdmap, "shadow", shadow);
  bool show_shadow = shadow == "--show-shadow";
  // no fallback format here: a null formatter selects the text dump
  boost::scoped_ptr<Formatter> f(Formatter::create(format));
  if (f) {
    f->open_object_section("crush_tree");
    osdmap.crush->dump_tree(nullptr,
                            f.get(),
                            osdmap.get_pool_names(),
                            show_shadow);
    f->close_section();
    f->flush(rdata);
  } else {
    ostringstream ss;
    osdmap.crush->dump_tree(&ss,
                            nullptr,
                            osdmap.get_pool_names(),
                            show_shadow);
    rdata.append(ss.str());
  }
} else if (prefix == "osd crush ls") {
  // List the direct children of a crush node; for a device (id >= 0)
  // the result is the device itself.
  string name;
  if (!cmd_getval(cct, cmdmap, "node", name)) {
    ss << "no node specified";
    r = -EINVAL;
    goto reply;
  }
  if (!osdmap.crush->name_exists(name)) {
    ss << "node '" << name << "' does not exist";
    r = -ENOENT;
    goto reply;
  }
  int id = osdmap.crush->get_item_id(name);
  list<int> result;
  if (id >= 0) {
    // devices have non-negative ids and no children
    result.push_back(id);
  } else {
    // buckets have negative ids; enumerate their items
    int num = osdmap.crush->get_bucket_size(id);
    for (int i = 0; i < num; ++i) {
      result.push_back(osdmap.crush->get_bucket_item(id, i));
    }
  }
  if (f) {
    f->open_array_section("items");
    for (auto i : result) {
      f->dump_string("item", osdmap.crush->get_item_name(i));
    }
    f->close_section();
    f->flush(rdata);
  } else {
    ostringstream ss;
    for (auto i : result) {
      ss << osdmap.crush->get_item_name(i) << "\n";
    }
    rdata.append(ss.str());
  }
  r = 0;
} else if (prefix == "osd crush class ls") {
  // List all known device class names (always formatted).
  // note: this local formatter shadows the function-level 'f'
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
  f->open_array_section("crush_classes");
  for (auto i : osdmap.crush->class_name)
    f->dump_string("class", i.second);
  f->close_section();
  f->flush(rdata);
} else if (prefix == "osd crush class ls-osd") {
  // List the ids of OSDs that belong to a device class.
  string name;
  cmd_getval(cct, cmdmap, "class", name);
  set<int> osds;
  osdmap.crush->get_devices_by_class(name, &osds);
  if (f) {
    f->open_array_section("osds");
    for (auto &osd: osds)
      f->dump_int("osd", osd);
    f->close_section();
    f->flush(rdata);
  } else {
    // newline-separated ids, with no trailing newline
    bool first = true;
    for (auto &osd : osds) {
      if (!first)
        ds << "\n";
      first = false;
      ds << osd;
    }
    rdata.append(ds);
  }
} else if (prefix == "osd crush get-device-class") {
  // Report the device class of each listed OSD id (empty string when
  // the OSD has no class).
  vector<string> idvec;
  cmd_getval(cct, cmdmap, "ids", idvec);
  map<int, string> class_by_osd;
  for (auto& id : idvec) {
    ostringstream ts;
    long osd = parse_osd_id(id.c_str(), &ts);
    if (osd < 0) {
      ss << "unable to parse osd id:'" << id << "'";
      r = -EINVAL;
      goto reply;
    }
    auto device_class = osdmap.crush->get_item_class(osd);
    if (device_class)
      class_by_osd[osd] = device_class;
    else
      class_by_osd[osd] = ""; // no class
  }
  if (f) {
    f->open_array_section("osd_device_classes");
    for (auto& i : class_by_osd) {
      f->open_object_section("osd_device_class");
      f->dump_int("osd", i.first);
      f->dump_string("device_class", i.second);
      f->close_section();
    }
    f->close_section();
    f->flush(rdata);
  } else {
    if (class_by_osd.size() == 1) {
      // for single input, make a clean output
      ds << class_by_osd.begin()->second;
    } else {
      // note that we do not group osds by class here
      for (auto it = class_by_osd.begin();
           it != class_by_osd.end();
           it++) {
        ds << "osd." << it->first << ' ' << it->second;
        if (next(it) != class_by_osd.end())
          ds << '\n';
      }
    }
    rdata.append(ds);
  }
5917 } else if (prefix == "osd erasure-code-profile ls") {
5918 const auto &profiles = osdmap.get_erasure_code_profiles();
5919 if (f)
5920 f->open_array_section("erasure-code-profiles");
5921 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
5922 if (f)
5923 f->dump_string("profile", i->first.c_str());
5924 else
5925 rdata.append(i->first + "\n");
5926 }
5927 if (f) {
5928 f->close_section();
5929 ostringstream rs;
5930 f->flush(rs);
5931 rs << "\n";
5932 rdata.append(rs.str());
5933 }
5934 } else if (prefix == "osd crush weight-set ls") {
5935 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5936 if (f) {
5937 f->open_array_section("weight_sets");
5938 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
5939 f->dump_string("pool", "(compat)");
5940 }
5941 for (auto& i : osdmap.crush->choose_args) {
5942 if (i.first >= 0) {
5943 f->dump_string("pool", osdmap.get_pool_name(i.first));
5944 }
5945 }
5946 f->close_section();
5947 f->flush(rdata);
5948 } else {
5949 ostringstream rs;
5950 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
5951 rs << "(compat)\n";
5952 }
5953 for (auto& i : osdmap.crush->choose_args) {
5954 if (i.first >= 0) {
5955 rs << osdmap.get_pool_name(i.first) << "\n";
5956 }
5957 }
5958 rdata.append(rs.str());
5959 }
5960 } else if (prefix == "osd crush weight-set dump") {
5961 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
5962 "json-pretty"));
5963 osdmap.crush->dump_choose_args(f.get());
5964 f->flush(rdata);
5965 } else if (prefix == "osd erasure-code-profile get") {
5966 string name;
5967 cmd_getval(cct, cmdmap, "name", name);
5968 if (!osdmap.has_erasure_code_profile(name)) {
5969 ss << "unknown erasure code profile '" << name << "'";
5970 r = -ENOENT;
5971 goto reply;
5972 }
5973 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
5974 if (f)
5975 f->open_object_section("profile");
5976 for (map<string,string>::const_iterator i = profile.begin();
5977 i != profile.end();
5978 ++i) {
5979 if (f)
5980 f->dump_string(i->first.c_str(), i->second.c_str());
5981 else
5982 rdata.append(i->first + "=" + i->second + "\n");
5983 }
5984 if (f) {
5985 f->close_section();
5986 ostringstream rs;
5987 f->flush(rs);
5988 rs << "\n";
5989 rdata.append(rs.str());
5990 }
5991 } else if (prefix == "osd pool application get") {
5992 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
5993 "json-pretty"));
5994 string pool_name;
5995 cmd_getval(cct, cmdmap, "pool", pool_name);
5996 string app;
5997 cmd_getval(cct, cmdmap, "app", app);
5998 string key;
5999 cmd_getval(cct, cmdmap, "key", key);
6000
6001 if (pool_name.empty()) {
6002 // all
6003 f->open_object_section("pools");
6004 for (const auto &pool : osdmap.pools) {
6005 std::string name("<unknown>");
6006 const auto &pni = osdmap.pool_name.find(pool.first);
6007 if (pni != osdmap.pool_name.end())
6008 name = pni->second;
6009 f->open_object_section(name.c_str());
6010 for (auto &app_pair : pool.second.application_metadata) {
6011 f->open_object_section(app_pair.first.c_str());
6012 for (auto &kv_pair : app_pair.second) {
6013 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6014 }
6015 f->close_section();
6016 }
6017 f->close_section(); // name
6018 }
6019 f->close_section(); // pools
6020 f->flush(rdata);
6021 } else {
6022 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6023 if (pool < 0) {
6024 ss << "unrecognized pool '" << pool_name << "'";
6025 r = -ENOENT;
6026 goto reply;
6027 }
6028 auto p = osdmap.get_pg_pool(pool);
6029 // filter by pool
6030 if (app.empty()) {
6031 f->open_object_section(pool_name.c_str());
6032 for (auto &app_pair : p->application_metadata) {
6033 f->open_object_section(app_pair.first.c_str());
6034 for (auto &kv_pair : app_pair.second) {
6035 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6036 }
6037 f->close_section(); // application
6038 }
6039 f->close_section(); // pool_name
6040 f->flush(rdata);
6041 goto reply;
6042 }
6043
6044 auto app_it = p->application_metadata.find(app);
6045 if (app_it == p->application_metadata.end()) {
6046 ss << "pool '" << pool_name << "' has no application '" << app << "'";
6047 r = -ENOENT;
6048 goto reply;
6049 }
6050 // filter by pool + app
6051 if (key.empty()) {
6052 f->open_object_section(app_it->first.c_str());
6053 for (auto &kv_pair : app_it->second) {
6054 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6055 }
6056 f->close_section(); // application
6057 f->flush(rdata);
6058 goto reply;
6059 }
6060 // filter by pool + app + key
6061 auto key_it = app_it->second.find(key);
6062 if (key_it == app_it->second.end()) {
6063 ss << "application '" << app << "' on pool '" << pool_name
6064 << "' does not have key '" << key << "'";
6065 r = -ENOENT;
6066 goto reply;
6067 }
6068 ss << key_it->second << "\n";
6069 rdata.append(ss.str());
6070 ss.str("");
6071 }
6072 } else if (prefix == "osd get-require-min-compat-client") {
6073 ss << ceph_release_name(osdmap.require_min_compat_client) << std::endl;
6074 rdata.append(ss.str());
6075 ss.str("");
6076 goto reply;
6077 } else if (prefix == "osd pool application enable" ||
6078 prefix == "osd pool application disable" ||
6079 prefix == "osd pool application set" ||
6080 prefix == "osd pool application rm") {
6081 bool changed = false;
6082 r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed);
6083 if (r != 0) {
6084 // Error, reply.
6085 goto reply;
6086 } else if (changed) {
6087 // Valid mutation, proceed to prepare phase
6088 return false;
6089 } else {
6090 // Idempotent case, reply
6091 goto reply;
6092 }
6093 } else {
6094 // try prepare update
6095 return false;
6096 }
6097
6098 reply:
6099 string rs;
6100 getline(ss, rs);
6101 mon->reply_command(op, r, rs, rdata, get_last_committed());
6102 return true;
6103 }
6104
6105 void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
6106 {
6107 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6108 osdmap.get_pg_pool(pool_id));
6109 ceph_assert(pool);
6110 pool->set_flag(flags);
6111 }
6112
6113 void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
6114 {
6115 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6116 osdmap.get_pg_pool(pool_id));
6117 ceph_assert(pool);
6118 pool->unset_flag(flags);
6119 }
6120
6121 string OSDMonitor::make_snap_epoch_key(int64_t pool, epoch_t epoch)
6122 {
6123 char k[80];
6124 snprintf(k, sizeof(k), "removed_epoch_%llu_%08lx",
6125 (unsigned long long)pool, (unsigned long)epoch);
6126 return k;
6127 }
6128
6129 string OSDMonitor::make_snap_key(int64_t pool, snapid_t snap)
6130 {
6131 char k[80];
6132 snprintf(k, sizeof(k), "removed_snap_%llu_%016llx",
6133 (unsigned long long)pool, (unsigned long long)snap);
6134 return k;
6135 }
6136
6137
// Encode a removed-snap interval [snap, snap+num) plus the epoch it was
// removed in into *v, and return the store key under which the value
// should be written.  Note that the returned key embeds the *last* snap
// of the interval (snap + num - 1), not the first.
string OSDMonitor::make_snap_key_value(
  int64_t pool, snapid_t snap, snapid_t num,
  epoch_t epoch, bufferlist *v)
{
  // encode the *last* epoch in the key so that we can use forward
  // iteration only to search for an epoch in an interval.
  encode(snap, *v);       // interval begin
  encode(snap + num, *v); // interval end (exclusive)
  encode(epoch, *v);
  return make_snap_key(pool, snap + num - 1);
}
6149
6150 string OSDMonitor::make_snap_purged_key(int64_t pool, snapid_t snap)
6151 {
6152 char k[80];
6153 snprintf(k, sizeof(k), "purged_snap_%llu_%016llx",
6154 (unsigned long long)pool, (unsigned long long)snap);
6155 return k;
6156 }
// Encode a purged-snap interval [snap, snap+num) plus the epoch it was
// purged in into *v, and return the store key under which the value
// should be written.  As with make_snap_key_value, the key embeds the
// *last* snap of the interval.
string OSDMonitor::make_snap_purged_key_value(
  int64_t pool, snapid_t snap, snapid_t num,
  epoch_t epoch, bufferlist *v)
{
  // encode the *last* epoch in the key so that we can use forward
  // iteration only to search for an epoch in an interval.
  encode(snap, *v);       // interval begin
  encode(snap + num, *v); // interval end (exclusive)
  encode(epoch, *v);
  return make_snap_purged_key(pool, snap + num - 1);
}
6168
// Check whether `snap` in `pool` lies inside a recorded removed-snap
// interval.  On success returns 0 and fills *begin/*end with the
// interval bounds; returns -ENOENT when no covering interval exists.
// Note: on -ENOENT, *begin/*end may still hold the decoded bounds of
// the nearest following interval (callers rely on this; see
// try_prune_purged_snaps).
int OSDMonitor::lookup_pruned_snap(int64_t pool, snapid_t snap,
				   snapid_t *begin, snapid_t *end)
{
  string k = make_snap_key(pool, snap);
  auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
  // Keys embed the *last* snap of each interval (make_snap_key_value),
  // so the first key >= k is the only candidate covering interval.
  it->lower_bound(k);
  if (!it->valid()) {
    return -ENOENT;
  }
  if (it->key().find(OSD_SNAP_PREFIX) != 0) {
    // NOTE(review): assumes iterator keys still carry the prefix here —
    // confirm against MonitorDBStore iterator semantics.
    return -ENOENT;
  }
  bufferlist v = it->value();
  auto p = v.cbegin();
  decode(*begin, p);
  decode(*end, p);
  // end is exclusive: interval covers [begin, end)
  if (snap < *begin || snap >= *end) {
    return -ENOENT;
  }
  return 0;
}
6190
// Migrate snap intervals that the OSDs (via the mgr stat digest) report
// as fully purged into pending_inc.new_purged_snaps, bounded by
// mon_max_snap_prune_per_epoch per map epoch.  Returns true iff the
// pending incremental was modified (i.e. something was actually pruned).
bool OSDMonitor::try_prune_purged_snaps()
{
  // need a fresh digest from the mgr to know what the OSDs purged
  if (!mon->mgrstatmon()->is_readable()) {
    return false;
  }
  if (osdmap.require_osd_release < CEPH_RELEASE_MIMIC) {
    return false;
  }
  if (!pending_inc.new_purged_snaps.empty()) {
    return false;  // we already pruned for this epoch
  }

  unsigned max_prune = cct->_conf.get_val<uint64_t>(
    "mon_max_snap_prune_per_epoch");
  if (!max_prune) {
    // 0 means "no limit" in config; pick a large cap instead
    max_prune = 100000;
  }
  dout(10) << __func__ << " max_prune " << max_prune << dendl;

  unsigned actually_pruned = 0;
  auto& purged_snaps = mon->mgrstatmon()->get_digest().purged_snaps;
  for (auto& p : osdmap.get_pools()) {
    auto q = purged_snaps.find(p.first);
    if (q == purged_snaps.end()) {
      continue;
    }
    auto& purged = q->second;
    if (purged.empty()) {
      dout(20) << __func__ << " " << p.first << " nothing purged" << dendl;
      continue;
    }
    dout(20) << __func__ << " pool " << p.first << " purged " << purged << dendl;
    OSDMap::snap_interval_set_t to_prune;
    unsigned maybe_pruned = actually_pruned;
    for (auto i = purged.begin(); i != purged.end(); ++i) {
      snapid_t begin = i.get_start();
      auto end = i.get_start() + i.get_len();
      snapid_t pbegin = 0, pend = 0;
      int r = lookup_pruned_snap(p.first, begin, &pbegin, &pend);
      if (r == 0) {
        // already purged.
        // be a bit aggressive about backing off here, because the mon may
        // do a lot of work going through this set, and if we know the
        // purged set from the OSDs is at least *partly* stale we may as
        // well wait for it to be fresh.
        dout(20) << __func__ << " we've already pruned " << pbegin
                 << "~" << (pend - pbegin) << dendl;
        break;  // next pool
      }
      // on -ENOENT, pbegin/pend may hold the bounds of the next recorded
      // interval after `begin` (see lookup_pruned_snap)
      if (pbegin && pbegin < end) {
        // the tail of [begin,end) is purged; shorten the range
        ceph_assert(pbegin > begin);
        end = pbegin;
      }
      to_prune.insert(begin, end - begin);
      maybe_pruned += end - begin;
      if (maybe_pruned >= max_prune) {
        break;
      }
    }
    if (!to_prune.empty()) {
      // PGs may still be reporting things as purged that we have already
      // pruned from removed_snaps_queue.
      OSDMap::snap_interval_set_t actual;
      auto r = osdmap.removed_snaps_queue.find(p.first);
      if (r != osdmap.removed_snaps_queue.end()) {
        actual.intersection_of(to_prune, r->second);
      }
      actually_pruned += actual.size();
      dout(10) << __func__ << " pool " << p.first << " reports pruned " << to_prune
               << ", actual pruned " << actual << dendl;
      if (!actual.empty()) {
        pending_inc.new_purged_snaps[p.first].swap(actual);
      }
    }
    if (actually_pruned >= max_prune) {
      break;
    }
  }
  dout(10) << __func__ << " actually pruned " << actually_pruned << dendl;
  return !!actually_pruned;
}
6273
// Reconcile per-pool FULL_QUOTA/FULL flags with the usage stats reported
// by the mgr: clear the flags on pools that dropped back under quota, and
// set them (warning in the cluster log) on pools that exceeded
// quota_max_bytes or quota_max_objects.  Returns true iff any pool's
// flags were changed (i.e. pending_inc was modified).
bool OSDMonitor::update_pools_status()
{
  if (!mon->mgrstatmon()->is_readable())
    return false;

  bool ret = false;

  auto& pools = osdmap.get_pools();
  for (auto it = pools.begin(); it != pools.end(); ++it) {
    const pool_stat_t *pstat = mon->mgrstatmon()->get_pool_stat(it->first);
    if (!pstat)
      continue;
    const object_stat_sum_t& sum = pstat->stats.sum;
    const pg_pool_t &pool = it->second;
    const string& pool_name = osdmap.get_pool_name(it->first);

    // a quota of 0 means "no quota"
    bool pool_is_full =
      (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
      (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);

    if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
      // flag currently set: clear it only if the pool is back under quota
      if (pool_is_full)
        continue;

      mon->clog->info() << "pool '" << pool_name
                       << "' no longer out of quota; removing NO_QUOTA flag";
      // below we cancel FLAG_FULL too, we'll set it again in
      // OSDMonitor::encode_pending if it still fails the osd-full checking.
      clear_pool_flags(it->first,
                       pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      ret = true;
    } else {
      // flag currently clear: set it only if the pool just went over quota
      if (!pool_is_full)
	continue;

      if (pool.quota_max_bytes > 0 &&
          (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
        mon->clog->warn() << "pool '" << pool_name << "' is full"
                         << " (reached quota's max_bytes: "
                         << byte_u_t(pool.quota_max_bytes) << ")";
      }
      if (pool.quota_max_objects > 0 &&
	  (uint64_t)sum.num_objects >= pool.quota_max_objects) {
        mon->clog->warn() << "pool '" << pool_name << "' is full"
                         << " (reached quota's max_objects: "
                         << pool.quota_max_objects << ")";
      }
      // set both FLAG_FULL_QUOTA and FLAG_FULL
      // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
      // since FLAG_FULL should always take precedence
      set_pool_flags(it->first,
		     pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_NEARFULL |
		       pg_pool_t::FLAG_BACKFILLFULL);
      ret = true;
    }
  }
  return ret;
}
6334
6335 int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
6336 {
6337 op->mark_osdmon_event(__func__);
6338 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
6339 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
6340 MonSession *session = op->get_session();
6341 if (!session)
6342 return -EPERM;
6343 string erasure_code_profile;
6344 stringstream ss;
6345 string rule_name;
6346 int ret = 0;
6347 ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
6348 0, 0, 0, 0, 0, 0.0,
6349 erasure_code_profile,
6350 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, &ss);
6351
6352 if (ret < 0) {
6353 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
6354 }
6355 return ret;
6356 }
6357
6358 int OSDMonitor::crush_rename_bucket(const string& srcname,
6359 const string& dstname,
6360 ostream *ss)
6361 {
6362 int ret;
6363 //
6364 // Avoid creating a pending crush if it does not already exists and
6365 // the rename would fail.
6366 //
6367 if (!_have_pending_crush()) {
6368 ret = _get_stable_crush().can_rename_bucket(srcname,
6369 dstname,
6370 ss);
6371 if (ret)
6372 return ret;
6373 }
6374
6375 CrushWrapper newcrush;
6376 _get_pending_crush(newcrush);
6377
6378 ret = newcrush.rename_bucket(srcname,
6379 dstname,
6380 ss);
6381 if (ret)
6382 return ret;
6383
6384 pending_inc.crush.clear();
6385 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
6386 *ss << "renamed bucket " << srcname << " into " << dstname;
6387 return 0;
6388 }
6389
6390 void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
6391 {
6392 string replacement = "";
6393
6394 if (plugin == "jerasure_generic" ||
6395 plugin == "jerasure_sse3" ||
6396 plugin == "jerasure_sse4" ||
6397 plugin == "jerasure_neon") {
6398 replacement = "jerasure";
6399 } else if (plugin == "shec_generic" ||
6400 plugin == "shec_sse3" ||
6401 plugin == "shec_sse4" ||
6402 plugin == "shec_neon") {
6403 replacement = "shec";
6404 }
6405
6406 if (replacement != "") {
6407 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
6408 << plugin << " that has been deprecated. Please use "
6409 << replacement << " instead." << dendl;
6410 }
6411 }
6412
6413 int OSDMonitor::normalize_profile(const string& profilename,
6414 ErasureCodeProfile &profile,
6415 bool force,
6416 ostream *ss)
6417 {
6418 ErasureCodeInterfaceRef erasure_code;
6419 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
6420 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
6421 check_legacy_ec_plugin(plugin->second, profilename);
6422 int err = instance.factory(plugin->second,
6423 g_conf().get_val<std::string>("erasure_code_dir"),
6424 profile, &erasure_code, ss);
6425 if (err) {
6426 return err;
6427 }
6428
6429 err = erasure_code->init(profile, ss);
6430 if (err) {
6431 return err;
6432 }
6433
6434 auto it = profile.find("stripe_unit");
6435 if (it != profile.end()) {
6436 string err_str;
6437 uint32_t stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
6438 if (!err_str.empty()) {
6439 *ss << "could not parse stripe_unit '" << it->second
6440 << "': " << err_str << std::endl;
6441 return -EINVAL;
6442 }
6443 uint32_t data_chunks = erasure_code->get_data_chunk_count();
6444 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
6445 if (chunk_size != stripe_unit) {
6446 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
6447 << "alignment. Would be padded to " << chunk_size
6448 << std::endl;
6449 return -EINVAL;
6450 }
6451 if ((stripe_unit % 4096) != 0 && !force) {
6452 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
6453 << "use --force to override this check" << std::endl;
6454 return -EINVAL;
6455 }
6456 }
6457 return 0;
6458 }
6459
// Create a crush rule `name` for an erasure-coded pool using `profile`.
// Return value contract (consumed by prepare_pool_crush_rule):
//   -EEXIST   rule already exists in the committed map; *rule is set
//   -EALREADY rule exists only in the pending map; caller should retry
//   0         rule created in the pending map; caller must wait for proposal
//   other <0  plugin load or rule creation failure
int OSDMonitor::crush_rule_create_erasure(const string &name,
					  const string &profile,
					  int *rule,
					  ostream *ss)
{
  int ruleid = osdmap.crush->get_rule_id(name);
  if (ruleid != -ENOENT) {
    // already committed
    *rule = osdmap.crush->get_rule_mask_ruleset(ruleid);
    return -EEXIST;
  }

  CrushWrapper newcrush;
  _get_pending_crush(newcrush);

  ruleid = newcrush.get_rule_id(name);
  if (ruleid != -ENOENT) {
    // already pending, not yet proposed
    *rule = newcrush.get_rule_mask_ruleset(ruleid);
    return -EALREADY;
  } else {
    ErasureCodeInterfaceRef erasure_code;
    int err = get_erasure_code(profile, &erasure_code, ss);
    if (err) {
      *ss << "failed to load plugin using profile " << profile << std::endl;
      return err;
    }

    // create_rule returns the new rule id (>= 0) on success
    err = erasure_code->create_rule(name, newcrush, ss);
    erasure_code.reset();
    if (err < 0)
      return err;
    *rule = err;
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    return 0;
  }
}
6496
6497 int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
6498 ErasureCodeInterfaceRef *erasure_code,
6499 ostream *ss) const
6500 {
6501 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
6502 return -EAGAIN;
6503 ErasureCodeProfile profile =
6504 osdmap.get_erasure_code_profile(erasure_code_profile);
6505 ErasureCodeProfile::const_iterator plugin =
6506 profile.find("plugin");
6507 if (plugin == profile.end()) {
6508 *ss << "cannot determine the erasure code plugin"
6509 << " because there is no 'plugin' entry in the erasure_code_profile "
6510 << profile << std::endl;
6511 return -EINVAL;
6512 }
6513 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
6514 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
6515 return instance.factory(plugin->second,
6516 g_conf().get_val<std::string>("erasure_code_dir"),
6517 profile, erasure_code, ss);
6518 }
6519
6520 int OSDMonitor::check_cluster_features(uint64_t features,
6521 stringstream &ss)
6522 {
6523 stringstream unsupported_ss;
6524 int unsupported_count = 0;
6525 if ((mon->get_quorum_con_features() & features) != features) {
6526 unsupported_ss << "the monitor cluster";
6527 ++unsupported_count;
6528 }
6529
6530 set<int32_t> up_osds;
6531 osdmap.get_up_osds(up_osds);
6532 for (set<int32_t>::iterator it = up_osds.begin();
6533 it != up_osds.end(); ++it) {
6534 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
6535 if ((xi.features & features) != features) {
6536 if (unsupported_count > 0)
6537 unsupported_ss << ", ";
6538 unsupported_ss << "osd." << *it;
6539 unsupported_count ++;
6540 }
6541 }
6542
6543 if (unsupported_count > 0) {
6544 ss << "features " << features << " unsupported by: "
6545 << unsupported_ss.str();
6546 return -ENOTSUP;
6547 }
6548
6549 // check pending osd state, too!
6550 for (map<int32_t,osd_xinfo_t>::const_iterator p =
6551 pending_inc.new_xinfo.begin();
6552 p != pending_inc.new_xinfo.end(); ++p) {
6553 const osd_xinfo_t &xi = p->second;
6554 if ((xi.features & features) != features) {
6555 dout(10) << __func__ << " pending osd." << p->first
6556 << " features are insufficient; retry" << dendl;
6557 return -EAGAIN;
6558 }
6559 }
6560
6561 return 0;
6562 }
6563
6564 bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
6565 stringstream& ss)
6566 {
6567 OSDMap::Incremental new_pending = pending_inc;
6568 encode(*newcrush, new_pending.crush, mon->get_quorum_con_features());
6569 OSDMap newmap;
6570 newmap.deepish_copy_from(osdmap);
6571 newmap.apply_incremental(new_pending);
6572
6573 // client compat
6574 if (newmap.require_min_compat_client > 0) {
6575 auto mv = newmap.get_min_compat_client();
6576 if (mv > newmap.require_min_compat_client) {
6577 ss << "new crush map requires client version " << ceph_release_name(mv)
6578 << " but require_min_compat_client is "
6579 << ceph_release_name(newmap.require_min_compat_client);
6580 return false;
6581 }
6582 }
6583
6584 // osd compat
6585 uint64_t features =
6586 newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
6587 newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
6588 stringstream features_ss;
6589 int r = check_cluster_features(features, features_ss);
6590 if (r) {
6591 ss << "Could not change CRUSH: " << features_ss.str();
6592 return false;
6593 }
6594
6595 return true;
6596 }
6597
6598 bool OSDMonitor::erasure_code_profile_in_use(
6599 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
6600 const string &profile,
6601 ostream *ss)
6602 {
6603 bool found = false;
6604 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
6605 p != pools.end();
6606 ++p) {
6607 if (p->second.erasure_code_profile == profile && p->second.is_erasure()) {
6608 *ss << osdmap.pool_name[p->first] << " ";
6609 found = true;
6610 }
6611 }
6612 if (found) {
6613 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
6614 }
6615 return found;
6616 }
6617
// Build *erasure_code_profile_map from the configured default profile
// (osd_pool_default_erasure_code_profile) overlaid with the user-supplied
// "key=value" entries in `erasure_code_profile`.  If the user selects a
// different plugin than the default, the defaults are discarded entirely
// and only the user-supplied entries remain.  Returns 0 on success,
// -EINVAL for legacy "ruleset-*" keys, or the json parse error.
int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
					   map<string,string> *erasure_code_profile_map,
					   ostream *ss)
{
  // seed the map with the configured default profile (json)
  int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile",
				    get_json_str_map,
				    *ss,
				    erasure_code_profile_map,
				    true);
  if (r)
    return r;
  ceph_assert((*erasure_code_profile_map).count("plugin"));
  string default_plugin = (*erasure_code_profile_map)["plugin"];
  // user_map collects only what the user explicitly provided, so the
  // defaults can be thrown away if the plugin differs (see below)
  map<string,string> user_map;
  for (vector<string>::const_iterator i = erasure_code_profile.begin();
       i != erasure_code_profile.end();
       ++i) {
    size_t equal = i->find('=');
    if (equal == string::npos) {
      // bare key: record it with an empty value
      user_map[*i] = string();
      (*erasure_code_profile_map)[*i] = string();
    } else {
      const string key = i->substr(0, equal);
      equal++;
      const string value = i->substr(equal);
      if (key.find("ruleset-") == 0) {
	*ss << "property '" << key << "' is no longer supported; try "
	    << "'crush-" << key.substr(8) << "' instead";
	return -EINVAL;
      }
      user_map[key] = value;
      (*erasure_code_profile_map)[key] = value;
    }
  }

  // a different plugin invalidates the default profile's other settings
  if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
    (*erasure_code_profile_map) = user_map;

  return 0;
}
6658
6659 int OSDMonitor::prepare_pool_size(const unsigned pool_type,
6660 const string &erasure_code_profile,
6661 uint8_t repl_size,
6662 unsigned *size, unsigned *min_size,
6663 ostream *ss)
6664 {
6665 int err = 0;
6666 switch (pool_type) {
6667 case pg_pool_t::TYPE_REPLICATED:
6668 if (repl_size == 0) {
6669 repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
6670 }
6671 *size = repl_size;
6672 *min_size = g_conf().get_osd_pool_default_min_size(repl_size);
6673 break;
6674 case pg_pool_t::TYPE_ERASURE:
6675 {
6676 ErasureCodeInterfaceRef erasure_code;
6677 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
6678 if (err == 0) {
6679 *size = erasure_code->get_chunk_count();
6680 *min_size =
6681 erasure_code->get_data_chunk_count() +
6682 std::min<int>(1, erasure_code->get_coding_chunk_count() - 1);
6683 assert(*min_size <= *size);
6684 assert(*min_size >= erasure_code->get_data_chunk_count());
6685 }
6686 }
6687 break;
6688 default:
6689 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
6690 err = -EINVAL;
6691 break;
6692 }
6693 return err;
6694 }
6695
6696 int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
6697 const string &erasure_code_profile,
6698 uint32_t *stripe_width,
6699 ostream *ss)
6700 {
6701 int err = 0;
6702 switch (pool_type) {
6703 case pg_pool_t::TYPE_REPLICATED:
6704 // ignored
6705 break;
6706 case pg_pool_t::TYPE_ERASURE:
6707 {
6708 ErasureCodeProfile profile =
6709 osdmap.get_erasure_code_profile(erasure_code_profile);
6710 ErasureCodeInterfaceRef erasure_code;
6711 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
6712 if (err)
6713 break;
6714 uint32_t data_chunks = erasure_code->get_data_chunk_count();
6715 uint32_t stripe_unit = g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit");
6716 auto it = profile.find("stripe_unit");
6717 if (it != profile.end()) {
6718 string err_str;
6719 stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
6720 ceph_assert(err_str.empty());
6721 }
6722 *stripe_width = data_chunks *
6723 erasure_code->get_chunk_size(stripe_unit * data_chunks);
6724 }
6725 break;
6726 default:
6727 *ss << "prepare_pool_stripe_width: "
6728 << pool_type << " is not a known pool type";
6729 err = -EINVAL;
6730 break;
6731 }
6732 return err;
6733 }
6734
// Resolve the crush rule for a new pool.  When *crush_rule >= 0 it is
// simply validated; otherwise it is looked up (replicated) or created
// (erasure) from rule_name.  Returns 0 on success, -EAGAIN when the
// caller must wait for a pending crush proposal and retry, or another
// negative errno on failure.
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  // Use default rule
	  *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(cct);
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
		<< "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	// remap crush_rule_create_erasure's result codes: a rule that is
	// only pending (created now, or -EALREADY) means the caller must
	// wait for the proposal (-EAGAIN); a committed rule (-EEXIST) is
	// success.
	switch (err) {
	case -EALREADY:
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
      break;
    }
  } else {
    // explicit rule id supplied: just validate it
    if (!osdmap.crush->ruleset_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
6796
6797 int OSDMonitor::get_crush_rule(const string &rule_name,
6798 int *crush_rule,
6799 ostream *ss)
6800 {
6801 int ret;
6802 ret = osdmap.crush->get_rule_id(rule_name);
6803 if (ret != -ENOENT) {
6804 // found it, use it
6805 *crush_rule = ret;
6806 } else {
6807 CrushWrapper newcrush;
6808 _get_pending_crush(newcrush);
6809
6810 ret = newcrush.get_rule_id(rule_name);
6811 if (ret != -ENOENT) {
6812 // found it, wait for it to be proposed
6813 dout(20) << __func__ << ": rule " << rule_name
6814 << " try again" << dendl;
6815 return -EAGAIN;
6816 } else {
6817 // Cannot find it , return error
6818 *ss << "specified rule " << rule_name << " doesn't exist";
6819 return ret;
6820 }
6821 }
6822 return 0;
6823 }
6824
6825 int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, ostream *ss)
6826 {
6827 auto max_pgs_per_osd = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
6828 auto num_osds = std::max(osdmap.get_num_in_osds(), 3u); // assume min cluster size 3
6829 auto max_pgs = max_pgs_per_osd * num_osds;
6830 uint64_t projected = 0;
6831 if (pool < 0) {
6832 projected += pg_num * size;
6833 }
6834 for (const auto& i : osdmap.get_pools()) {
6835 if (i.first == pool) {
6836 projected += pg_num * size;
6837 } else {
6838 projected += i.second.get_pg_num_target() * i.second.get_size();
6839 }
6840 }
6841 if (projected > max_pgs) {
6842 if (pool >= 0) {
6843 *ss << "pool id " << pool;
6844 }
6845 *ss << " pg_num " << pg_num << " size " << size
6846 << " would mean " << projected
6847 << " total pgs, which exceeds max " << max_pgs
6848 << " (mon_max_pg_per_osd " << max_pgs_per_osd
6849 << " * num_in_osds " << num_osds << ")";
6850 return -ERANGE;
6851 }
6852 return 0;
6853 }
6854
6855 /**
6856 * @param name The name of the new pool
6857 * @param crush_rule The crush rule to use. If <0, will use the system default
6858 * @param crush_rule_name The crush rule to use, if crush_rulset <0
6859 * @param pg_num The pg_num to use. If set to 0, will use the system default
6860 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
6861 * @param repl_size Replication factor, or 0 for default
6862 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
6863 * @param pool_type TYPE_ERASURE, or TYPE_REP
6864 * @param expected_num_objects expected number of objects on the pool
6865 * @param fast_read fast read type.
6866 * @param ss human readable error message, if any.
6867 *
6868 * @return 0 on success, negative errno on failure.
6869 */
6870 int OSDMonitor::prepare_new_pool(string& name,
6871 int crush_rule,
6872 const string &crush_rule_name,
6873 unsigned pg_num, unsigned pgp_num,
6874 unsigned pg_num_min,
6875 const uint64_t repl_size,
6876 const uint64_t target_size_bytes,
6877 const float target_size_ratio,
6878 const string &erasure_code_profile,
6879 const unsigned pool_type,
6880 const uint64_t expected_num_objects,
6881 FastReadType fast_read,
6882 ostream *ss)
6883 {
6884 if (name.length() == 0)
6885 return -EINVAL;
6886 if (pg_num == 0)
6887 pg_num = g_conf().get_val<uint64_t>("osd_pool_default_pg_num");
6888 if (pgp_num == 0)
6889 pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num");
6890 if (!pgp_num)
6891 pgp_num = pg_num;
6892 if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
6893 *ss << "'pg_num' must be greater than 0 and less than or equal to "
6894 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
6895 << " (you may adjust 'mon max pool pg num' for higher values)";
6896 return -ERANGE;
6897 }
6898 if (pgp_num > pg_num) {
6899 *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
6900 << ", which in this case is " << pg_num;
6901 return -ERANGE;
6902 }
6903 if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
6904 *ss << "'fast_read' can only apply to erasure coding pool";
6905 return -EINVAL;
6906 }
6907 int r;
6908 r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
6909 crush_rule_name, &crush_rule, ss);
6910 if (r) {
6911 dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
6912 return r;
6913 }
6914 if (g_conf()->mon_osd_crush_smoke_test) {
6915 CrushWrapper newcrush;
6916 _get_pending_crush(newcrush);
6917 ostringstream err;
6918 CrushTester tester(newcrush, err);
6919 tester.set_min_x(0);
6920 tester.set_max_x(50);
6921 tester.set_rule(crush_rule);
6922 auto start = ceph::coarse_mono_clock::now();
6923 r = tester.test_with_fork(g_conf()->mon_lease);
6924 auto duration = ceph::coarse_mono_clock::now() - start;
6925 if (r < 0) {
6926 dout(10) << "tester.test_with_fork returns " << r
6927 << ": " << err.str() << dendl;
6928 *ss << "crush test failed with " << r << ": " << err.str();
6929 return r;
6930 }
6931 dout(10) << __func__ << " crush smoke test duration: "
6932 << duration << dendl;
6933 }
6934 unsigned size, min_size;
6935 r = prepare_pool_size(pool_type, erasure_code_profile, repl_size,
6936 &size, &min_size, ss);
6937 if (r) {
6938 dout(10) << "prepare_pool_size returns " << r << dendl;
6939 return r;
6940 }
6941 r = check_pg_num(-1, pg_num, size, ss);
6942 if (r) {
6943 dout(10) << "check_pg_num returns " << r << dendl;
6944 return r;
6945 }
6946
6947 if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) {
6948 return -EINVAL;
6949 }
6950
6951 uint32_t stripe_width = 0;
6952 r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
6953 if (r) {
6954 dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
6955 return r;
6956 }
6957
6958 bool fread = false;
6959 if (pool_type == pg_pool_t::TYPE_ERASURE) {
6960 switch (fast_read) {
6961 case FAST_READ_OFF:
6962 fread = false;
6963 break;
6964 case FAST_READ_ON:
6965 fread = true;
6966 break;
6967 case FAST_READ_DEFAULT:
6968 fread = g_conf()->osd_pool_default_ec_fast_read;
6969 break;
6970 default:
6971 *ss << "invalid fast_read setting: " << fast_read;
6972 return -EINVAL;
6973 }
6974 }
6975
6976 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
6977 p != pending_inc.new_pool_names.end();
6978 ++p) {
6979 if (p->second == name)
6980 return 0;
6981 }
6982
6983 if (-1 == pending_inc.new_pool_max)
6984 pending_inc.new_pool_max = osdmap.pool_max;
6985 int64_t pool = ++pending_inc.new_pool_max;
6986 pg_pool_t empty;
6987 pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
6988 pi->create_time = ceph_clock_now();
6989 pi->type = pool_type;
6990 pi->fast_read = fread;
6991 pi->flags = g_conf()->osd_pool_default_flags;
6992 if (g_conf()->osd_pool_default_flag_hashpspool)
6993 pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
6994 if (g_conf()->osd_pool_default_flag_nodelete)
6995 pi->set_flag(pg_pool_t::FLAG_NODELETE);
6996 if (g_conf()->osd_pool_default_flag_nopgchange)
6997 pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
6998 if (g_conf()->osd_pool_default_flag_nosizechange)
6999 pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
7000 pi->set_flag(pg_pool_t::FLAG_CREATING);
7001 if (g_conf()->osd_pool_use_gmt_hitset)
7002 pi->use_gmt_hitset = true;
7003 else
7004 pi->use_gmt_hitset = false;
7005
7006 pi->size = size;
7007 pi->min_size = min_size;
7008 pi->crush_rule = crush_rule;
7009 pi->expected_num_objects = expected_num_objects;
7010 pi->object_hash = CEPH_STR_HASH_RJENKINS;
7011
7012 {
7013 auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
7014 g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode"));
7015 pi->pg_autoscale_mode = m >= 0 ? m : 0;
7016 }
7017 auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs");
7018 pi->set_pg_num(
7019 max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max))
7020 : pg_num);
7021 pi->set_pg_num_pending(pi->get_pg_num());
7022 pi->set_pg_num_target(pg_num);
7023 pi->set_pgp_num(pi->get_pg_num());
7024 pi->set_pgp_num_target(pgp_num);
7025 if (osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS &&
7026 pg_num_min) {
7027 pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min));
7028 }
7029
7030 pi->last_change = pending_inc.epoch;
7031 pi->auid = 0;
7032
7033 if (pool_type == pg_pool_t::TYPE_ERASURE) {
7034 pi->erasure_code_profile = erasure_code_profile;
7035 } else {
7036 pi->erasure_code_profile = "";
7037 }
7038 pi->stripe_width = stripe_width;
7039
7040 if (osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS &&
7041 target_size_bytes) {
7042 // only store for nautilus+ because TARGET_SIZE_BYTES may be
7043 // larger than int32_t max.
7044 pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes));
7045 }
7046 if (target_size_ratio > 0.0 &&
7047 osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS) {
7048 // only store for nautilus+, just to be consistent and tidy.
7049 pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio);
7050 }
7051
7052 pi->cache_target_dirty_ratio_micro =
7053 g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000;
7054 pi->cache_target_dirty_high_ratio_micro =
7055 g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
7056 pi->cache_target_full_ratio_micro =
7057 g_conf()->osd_pool_default_cache_target_full_ratio * 1000000;
7058 pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
7059 pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;
7060
7061 pending_inc.new_pool_names[pool] = name;
7062 return 0;
7063 }
7064
7065 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
7066 {
7067 op->mark_osdmon_event(__func__);
7068 ostringstream ss;
7069 if (pending_inc.new_flags < 0)
7070 pending_inc.new_flags = osdmap.get_flags();
7071 pending_inc.new_flags |= flag;
7072 ss << OSDMap::get_flag_string(flag) << " is set";
7073 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7074 get_last_committed() + 1));
7075 return true;
7076 }
7077
7078 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
7079 {
7080 op->mark_osdmon_event(__func__);
7081 ostringstream ss;
7082 if (pending_inc.new_flags < 0)
7083 pending_inc.new_flags = osdmap.get_flags();
7084 pending_inc.new_flags &= ~flag;
7085 ss << OSDMap::get_flag_string(flag) << " is unset";
7086 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7087 get_last_committed() + 1));
7088 return true;
7089 }
7090
/**
 * Handle "osd pool set <pool> <var> <val>": validate the requested property
 * change and stage it in pending_inc.new_pools.
 *
 * The value is parsed as both integer and float up front; each property
 * branch consults whichever parse result (and parse-error string) it needs.
 *
 * @param cmdmap parsed command arguments ("pool", "var", "val", ...)
 * @param ss human readable success/error message
 * @return 0 on success (or no-op), negative errno on failure
 */
int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
                                         stringstream& ss)
{
  string poolstr;
  cmd_getval(cct, cmdmap, "pool", poolstr);
  int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
  if (pool < 0) {
    ss << "unrecognized pool '" << poolstr << "'";
    return -ENOENT;
  }
  string var;
  cmd_getval(cct, cmdmap, "var", var);

  // work on a copy; start from any already-pending version of the pool so
  // multiple changes in one proposal compose
  pg_pool_t p = *osdmap.get_pg_pool(pool);
  if (pending_inc.new_pools.count(pool))
    p = pending_inc.new_pools[pool];

  // accept val as a json string in the normal case (current
  // generation monitor). parse out int or float values from the
  // string as needed. however, if it is not a string, try to pull
  // out an int, in case an older monitor with an older json schema is
  // forwarding a request.
  string val;
  string interr, floaterr;
  int64_t n = 0;
  double f = 0;
  int64_t uf = 0; // micro-f
  cmd_getval(cct, cmdmap, "val", val);

  // parse string as both int and float; different fields use different types.
  n = strict_strtoll(val.c_str(), 10, &interr);
  f = strict_strtod(val.c_str(), &floaterr);
  uf = llrintl(f * (double)1000000.0);

  // the cache-tier tunables below may only be set on a tier pool
  if (!p.is_tier() &&
      (var == "hit_set_type" || var == "hit_set_period" ||
       var == "hit_set_count" || var == "hit_set_fpp" ||
       var == "target_max_objects" || var == "target_max_bytes" ||
       var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
       var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
       var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
       var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
       var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
    return -EACCES;
  }

  if (var == "size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (p.type == pg_pool_t::TYPE_ERASURE) {
      ss << "can not change the size of an erasure-coded pool";
      return -ENOTSUP;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0 || n > 10) {
      ss << "pool size must be between 1 and 10";
      return -EINVAL;
    }
    // growing size raises per-OSD pg load; re-check the limit
    int r = check_pg_num(pool, p.get_pg_num(), n, &ss);
    if (r < 0) {
      return r;
    }
    p.size = n;
    // keep the invariant min_size <= size
    if (n < p.min_size)
      p.min_size = n;
  } else if (var == "min_size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }

    if (p.type != pg_pool_t::TYPE_ERASURE) {
      if (n < 1 || n > p.size) {
	ss << "pool min_size must be between 1 and size, which is set to " << (int)p.size;
	return -EINVAL;
      }
    } else {
      // for EC pools the lower bound is k (data chunks), not 1
      ErasureCodeInterfaceRef erasure_code;
      int k;
      stringstream tmp;
      int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
      if (err == 0) {
	k = erasure_code->get_data_chunk_count();
      } else {
	ss << __func__ << " get_erasure_code failed: " << tmp.str();
	return err;
      }

      if (n < k || n > p.size) {
	ss << "pool min_size must be between " << k << " and size, which is set to " << (int)p.size;
	return -EINVAL;
      }
    }
    p.min_size = n;
  } else if (var == "pg_num_actual") {
    // directly adjust the actual pg_num (normally the mgr drives this
    // toward pg_num_target); decreases are staged via pg_num_pending
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n == (int)p.get_pg_num()) {
      return 0;
    }
    if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
      ss << "'pg_num' must be greater than 0 and less than or equal to "
         << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
         << " (you may adjust 'mon max pool pg num' for higher values)";
      return -ERANGE;
    }
    if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
      ss << "cannot adjust pg_num while initial PGs are being created";
      return -EBUSY;
    }
    if (n > (int)p.get_pg_num()) {
      if (p.get_pg_num() != p.get_pg_num_pending()) {
	// force pre-nautilus clients to resend their ops, since they
	// don't understand pg_num_pending changes form a new interval
	p.last_force_op_resend_prenautilus = pending_inc.epoch;
      }
      p.set_pg_num(n);
    } else {
      if (osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS) {
	ss << "nautilus OSDs are required to adjust pg_num_pending";
	return -EPERM;
      }
      if (n < (int)p.get_pgp_num()) {
	ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
	return -EINVAL;
      }
      if (n < (int)p.get_pg_num() - 1) {
	ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
	   << ") - 1; only single pg decrease is currently supported";
	return -EINVAL;
      }
      p.set_pg_num_pending(n);
      // force pre-nautilus clients to resend their ops, since they
      // don't understand pg_num_pending changes form a new interval
      p.last_force_op_resend_prenautilus = pending_inc.epoch;
    }
    // force pre-luminous clients to resend their ops, since they
    // don't understand that split PGs now form a new interval.
    p.last_force_op_resend_preluminous = pending_inc.epoch;
  } else if (var == "pg_num") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n == (int)p.get_pg_num_target()) {
      return 0;
    }
    if (n <= 0 || static_cast<uint64_t>(n) >
                  g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
      ss << "'pg_num' must be greater than 0 and less than or equal to "
         << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
         << " (you may adjust 'mon max pool pg num' for higher values)";
      return -ERANGE;
    }
    if (n > (int)p.get_pg_num_target()) {
      int r = check_pg_num(pool, n, p.get_size(), &ss);
      if (r) {
	return r;
      }
      bool force = false;
      cmd_getval(cct,cmdmap, "yes_i_really_mean_it", force);
      if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
	ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
	return -EPERM;
      }
    } else {
      if (osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS) {
	ss << "nautilus OSDs are required to decrease pg_num";
	return -EPERM;
      }
    }
    if (osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS) {
      // pre-nautilus osdmap format; increase pg_num directly
      assert(n > (int)p.get_pg_num());
      // force pre-nautilus clients to resend their ops, since they
      // don't understand pg_num_target changes form a new interval
      p.last_force_op_resend_prenautilus = pending_inc.epoch;
      // force pre-luminous clients to resend their ops, since they
      // don't understand that split PGs now form a new interval.
      p.last_force_op_resend_preluminous = pending_inc.epoch;
      p.set_pg_num(n);
    } else {
      // set targets; mgr will adjust pg_num_actual and pgp_num later.
      // make pgp_num track pg_num if it already matches. if it is set
      // differently, leave it different and let the user control it
      // manually.
      if (p.get_pg_num_target() == p.get_pgp_num_target()) {
	p.set_pgp_num_target(n);
      }
      p.set_pg_num_target(n);
    }
  } else if (var == "pgp_num_actual") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num_pending()) {
      ss << "specified pgp_num " << n
	 << " > pg_num_pending " << p.get_pg_num_pending();
      return -EINVAL;
    }
    p.set_pgp_num(n);
  } else if (var == "pgp_num") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num_target()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target();
      return -EINVAL;
    }
    if (osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS) {
      // pre-nautilus osdmap format; increase pgp_num directly
      p.set_pgp_num(n);
    } else {
      p.set_pgp_num_target(n);
    }
  } else if (var == "pg_autoscale_mode") {
    n = pg_pool_t::get_pg_autoscale_mode_by_name(val);
    if (n < 0) {
      ss << "specified invalid mode " << val;
      return -EINVAL;
    }
    if (osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS) {
      ss << "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
      return -EINVAL;
    }
    p.pg_autoscale_mode = n;
  } else if (var == "crush_rule") {
    int id = osdmap.crush->get_rule_id(val);
    if (id == -ENOENT) {
      ss << "crush rule " << val << " does not exist";
      return -ENOENT;
    }
    if (id < 0) {
      ss << cpp_strerror(id);
      return -ENOENT;
    }
    if (!osdmap.crush->check_crush_rule(id, p.get_type(), p.get_size(), ss)) {
      return -EINVAL;
    }
    p.crush_rule = id;
  } else if (var == "nodelete" || var == "nopgchange" ||
             var == "nosizechange" || var == "write_fadvise_dontneed" ||
             var == "noscrub" || var == "nodeep-scrub") {
    // simple boolean pool flags
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hashpspool") {
    // like the flags above, but requires confirmation because toggling it
    // remaps every PG in the pool
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    bool force = false;
    cmd_getval(cct, cmdmap, "yes_i_really_mean_it", force);

    if (!force) {
      ss << "are you SURE? this will remap all placement groups in this pool,"
	    " this triggers large data movement,"
	    " pass --yes-i-really-mean-it if you really do.";
      return -EPERM;
    }
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hit_set_type") {
    if (val == "none")
      p.hit_set_params = HitSet::Params();
    else {
      int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
      if (err)
	return err;
      if (val == "bloom") {
	BloomHitSet::Params *bsp = new BloomHitSet::Params;
	bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
	p.hit_set_params = HitSet::Params(bsp);
      } else if (val == "explicit_hash")
	p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
      else if (val == "explicit_object")
	p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
      else {
	ss << "unrecognized hit_set type '" << val << "'";
	return -EINVAL;
      }
    }
  } else if (var == "hit_set_period") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    } else if (n < 0) {
      ss << "hit_set_period should be non-negative";
      return -EINVAL;
    }
    p.hit_set_period = n;
  } else if (var == "hit_set_count") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    } else if (n < 0) {
      ss << "hit_set_count should be non-negative";
      return -EINVAL;
    }
    p.hit_set_count = n;
  } else if (var == "hit_set_fpp") {
    if (floaterr.length()) {
      ss << "error parsing floating point value '" << val << "': " << floaterr;
      return -EINVAL;
    } else if (f < 0 || f > 1.0) {
      ss << "hit_set_fpp should be in the range 0..1";
      return -EINVAL;
    }
    if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
      ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
      return -EINVAL;
    }
    BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
    bloomp->set_fpp(f);
  } else if (var == "use_gmt_hitset") {
    // one-way switch: can only be enabled
    if (val == "true" || (interr.empty() && n == 1)) {
      p.use_gmt_hitset = true;
    } else {
      ss << "expecting value 'true' or '1'";
      return -EINVAL;
    }
  } else if (var == "allow_ec_overwrites") {
    if (!p.is_erasure()) {
      ss << "ec overwrites can only be enabled for an erasure coded pool";
      return -EINVAL;
    }
    stringstream err;
    if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites &&
	!is_pool_currently_all_bluestore(pool, p, &err)) {
      ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
      return -EINVAL;
    }
    // one-way switch: cannot be disabled once enabled
    if (val == "true" || (interr.empty() && n == 1)) {
      p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      ss << "ec overwrites cannot be disabled once enabled";
      return -EINVAL;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "target_max_objects") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_objects = n;
  } else if (var == "target_max_bytes") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_bytes = n;
  } else if (var == "cache_target_dirty_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    // stored in parts-per-million
    p.cache_target_dirty_ratio_micro = uf;
  } else if (var == "cache_target_dirty_high_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_dirty_high_ratio_micro = uf;
  } else if (var == "cache_target_full_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_full_ratio_micro = uf;
  } else if (var == "cache_min_flush_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_flush_age = n;
  } else if (var == "cache_min_evict_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_evict_age = n;
  } else if (var == "min_read_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_read_recency_for_promote = n;
  } else if (var == "hit_set_grade_decay_rate") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > 100 || n < 0) {
      ss << "value out of range,valid range is 0 - 100";
      return -EINVAL;
    }
    p.hit_set_grade_decay_rate = n;
  } else if (var == "hit_set_search_last_n") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > p.hit_set_count || n < 0) {
      ss << "value out of range,valid range is 0 - hit_set_count";
      return -EINVAL;
    }
    p.hit_set_search_last_n = n;
  } else if (var == "min_write_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_write_recency_for_promote = n;
  } else if (var == "fast_read") {
    if (p.is_replicated()) {
      ss << "fast read is not supported in replication pool";
      return -EINVAL;
    }
    if (val == "true" || (interr.empty() && n == 1)) {
      p.fast_read = true;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.fast_read = false;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (pool_opts_t::is_opt_name(var)) {
    // generic pool options: validate per-option below, then apply via the
    // typed opt descriptor at the end of this branch
    bool unset = val == "unset";
    if (var == "compression_mode") {
      if (!unset) {
	auto cmode = Compressor::get_comp_mode_type(val);
	if (!cmode) {
	  ss << "unrecognized compression mode '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "compression_algorithm") {
      if (!unset) {
	auto alg = Compressor::get_comp_alg_type(val);
	if (!alg) {
	  ss << "unrecognized compression_algorithm '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "compression_required_ratio") {
      if (floaterr.length()) {
	ss << "error parsing float value '" << val << "': " << floaterr;
	return -EINVAL;
      }
      if (f < 0 || f > 1) {
	ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
	return -EINVAL;
      }
    } else if (var == "csum_type") {
      auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
      if (t < 0 ) {
	ss << "unrecognized csum_type '" << val << "'";
	return -EINVAL;
      }
      //preserve csum_type numeric value
      n = t;
      interr.clear();
    } else if (var == "compression_max_blob_size" ||
	       var == "compression_min_blob_size" ||
	       var == "csum_max_block" ||
	       var == "csum_min_block") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
    } else if (var == "fingerprint_algorithm") {
      if (!unset) {
	auto alg = pg_pool_t::get_fingerprint_from_str(val);
	if (!alg) {
	  ss << "unrecognized fingerprint_algorithm '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "pg_num_min") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (n > (int)p.get_pg_num_target()) {
	ss << "specified pg_num_min " << n
	   << " > pg_num " << p.get_pg_num_target();
	return -EINVAL;
      }
    } else if (var == "recovery_priority") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (!g_conf()->debug_allow_any_pool_priority) {
	if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
	  ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
	     << " and " << OSD_POOL_PRIORITY_MAX;
	  return -EINVAL;
	}
      }
    } else if (var == "pg_autoscale_bias") {
      if (f < 0.0 || f > 1000.0) {
	ss << "pg_autoscale_bias must be between 0 and 1000";
	return -EINVAL;
      }
    }

    // apply the option using its declared type; value 0 / empty unsets it
    pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
    switch (desc.type) {
    case pool_opts_t::STR:
      if (unset) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<std::string>(val));
      }
      break;
    case pool_opts_t::INT:
      if (interr.length()) {
	ss << "error parsing integer value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (n == 0) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<int64_t>(n));
      }
      break;
    case pool_opts_t::DOUBLE:
      if (floaterr.length()) {
	ss << "error parsing floating point value '" << val << "': " << floaterr;
	return -EINVAL;
      }
      if (f == 0) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<double>(f));
      }
      break;
    default:
      ceph_assert(!"unknown type");
    }
  } else {
    ss << "unrecognized variable '" << var << "'";
    return -EINVAL;
  }
  if (val != "unset") {
    ss << "set pool " << pool << " " << var << " to " << val;
  } else {
    ss << "unset pool " << pool << " " << var;
  }
  // stage the modified pool in the pending increment
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool] = p;
  return 0;
}
7707
7708 int OSDMonitor::prepare_command_pool_application(const string &prefix,
7709 const cmdmap_t& cmdmap,
7710 stringstream& ss)
7711 {
7712 return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
7713 }
7714
7715 int OSDMonitor::preprocess_command_pool_application(const string &prefix,
7716 const cmdmap_t& cmdmap,
7717 stringstream& ss,
7718 bool *modified)
7719 {
7720 return _command_pool_application(prefix, cmdmap, ss, modified, false);
7721 }
7722
7723
7724 /**
7725 * Common logic for preprocess and prepare phases of pool application
7726 * tag commands. In preprocess mode we're only detecting invalid
7727 * commands, and determining whether it was a modification or a no-op.
7728 * In prepare mode we're actually updating the pending state.
7729 */
7730 int OSDMonitor::_command_pool_application(const string &prefix,
7731 const cmdmap_t& cmdmap,
7732 stringstream& ss,
7733 bool *modified,
7734 bool preparing)
7735 {
7736 string pool_name;
7737 cmd_getval(cct, cmdmap, "pool", pool_name);
7738 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
7739 if (pool < 0) {
7740 ss << "unrecognized pool '" << pool_name << "'";
7741 return -ENOENT;
7742 }
7743
7744 pg_pool_t p = *osdmap.get_pg_pool(pool);
7745 if (preparing) {
7746 if (pending_inc.new_pools.count(pool)) {
7747 p = pending_inc.new_pools[pool];
7748 }
7749 }
7750
7751 string app;
7752 cmd_getval(cct, cmdmap, "app", app);
7753 bool app_exists = (p.application_metadata.count(app) > 0);
7754
7755 string key;
7756 cmd_getval(cct, cmdmap, "key", key);
7757 if (key == "all") {
7758 ss << "key cannot be 'all'";
7759 return -EINVAL;
7760 }
7761
7762 string value;
7763 cmd_getval(cct, cmdmap, "value", value);
7764 if (value == "all") {
7765 ss << "value cannot be 'all'";
7766 return -EINVAL;
7767 }
7768
7769 if (boost::algorithm::ends_with(prefix, "enable")) {
7770 if (app.empty()) {
7771 ss << "application name must be provided";
7772 return -EINVAL;
7773 }
7774
7775 if (p.is_tier()) {
7776 ss << "application must be enabled on base tier";
7777 return -EINVAL;
7778 }
7779
7780 bool force = false;
7781 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", force);
7782
7783 if (!app_exists && !p.application_metadata.empty() && !force) {
7784 ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
7785 << "application; pass --yes-i-really-mean-it to proceed anyway";
7786 return -EPERM;
7787 }
7788
7789 if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
7790 ss << "too many enabled applications on pool '" << pool_name << "'; "
7791 << "max " << MAX_POOL_APPLICATIONS;
7792 return -EINVAL;
7793 }
7794
7795 if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
7796 ss << "application name '" << app << "' too long; max length "
7797 << MAX_POOL_APPLICATION_LENGTH;
7798 return -EINVAL;
7799 }
7800
7801 if (!app_exists) {
7802 p.application_metadata[app] = {};
7803 }
7804 ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
7805
7806 } else if (boost::algorithm::ends_with(prefix, "disable")) {
7807 bool force = false;
7808 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", force);
7809
7810 if (!force) {
7811 ss << "Are you SURE? Disabling an application within a pool might result "
7812 << "in loss of application functionality; pass "
7813 << "--yes-i-really-mean-it to proceed anyway";
7814 return -EPERM;
7815 }
7816
7817 if (!app_exists) {
7818 ss << "application '" << app << "' is not enabled on pool '" << pool_name
7819 << "'";
7820 return 0; // idempotent
7821 }
7822
7823 p.application_metadata.erase(app);
7824 ss << "disable application '" << app << "' on pool '" << pool_name << "'";
7825
7826 } else if (boost::algorithm::ends_with(prefix, "set")) {
7827 if (p.is_tier()) {
7828 ss << "application metadata must be set on base tier";
7829 return -EINVAL;
7830 }
7831
7832 if (!app_exists) {
7833 ss << "application '" << app << "' is not enabled on pool '" << pool_name
7834 << "'";
7835 return -ENOENT;
7836 }
7837
7838 string key;
7839 cmd_getval(cct, cmdmap, "key", key);
7840
7841 if (key.empty()) {
7842 ss << "key must be provided";
7843 return -EINVAL;
7844 }
7845
7846 auto &app_keys = p.application_metadata[app];
7847 if (app_keys.count(key) == 0 &&
7848 app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
7849 ss << "too many keys set for application '" << app << "' on pool '"
7850 << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
7851 return -EINVAL;
7852 }
7853
7854 if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
7855 ss << "key '" << app << "' too long; max length "
7856 << MAX_POOL_APPLICATION_LENGTH;
7857 return -EINVAL;
7858 }
7859
7860 string value;
7861 cmd_getval(cct, cmdmap, "value", value);
7862 if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
7863 ss << "value '" << value << "' too long; max length "
7864 << MAX_POOL_APPLICATION_LENGTH;
7865 return -EINVAL;
7866 }
7867
7868 p.application_metadata[app][key] = value;
7869 ss << "set application '" << app << "' key '" << key << "' to '"
7870 << value << "' on pool '" << pool_name << "'";
7871 } else if (boost::algorithm::ends_with(prefix, "rm")) {
7872 if (!app_exists) {
7873 ss << "application '" << app << "' is not enabled on pool '" << pool_name
7874 << "'";
7875 return -ENOENT;
7876 }
7877
7878 string key;
7879 cmd_getval(cct, cmdmap, "key", key);
7880 auto it = p.application_metadata[app].find(key);
7881 if (it == p.application_metadata[app].end()) {
7882 ss << "application '" << app << "' on pool '" << pool_name
7883 << "' does not have key '" << key << "'";
7884 return 0; // idempotent
7885 }
7886
7887 p.application_metadata[app].erase(it);
7888 ss << "removed application '" << app << "' key '" << key << "' on pool '"
7889 << pool_name << "'";
7890 } else {
7891 ceph_abort();
7892 }
7893
7894 if (preparing) {
7895 p.last_change = pending_inc.epoch;
7896 pending_inc.new_pools[pool] = p;
7897 }
7898
7899 // Because we fell through this far, we didn't hit no-op cases,
7900 // so pool was definitely modified
7901 if (modified != nullptr) {
7902 *modified = true;
7903 }
7904
7905 return 0;
7906 }
7907
7908 int OSDMonitor::_prepare_command_osd_crush_remove(
7909 CrushWrapper &newcrush,
7910 int32_t id,
7911 int32_t ancestor,
7912 bool has_ancestor,
7913 bool unlink_only)
7914 {
7915 int err = 0;
7916
7917 if (has_ancestor) {
7918 err = newcrush.remove_item_under(cct, id, ancestor,
7919 unlink_only);
7920 } else {
7921 err = newcrush.remove_item(cct, id, unlink_only);
7922 }
7923 return err;
7924 }
7925
// Stage the (already modified) scratch crush map into the pending
// incremental so it is committed with the next proposal.
void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
{
  pending_inc.crush.clear();
  newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
}
7931
7932 int OSDMonitor::prepare_command_osd_crush_remove(
7933 CrushWrapper &newcrush,
7934 int32_t id,
7935 int32_t ancestor,
7936 bool has_ancestor,
7937 bool unlink_only)
7938 {
7939 int err = _prepare_command_osd_crush_remove(
7940 newcrush, id, ancestor,
7941 has_ancestor, unlink_only);
7942
7943 if (err < 0)
7944 return err;
7945
7946 ceph_assert(err == 0);
7947 do_osd_crush_remove(newcrush);
7948
7949 return 0;
7950 }
7951
7952 int OSDMonitor::prepare_command_osd_remove(int32_t id)
7953 {
7954 if (osdmap.is_up(id)) {
7955 return -EBUSY;
7956 }
7957
7958 pending_inc.new_state[id] = osdmap.get_state(id);
7959 pending_inc.new_uuid[id] = uuid_d();
7960 pending_metadata_rm.insert(id);
7961 pending_metadata.erase(id);
7962
7963 return 0;
7964 }
7965
7966 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
7967 {
7968 ceph_assert(existing_id);
7969 *existing_id = -1;
7970
7971 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
7972 if (!osdmap.exists(i) &&
7973 pending_inc.new_up_client.count(i) == 0 &&
7974 (pending_inc.new_state.count(i) == 0 ||
7975 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
7976 *existing_id = i;
7977 return -1;
7978 }
7979 }
7980
7981 if (pending_inc.new_max_osd < 0) {
7982 return osdmap.get_max_osd();
7983 }
7984 return pending_inc.new_max_osd;
7985 }
7986
// Assign an id for a new osd and stage its creation in pending_inc,
// optionally setting its crush device class.
//
// Preconditions: validation (validate_osd_create or equivalent) has
// already approved this (id, uuid) pair -- we assert rather than return
// errors.
//
// @param id            requested osd id, or -1 to pick one
// @param uuid          osd uuid; may be zero for legacy `osd create`
// @param device_class  crush device class to set, or "" for none
// @param new_id [out]  the id actually used
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const string& device_class,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  ceph_assert(new_id);

  // We presume validation has been performed prior to calling this
  // function. We assert with prejudice.

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      // uuid already maps to an osd; a supplied id must agree with it
      ceph_assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
           << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    // recycling a hole in the id space (_allocate_osd_id returned -1)
    ceph_assert(existing_id < osdmap.get_max_osd());
    ceph_assert(allocated_id < 0);
    pending_inc.new_weight[existing_id] = CEPH_OSD_OUT;
    *new_id = existing_id;
  } else if (allocated_id >= 0) {
    ceph_assert(existing_id < 0);
    // raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    ceph_assert(*new_id == allocated_id);
  } else {
    ceph_abort_msg("unexpected condition");
  }

 out:
  if (device_class.size()) {
    // stage the device class on a scratch crush map, creating the crush
    // item for this osd if it does not exist yet
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (newcrush.get_max_devices() < *new_id + 1) {
      newcrush.set_max_devices(*new_id + 1);
    }
    string name = string("osd.") + stringify(*new_id);
    if (!newcrush.item_exists(*new_id)) {
      newcrush.set_item_name(*new_id, name);
    }
    ostringstream ss;
    int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
    if (r < 0) {
      derr << __func__ << " failed to set " << name << " device_class "
           << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
           << dendl;
      // non-fatal... this might be a replay and we want to be idempotent.
    } else {
      dout(20) << __func__ << " set " << name << " device_class " << device_class
               << dendl;
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
  } else {
    dout(20) << __func__ << " no device_class" << dendl;
  }

  dout(10) << __func__ << " using id " << *new_id << dendl;
  // make sure max_osd (committed or pending) covers the id we settled on
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  pending_inc.new_state[*new_id] |= CEPH_OSD_EXISTS | CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
8075
8076 int OSDMonitor::validate_osd_create(
8077 const int32_t id,
8078 const uuid_d& uuid,
8079 const bool check_osd_exists,
8080 int32_t* existing_id,
8081 stringstream& ss)
8082 {
8083
8084 dout(10) << __func__ << " id " << id << " uuid " << uuid
8085 << " check_osd_exists " << check_osd_exists << dendl;
8086
8087 ceph_assert(existing_id);
8088
8089 if (id < 0 && uuid.is_zero()) {
8090 // we have nothing to validate
8091 *existing_id = -1;
8092 return 0;
8093 } else if (uuid.is_zero()) {
8094 // we have an id but we will ignore it - because that's what
8095 // `osd create` does.
8096 return 0;
8097 }
8098
8099 /*
8100 * This function will be used to validate whether we are able to
8101 * create a new osd when the `uuid` is specified.
8102 *
8103 * It will be used by both `osd create` and `osd new`, as the checks
8104 * are basically the same when it pertains to osd id and uuid validation.
8105 * However, `osd create` presumes an `uuid` is optional, for legacy
8106 * reasons, while `osd new` requires the `uuid` to be provided. This
8107 * means that `osd create` will not be idempotent if an `uuid` is not
8108 * provided, but we will always guarantee the idempotency of `osd new`.
8109 */
8110
8111 ceph_assert(!uuid.is_zero());
8112 if (pending_inc.identify_osd(uuid) >= 0) {
8113 // osd is about to exist
8114 return -EAGAIN;
8115 }
8116
8117 int32_t i = osdmap.identify_osd(uuid);
8118 if (i >= 0) {
8119 // osd already exists
8120 if (id >= 0 && i != id) {
8121 ss << "uuid " << uuid << " already in use for different id " << i;
8122 return -EEXIST;
8123 }
8124 // return a positive errno to distinguish between a blocking error
8125 // and an error we consider to not be a problem (i.e., this would be
8126 // an idempotent operation).
8127 *existing_id = i;
8128 return EEXIST;
8129 }
8130 // i < 0
8131 if (id >= 0) {
8132 if (pending_inc.new_state.count(id)) {
8133 // osd is about to exist
8134 return -EAGAIN;
8135 }
8136 // we may not care if an osd exists if we are recreating a previously
8137 // destroyed osd.
8138 if (check_osd_exists && osdmap.exists(id)) {
8139 ss << "id " << id << " already in use and does not match uuid "
8140 << uuid;
8141 return -EINVAL;
8142 }
8143 }
8144 return 0;
8145 }
8146
8147 int OSDMonitor::prepare_command_osd_create(
8148 const int32_t id,
8149 const uuid_d& uuid,
8150 int32_t* existing_id,
8151 stringstream& ss)
8152 {
8153 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
8154 ceph_assert(existing_id);
8155 if (osdmap.is_destroyed(id)) {
8156 ss << "ceph osd create has been deprecated. Please use ceph osd new "
8157 "instead.";
8158 return -EINVAL;
8159 }
8160
8161 if (uuid.is_zero()) {
8162 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
8163 }
8164
8165 return validate_osd_create(id, uuid, true, existing_id, ss);
8166 }
8167
// Handle `osd new`: create a brand-new osd (optionally registering cephx /
// dm-crypt secrets and a crush device class) or recreate a previously
// destroyed one under the same id.
//
// @param op      monitor op; the caller must have plugged paxos
// @param cmdmap  parsed command args ("uuid" required, "id" optional)
// @param params  secrets/options from the command payload: cephx_secret,
//                cephx_lockbox_secret, dmcrypt_key, crush_device_class
// @param ss      human-readable output (error text, or the id on success
//                in plain format)
// @param f       optional formatter for structured output
// @return 0 when an update was staged, *positive* EEXIST when the request
//         was fully idempotent and nothing was staged, negative errno on
//         failure.
int OSDMonitor::prepare_command_osd_new(
  MonOpRequestRef op,
  const cmdmap_t& cmdmap,
  const map<string,string>& params,
  stringstream &ss,
  Formatter *f)
{
  uuid_d uuid;
  string uuidstr;
  int64_t id = -1;

  ceph_assert(paxos->is_plugged());

  dout(10) << __func__ << " " << op << dendl;

  /* validate command. abort now if something's wrong. */

  /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
   *
   * If `id` is not specified, we will identify any existing osd based
   * on `uuid`. Operation will be idempotent iff secrets match.
   *
   * If `id` is specified, we will identify any existing osd based on
   * `uuid` and match against `id`. If they match, operation will be
   * idempotent iff secrets match.
   *
   * `-i secrets.json` will be optional. If supplied, will be used
   * to check for idempotency when `id` and `uuid` match.
   *
   * If `id` is not specified, and `uuid` does not exist, an id will
   * be found or allocated for the osd.
   *
   * If `id` is specified, and the osd has been previously marked
   * as destroyed, then the `id` will be reused.
   */
  if (!cmd_getval(cct, cmdmap, "uuid", uuidstr)) {
    ss << "requires the OSD's UUID to be specified.";
    return -EINVAL;
  } else if (!uuid.parse(uuidstr.c_str())) {
    ss << "invalid UUID value '" << uuidstr << "'.";
    return -EINVAL;
  }

  if (cmd_getval(cct, cmdmap, "id", id) &&
      (id < 0)) {
    ss << "invalid OSD id; must be greater or equal than zero.";
    return -EINVAL;
  }

  // are we running an `osd create`-like command, or recreating
  // a previously destroyed osd?

  bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));

  // we will care about `id` to assess whether osd is `destroyed`, or
  // to create a new osd.
  // we will need an `id` by the time we reach auth.

  int32_t existing_id = -1;
  int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
                                &existing_id, ss);

  bool may_be_idempotent = false;
  if (err == EEXIST) {
    // this is idempotent from the osdmon's point-of-view
    may_be_idempotent = true;
    ceph_assert(existing_id >= 0);
    id = existing_id;
  } else if (err < 0) {
    return err;
  }

  if (!may_be_idempotent) {
    // idempotency is out of the window. We are either creating a new
    // osd or recreating a destroyed osd.
    //
    // We now need to figure out if we have an `id` (and if it's valid),
    // of find an `id` if we don't have one.

    // NOTE: we need to consider the case where the `id` is specified for
    //       `osd create`, and we must honor it. So this means checking if
    //       the `id` is destroyed, and if so assume the destroy; otherwise,
    //       check if it `exists` - in which case we complain about not being
    //       `destroyed`. In the end, if nothing fails, we must allow the
    //       creation, so that we are compatible with `create`.
    if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
      ss << "OSD " << id << " has not yet been destroyed";
      return -EINVAL;
    } else if (id < 0) {
      // find an `id`
      id = _allocate_osd_id(&existing_id);
      if (id < 0) {
        // negative return means "recycle the hole in *existing_id"
        ceph_assert(existing_id >= 0);
        id = existing_id;
      }
      dout(10) << __func__ << " found id " << id << " to use" << dendl;
    } else if (id >= 0 && osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " recreating osd." << id << dendl;
    } else {
      dout(10) << __func__ << " creating new osd." << id << dendl;
    }
  } else {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.exists(id));
  }

  // we are now able to either create a brand new osd or reuse an existing
  // osd that has been previously destroyed.

  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;

  if (may_be_idempotent && params.empty()) {
    // nothing to do, really.
    dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }

  string device_class;
  auto p = params.find("crush_device_class");
  if (p != params.end()) {
    device_class = p->second;
    dout(20) << __func__ << " device_class will be " << device_class << dendl;
  }
  string cephx_secret, lockbox_secret, dmcrypt_key;
  bool has_lockbox = false;
  bool has_secrets = params.count("cephx_secret")
    || params.count("cephx_lockbox_secret")
    || params.count("dmcrypt_key");

  ConfigKeyService *svc = nullptr;
  AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;

  if (has_secrets) {
    if (params.count("cephx_secret") == 0) {
      ss << "requires a cephx secret.";
      return -EINVAL;
    }
    cephx_secret = params.at("cephx_secret");

    bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
    bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);

    dout(10) << __func__ << " has lockbox " << has_lockbox_secret
             << " dmcrypt " << has_dmcrypt_key << dendl;

    if (has_lockbox_secret && has_dmcrypt_key) {
      has_lockbox = true;
      lockbox_secret = params.at("cephx_lockbox_secret");
      dmcrypt_key = params.at("dmcrypt_key");
    } else if (!has_lockbox_secret != !has_dmcrypt_key) {
      // lockbox secret and dm-crypt key must come as a pair
      ss << "requires both a cephx lockbox secret and a dm-crypt key.";
      return -EINVAL;
    }

    dout(10) << __func__ << " validate secrets using osd id " << id << dendl;

    err = mon->authmon()->validate_osd_new(id, uuid,
                                           cephx_secret,
                                           lockbox_secret,
                                           cephx_entity,
                                           lockbox_entity,
                                           ss);
    if (err < 0) {
      return err;
    } else if (may_be_idempotent && err != EEXIST) {
      // for this to be idempotent, `id` should already be >= 0; no need
      // to use validate_id.
      ceph_assert(id >= 0);
      ss << "osd." << id << " exists but secrets do not match";
      return -EEXIST;
    }

    if (has_lockbox) {
      svc = (ConfigKeyService*)mon->config_key_service;
      err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
      if (err < 0) {
        return err;
      } else if (may_be_idempotent && err != EEXIST) {
        ceph_assert(id >= 0);
        ss << "osd." << id << " exists but dm-crypt key does not match.";
        return -EEXIST;
      }
    }
  }
  ceph_assert(!has_secrets || !cephx_secret.empty());
  ceph_assert(!has_lockbox || !lockbox_secret.empty());

  if (may_be_idempotent) {
    // we have nothing to do for either the osdmon or the authmon,
    // and we have no lockbox - so the config key service will not be
    // touched. This is therefore an idempotent operation, and we can
    // just return right away.
    dout(10) << __func__ << " idempotent -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }
  ceph_assert(!may_be_idempotent);

  // perform updates.
  if (has_secrets) {
    ceph_assert(!cephx_secret.empty());
    ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
                (!lockbox_secret.empty() && !dmcrypt_key.empty()));

    err = mon->authmon()->do_osd_new(cephx_entity,
                                     lockbox_entity,
                                     has_lockbox);
    ceph_assert(0 == err);

    if (has_lockbox) {
      ceph_assert(nullptr != svc);
      svc->do_osd_new(uuid, dmcrypt_key);
    }
  }

  if (is_recreate_destroyed) {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.is_destroyed(id));
    pending_inc.new_weight[id] = CEPH_OSD_OUT;
    // NOTE(review): new_state bits appear to be *xor*-applied at commit
    // time (that is why setting a bit that is currently set "clears" it,
    // as the CEPH_OSD_UP branch below relies on) -- confirm against
    // OSDMap::Incremental before reading these as plain bit-sets.
    pending_inc.new_state[id] |= CEPH_OSD_DESTROYED;
    if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
      pending_inc.new_state[id] |= CEPH_OSD_NEW;
    }
    if (osdmap.get_state(id) & CEPH_OSD_UP) {
      // due to http://tracker.ceph.com/issues/20751 some clusters may
      // have UP set for non-existent OSDs; make sure it is cleared
      // for a newly created osd.
      pending_inc.new_state[id] |= CEPH_OSD_UP;
    }
    pending_inc.new_uuid[id] = uuid;
  } else {
    ceph_assert(id >= 0);
    int32_t new_id = -1;
    do_osd_create(id, uuid, device_class, &new_id);
    ceph_assert(new_id >= 0);
    ceph_assert(id == new_id);
  }

  if (f) {
    f->open_object_section("created_osd");
    f->dump_int("osdid", id);
    f->close_section();
  } else {
    ss << id;
  }

  return 0;
}
8432
8433 bool OSDMonitor::prepare_command(MonOpRequestRef op)
8434 {
8435 op->mark_osdmon_event(__func__);
8436 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
8437 stringstream ss;
8438 cmdmap_t cmdmap;
8439 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
8440 string rs = ss.str();
8441 mon->reply_command(op, -EINVAL, rs, get_last_committed());
8442 return true;
8443 }
8444
8445 MonSession *session = op->get_session();
8446 if (!session) {
8447 derr << __func__ << " no session" << dendl;
8448 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
8449 return true;
8450 }
8451
8452 return prepare_command_impl(op, cmdmap);
8453 }
8454
8455 static int parse_reweights(CephContext *cct,
8456 const cmdmap_t& cmdmap,
8457 const OSDMap& osdmap,
8458 map<int32_t, uint32_t>* weights)
8459 {
8460 string weights_str;
8461 if (!cmd_getval(cct, cmdmap, "weights", weights_str)) {
8462 return -EINVAL;
8463 }
8464 std::replace(begin(weights_str), end(weights_str), '\'', '"');
8465 json_spirit::mValue json_value;
8466 if (!json_spirit::read(weights_str, json_value)) {
8467 return -EINVAL;
8468 }
8469 if (json_value.type() != json_spirit::obj_type) {
8470 return -EINVAL;
8471 }
8472 const auto obj = json_value.get_obj();
8473 try {
8474 for (auto& osd_weight : obj) {
8475 auto osd_id = std::stoi(osd_weight.first);
8476 if (!osdmap.exists(osd_id)) {
8477 return -ENOENT;
8478 }
8479 if (osd_weight.second.type() != json_spirit::str_type) {
8480 return -EINVAL;
8481 }
8482 auto weight = std::stoul(osd_weight.second.get_str());
8483 weights->insert({osd_id, weight});
8484 }
8485 } catch (const std::logic_error& e) {
8486 return -EINVAL;
8487 }
8488 return 0;
8489 }
8490
// Stage destruction of osd.<id>: remove its cephx keys and config-keys and
// mark it DESTROYED in the pending map, keeping the id so `osd new` can
// later recycle it. The caller is responsible for proposing the pending
// changes (we may be running inside `osd purge`, and a service may only
// propose once).
//
// @return 0 on success or when already destroyed (idempotent), -ENOENT if
//         the osd does not exist at all, negative errno on auth failure.
int OSDMonitor::prepare_command_osd_destroy(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos->is_plugged());

  // we check if the osd exists for the benefit of `osd purge`, which may
  // have previously removed the osd. If the osd does not exist, return
  // -ENOENT to convey this, and let the caller deal with it.
  //
  // we presume that all auth secrets and config keys were removed prior
  // to this command being called. if they exist by now, we also assume
  // they must have been created by some other command and do not pertain
  // to this non-existent osd.
  if (!osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
    return -ENOENT;
  }

  uuid_d uuid = osdmap.get_uuid(id);
  dout(10) << __func__ << " destroying osd." << id
           << " uuid " << uuid << dendl;

  // if it has been destroyed, we assume our work here is done.
  if (osdmap.is_destroyed(id)) {
    ss << "destroyed osd." << id;
    return 0;
  }

  EntityName cephx_entity, lockbox_entity;
  bool idempotent_auth = false, idempotent_cks = false;

  // resolve the auth entities to remove; -ENOENT just means the keys are
  // already gone (replay), which we treat as idempotent.
  int err = mon->authmon()->validate_osd_destroy(id, uuid,
                                                 cephx_entity,
                                                 lockbox_entity,
                                                 ss);
  if (err < 0) {
    if (err == -ENOENT) {
      idempotent_auth = true;
    } else {
      return err;
    }
  }

  // same treatment for the dm-crypt key held by the config-key service
  ConfigKeyService *svc = (ConfigKeyService*)mon->config_key_service;
  err = svc->validate_osd_destroy(id, uuid);
  if (err < 0) {
    ceph_assert(err == -ENOENT);
    err = 0;
    idempotent_cks = true;
  }

  // validation is done; from here on every step must succeed
  if (!idempotent_auth) {
    err = mon->authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
    ceph_assert(0 == err);
  }

  if (!idempotent_cks) {
    svc->do_osd_destroy(id, uuid);
  }

  // mark destroyed and clear the uuid so the id can be reused.
  // NOTE(review): new_state entries appear to be xor-applied at commit --
  // confirm against OSDMap::Incremental before treating this as a plain
  // assignment of state bits.
  pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
  pending_inc.new_uuid[id] = uuid_d();

  // we can only propose_pending() once per service, otherwise we'll be
  // defying PaxosService and all laws of nature. Therefore, as we may
  // be used during 'osd purge', let's keep the caller responsible for
  // proposing.
  ceph_assert(err == 0);
  return 0;
}
8562
// Stage a full purge of osd.<id>: remove it from crush, destroy its keys,
// and remove it from the osdmap. Paxos must be plugged by the caller, who
// is also responsible for proposing.
//
// @return 0 on success, -ENOENT when the osd is already completely gone
//         (idempotent), negative errno if a step fails before any update
//         has been staged.
int OSDMonitor::prepare_command_osd_purge(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos->is_plugged());
  dout(10) << __func__ << " purging osd." << id << dendl;

  ceph_assert(!osdmap.is_up(id));

  /*
   * This may look a bit weird, but this is what's going to happen:
   *
   *  1. we make sure that removing from crush works
   *  2. we call `prepare_command_osd_destroy()`. If it returns an
   *     error, then we abort the whole operation, as no updates
   *     have been made. However, this function will have
   *     side-effects, thus we need to make sure that all operations
   *     performed henceforth will *always* succeed.
   *  3. we call `prepare_command_osd_remove()`. Although this
   *     function can return an error, it currently only checks if the
   *     osd is up - and we have made sure that it is not so, so there
   *     is no conflict, and it is effectively an update.
   *  4. finally, we call `do_osd_crush_remove()`, which will perform
   *     the crush update we delayed from before.
   */

  CrushWrapper newcrush;
  _get_pending_crush(newcrush);

  bool may_be_idempotent = false;

  // step 1: dry-run the crush removal on a scratch map; -ENOENT means the
  // osd is already gone from crush (possibly a replay of this command).
  int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
  if (err == -ENOENT) {
    err = 0;
    may_be_idempotent = true;
  } else if (err < 0) {
    ss << "error removing osd." << id << " from crush";
    return err;
  }

  // step 2: no point destroying the osd again if it has already been
  // marked destroyed
  if (!osdmap.is_destroyed(id)) {
    err = prepare_command_osd_destroy(id, ss);
    if (err < 0) {
      if (err == -ENOENT) {
        err = 0;
      } else {
        return err;
      }
    } else {
      // destroy staged an update, so we are definitely not idempotent
      may_be_idempotent = false;
    }
  }
  ceph_assert(0 == err);

  // fully idempotent: nothing left in crush, nothing in the osdmap
  if (may_be_idempotent && !osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist and "
             << "we are idempotent." << dendl;
    return -ENOENT;
  }

  // step 3: stage removal from the osdmap
  err = prepare_command_osd_remove(id);
  // we should not be busy, as we should have made sure this id is not up.
  ceph_assert(0 == err);

  // step 4: commit the crush update we validated in step 1
  do_osd_crush_remove(newcrush);
  return 0;
}
8631
8632 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
8633 const cmdmap_t& cmdmap)
8634 {
8635 op->mark_osdmon_event(__func__);
8636 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
8637 bool ret = false;
8638 stringstream ss;
8639 string rs;
8640 bufferlist rdata;
8641 int err = 0;
8642
8643 string format;
8644 cmd_getval(cct, cmdmap, "format", format, string("plain"));
8645 boost::scoped_ptr<Formatter> f(Formatter::create(format));
8646
8647 string prefix;
8648 cmd_getval(cct, cmdmap, "prefix", prefix);
8649
8650 int64_t osdid;
8651 string osd_name;
8652 bool osdid_present = false;
8653 if (prefix != "osd pg-temp" &&
8654 prefix != "osd pg-upmap" &&
8655 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
8656 osdid_present = cmd_getval(cct, cmdmap, "id", osdid);
8657 }
8658 if (osdid_present) {
8659 ostringstream oss;
8660 oss << "osd." << osdid;
8661 osd_name = oss.str();
8662 }
8663
8664 // Even if there's a pending state with changes that could affect
8665 // a command, considering that said state isn't yet committed, we
8666 // just don't care about those changes if the command currently being
8667 // handled acts as a no-op against the current committed state.
8668 // In a nutshell, we assume this command happens *before*.
8669 //
8670 // Let me make this clearer:
8671 //
8672 // - If we have only one client, and that client issues some
8673 // operation that would conflict with this operation but is
8674 // still on the pending state, then we would be sure that said
8675 // operation wouldn't have returned yet, so the client wouldn't
8676 // issue this operation (unless the client didn't wait for the
8677 // operation to finish, and that would be the client's own fault).
8678 //
8679 // - If we have more than one client, each client will observe
8680 // whatever is the state at the moment of the commit. So, if we
8681 // have two clients, one issuing an unlink and another issuing a
8682 // link, and if the link happens while the unlink is still on the
8683 // pending state, from the link's point-of-view this is a no-op.
8684 // If different clients are issuing conflicting operations and
8685 // they care about that, then the clients should make sure they
8686 // enforce some kind of concurrency mechanism -- from our
8687 // perspective that's what Douglas Adams would call an SEP.
8688 //
8689 // This should be used as a general guideline for most commands handled
8690 // in this function. Adapt as you see fit, but please bear in mind that
8691 // this is the expected behavior.
8692
8693
8694 if (prefix == "osd setcrushmap" ||
8695 (prefix == "osd crush set" && !osdid_present)) {
8696 if (pending_inc.crush.length()) {
8697 dout(10) << __func__ << " waiting for pending crush update " << dendl;
8698 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
8699 return true;
8700 }
8701 dout(10) << "prepare_command setting new crush map" << dendl;
8702 bufferlist data(m->get_data());
8703 CrushWrapper crush;
8704 try {
8705 auto bl = data.cbegin();
8706 crush.decode(bl);
8707 }
8708 catch (const std::exception &e) {
8709 err = -EINVAL;
8710 ss << "Failed to parse crushmap: " << e.what();
8711 goto reply;
8712 }
8713
8714 int64_t prior_version = 0;
8715 if (cmd_getval(cct, cmdmap, "prior_version", prior_version)) {
8716 if (prior_version == osdmap.get_crush_version() - 1) {
8717 // see if we are a resend of the last update. this is imperfect
8718 // (multiple racing updaters may not both get reliable success)
8719 // but we expect crush updaters (via this interface) to be rare-ish.
8720 bufferlist current, proposed;
8721 osdmap.crush->encode(current, mon->get_quorum_con_features());
8722 crush.encode(proposed, mon->get_quorum_con_features());
8723 if (current.contents_equal(proposed)) {
8724 dout(10) << __func__
8725 << " proposed matches current and version equals previous"
8726 << dendl;
8727 err = 0;
8728 ss << osdmap.get_crush_version();
8729 goto reply;
8730 }
8731 }
8732 if (prior_version != osdmap.get_crush_version()) {
8733 err = -EPERM;
8734 ss << "prior_version " << prior_version << " != crush version "
8735 << osdmap.get_crush_version();
8736 goto reply;
8737 }
8738 }
8739
8740 if (crush.has_legacy_rule_ids()) {
8741 err = -EINVAL;
8742 ss << "crush maps with ruleset != ruleid are no longer allowed";
8743 goto reply;
8744 }
8745 if (!validate_crush_against_features(&crush, ss)) {
8746 err = -EINVAL;
8747 goto reply;
8748 }
8749
8750 err = osdmap.validate_crush_rules(&crush, &ss);
8751 if (err < 0) {
8752 goto reply;
8753 }
8754
8755 if (g_conf()->mon_osd_crush_smoke_test) {
8756 // sanity check: test some inputs to make sure this map isn't
8757 // totally broken
8758 dout(10) << " testing map" << dendl;
8759 stringstream ess;
8760 CrushTester tester(crush, ess);
8761 tester.set_min_x(0);
8762 tester.set_max_x(50);
8763 auto start = ceph::coarse_mono_clock::now();
8764 int r = tester.test_with_fork(g_conf()->mon_lease);
8765 auto duration = ceph::coarse_mono_clock::now() - start;
8766 if (r < 0) {
8767 dout(10) << " tester.test_with_fork returns " << r
8768 << ": " << ess.str() << dendl;
8769 ss << "crush smoke test failed with " << r << ": " << ess.str();
8770 err = r;
8771 goto reply;
8772 }
8773 dout(10) << __func__ << " crush somke test duration: "
8774 << duration << ", result: " << ess.str() << dendl;
8775 }
8776
8777 pending_inc.crush = data;
8778 ss << osdmap.get_crush_version() + 1;
8779 goto update;
8780
8781 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
8782 CrushWrapper newcrush;
8783 _get_pending_crush(newcrush);
8784 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
8785 int bid = -1 - b;
8786 if (newcrush.bucket_exists(bid) &&
8787 newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
8788 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
8789 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
8790 }
8791 }
8792 if (!validate_crush_against_features(&newcrush, ss)) {
8793 err = -EINVAL;
8794 goto reply;
8795 }
8796 pending_inc.crush.clear();
8797 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8798 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8799 get_last_committed() + 1));
8800 return true;
8801 } else if (prefix == "osd crush set-device-class") {
8802 string device_class;
8803 if (!cmd_getval(cct, cmdmap, "class", device_class)) {
8804 err = -EINVAL; // no value!
8805 goto reply;
8806 }
8807
8808 bool stop = false;
8809 vector<string> idvec;
8810 cmd_getval(cct, cmdmap, "ids", idvec);
8811 CrushWrapper newcrush;
8812 _get_pending_crush(newcrush);
8813 set<int> updated;
8814 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
8815 set<int> osds;
8816 // wildcard?
8817 if (j == 0 &&
8818 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
8819 osdmap.get_all_osds(osds);
8820 stop = true;
8821 } else {
8822 // try traditional single osd way
8823 long osd = parse_osd_id(idvec[j].c_str(), &ss);
8824 if (osd < 0) {
8825 // ss has reason for failure
8826 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
8827 err = -EINVAL;
8828 continue;
8829 }
8830 osds.insert(osd);
8831 }
8832
8833 for (auto &osd : osds) {
8834 if (!osdmap.exists(osd)) {
8835 ss << "osd." << osd << " does not exist. ";
8836 continue;
8837 }
8838
8839 ostringstream oss;
8840 oss << "osd." << osd;
8841 string name = oss.str();
8842
8843 if (newcrush.get_max_devices() < osd + 1) {
8844 newcrush.set_max_devices(osd + 1);
8845 }
8846 string action;
8847 if (newcrush.item_exists(osd)) {
8848 action = "updating";
8849 } else {
8850 action = "creating";
8851 newcrush.set_item_name(osd, name);
8852 }
8853
8854 dout(5) << action << " crush item id " << osd << " name '" << name
8855 << "' device_class '" << device_class << "'"
8856 << dendl;
8857 err = newcrush.update_device_class(osd, device_class, name, &ss);
8858 if (err < 0) {
8859 goto reply;
8860 }
8861 if (err == 0 && !_have_pending_crush()) {
8862 if (!stop) {
8863 // for single osd only, wildcard makes too much noise
8864 ss << "set-device-class item id " << osd << " name '" << name
8865 << "' device_class '" << device_class << "': no change. ";
8866 }
8867 } else {
8868 updated.insert(osd);
8869 }
8870 }
8871 }
8872
8873 if (!updated.empty()) {
8874 pending_inc.crush.clear();
8875 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8876 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
8877 getline(ss, rs);
8878 wait_for_finished_proposal(op,
8879 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
8880 return true;
8881 }
8882
8883 } else if (prefix == "osd crush rm-device-class") {
8884 bool stop = false;
8885 vector<string> idvec;
8886 cmd_getval(cct, cmdmap, "ids", idvec);
8887 CrushWrapper newcrush;
8888 _get_pending_crush(newcrush);
8889 set<int> updated;
8890
8891 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
8892 set<int> osds;
8893
8894 // wildcard?
8895 if (j == 0 &&
8896 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
8897 osdmap.get_all_osds(osds);
8898 stop = true;
8899 } else {
8900 // try traditional single osd way
8901 long osd = parse_osd_id(idvec[j].c_str(), &ss);
8902 if (osd < 0) {
8903 // ss has reason for failure
8904 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
8905 err = -EINVAL;
8906 goto reply;
8907 }
8908 osds.insert(osd);
8909 }
8910
8911 for (auto &osd : osds) {
8912 if (!osdmap.exists(osd)) {
8913 ss << "osd." << osd << " does not exist. ";
8914 continue;
8915 }
8916
8917 auto class_name = newcrush.get_item_class(osd);
8918 if (!class_name) {
8919 ss << "osd." << osd << " belongs to no class, ";
8920 continue;
8921 }
8922 // note that we do not verify if class_is_in_use here
8923 // in case the device is misclassified and the user wants
8924 // to forcibly reset it...
8925
8926 err = newcrush.remove_device_class(cct, osd, &ss);
8927 if (err < 0) {
8928 // ss has reason for failure
8929 goto reply;
8930 }
8931 updated.insert(osd);
8932 }
8933 }
8934
8935 if (!updated.empty()) {
8936 pending_inc.crush.clear();
8937 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8938 ss << "done removing class of osd(s): " << updated;
8939 getline(ss, rs);
8940 wait_for_finished_proposal(op,
8941 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
8942 return true;
8943 }
8944 } else if (prefix == "osd crush class create") {
8945 string device_class;
8946 if (!cmd_getval(g_ceph_context, cmdmap, "class", device_class)) {
8947 err = -EINVAL; // no value!
8948 goto reply;
8949 }
8950 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
8951 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
8952 << "luminous' before using crush device classes";
8953 err = -EPERM;
8954 goto reply;
8955 }
8956 if (!_have_pending_crush() &&
8957 _get_stable_crush().class_exists(device_class)) {
8958 ss << "class '" << device_class << "' already exists";
8959 goto reply;
8960 }
8961 CrushWrapper newcrush;
8962 _get_pending_crush(newcrush);
8963 if (newcrush.class_exists(device_class)) {
8964 ss << "class '" << device_class << "' already exists";
8965 goto update;
8966 }
8967 int class_id = newcrush.get_or_create_class_id(device_class);
8968 pending_inc.crush.clear();
8969 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8970 ss << "created class " << device_class << " with id " << class_id
8971 << " to crush map";
8972 goto update;
8973 } else if (prefix == "osd crush class rm") {
8974 string device_class;
8975 if (!cmd_getval(g_ceph_context, cmdmap, "class", device_class)) {
8976 err = -EINVAL; // no value!
8977 goto reply;
8978 }
8979 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
8980 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
8981 << "luminous' before using crush device classes";
8982 err = -EPERM;
8983 goto reply;
8984 }
8985
8986 if (!osdmap.crush->class_exists(device_class)) {
8987 err = 0;
8988 goto reply;
8989 }
8990
8991 CrushWrapper newcrush;
8992 _get_pending_crush(newcrush);
8993 if (!newcrush.class_exists(device_class)) {
8994 err = 0; // make command idempotent
8995 goto wait;
8996 }
8997 int class_id = newcrush.get_class_id(device_class);
8998 stringstream ts;
8999 if (newcrush.class_is_in_use(class_id, &ts)) {
9000 err = -EBUSY;
9001 ss << "class '" << device_class << "' " << ts.str();
9002 goto reply;
9003 }
9004
9005 // check if class is used by any erasure-code-profiles
9006 mempool::osdmap::map<string,map<string,string>> old_ec_profiles =
9007 osdmap.get_erasure_code_profiles();
9008 auto ec_profiles = pending_inc.get_erasure_code_profiles();
9009 #ifdef HAVE_STDLIB_MAP_SPLICING
9010 ec_profiles.merge(old_ec_profiles);
9011 #else
9012 ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)),
9013 make_move_iterator(end(old_ec_profiles)));
9014 #endif
9015 list<string> referenced_by;
9016 for (auto &i: ec_profiles) {
9017 for (auto &j: i.second) {
9018 if ("crush-device-class" == j.first && device_class == j.second) {
9019 referenced_by.push_back(i.first);
9020 }
9021 }
9022 }
9023 if (!referenced_by.empty()) {
9024 err = -EBUSY;
9025 ss << "class '" << device_class
9026 << "' is still referenced by erasure-code-profile(s): " << referenced_by;
9027 goto reply;
9028 }
9029
9030 set<int> osds;
9031 newcrush.get_devices_by_class(device_class, &osds);
9032 for (auto& p: osds) {
9033 err = newcrush.remove_device_class(g_ceph_context, p, &ss);
9034 if (err < 0) {
9035 // ss has reason for failure
9036 goto reply;
9037 }
9038 }
9039
9040 if (osds.empty()) {
9041 // empty class, remove directly
9042 err = newcrush.remove_class_name(device_class);
9043 if (err < 0) {
9044 ss << "class '" << device_class << "' cannot be removed '"
9045 << cpp_strerror(err) << "'";
9046 goto reply;
9047 }
9048 }
9049
9050 pending_inc.crush.clear();
9051 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9052 ss << "removed class " << device_class << " with id " << class_id
9053 << " from crush map";
9054 goto update;
9055 } else if (prefix == "osd crush class rename") {
9056 string srcname, dstname;
9057 if (!cmd_getval(cct, cmdmap, "srcname", srcname)) {
9058 err = -EINVAL;
9059 goto reply;
9060 }
9061 if (!cmd_getval(cct, cmdmap, "dstname", dstname)) {
9062 err = -EINVAL;
9063 goto reply;
9064 }
9065
9066 CrushWrapper newcrush;
9067 _get_pending_crush(newcrush);
9068 if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
9069 // suppose this is a replay and return success
9070 // so command is idempotent
9071 ss << "already renamed to '" << dstname << "'";
9072 err = 0;
9073 goto reply;
9074 }
9075
9076 err = newcrush.rename_class(srcname, dstname);
9077 if (err < 0) {
9078 ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
9079 << cpp_strerror(err);
9080 goto reply;
9081 }
9082
9083 pending_inc.crush.clear();
9084 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9085 ss << "rename class '" << srcname << "' to '" << dstname << "'";
9086 goto update;
9087 } else if (prefix == "osd crush add-bucket") {
9088 // osd crush add-bucket <name> <type>
9089 string name, typestr;
9090 vector<string> argvec;
9091 cmd_getval(cct, cmdmap, "name", name);
9092 cmd_getval(cct, cmdmap, "type", typestr);
9093 cmd_getval(cct, cmdmap, "args", argvec);
9094 map<string,string> loc;
9095 if (!argvec.empty()) {
9096 CrushWrapper::parse_loc_map(argvec, &loc);
9097 dout(0) << "will create and move bucket '" << name
9098 << "' to location " << loc << dendl;
9099 }
9100
9101 if (!_have_pending_crush() &&
9102 _get_stable_crush().name_exists(name)) {
9103 ss << "bucket '" << name << "' already exists";
9104 goto reply;
9105 }
9106
9107 CrushWrapper newcrush;
9108 _get_pending_crush(newcrush);
9109
9110 if (newcrush.name_exists(name)) {
9111 ss << "bucket '" << name << "' already exists";
9112 goto update;
9113 }
9114 int type = newcrush.get_type_id(typestr);
9115 if (type < 0) {
9116 ss << "type '" << typestr << "' does not exist";
9117 err = -EINVAL;
9118 goto reply;
9119 }
9120 if (type == 0) {
9121 ss << "type '" << typestr << "' is for devices, not buckets";
9122 err = -EINVAL;
9123 goto reply;
9124 }
9125 int bucketno;
9126 err = newcrush.add_bucket(0, 0,
9127 CRUSH_HASH_DEFAULT, type, 0, NULL,
9128 NULL, &bucketno);
9129 if (err < 0) {
9130 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
9131 goto reply;
9132 }
9133 err = newcrush.set_item_name(bucketno, name);
9134 if (err < 0) {
9135 ss << "error setting bucket name to '" << name << "'";
9136 goto reply;
9137 }
9138
9139 if (!loc.empty()) {
9140 if (!newcrush.check_item_loc(cct, bucketno, loc,
9141 (int *)NULL)) {
9142 err = newcrush.move_bucket(cct, bucketno, loc);
9143 if (err < 0) {
9144 ss << "error moving bucket '" << name << "' to location " << loc;
9145 goto reply;
9146 }
9147 } else {
9148 ss << "no need to move item id " << bucketno << " name '" << name
9149 << "' to location " << loc << " in crush map";
9150 }
9151 }
9152
9153 pending_inc.crush.clear();
9154 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9155 if (loc.empty()) {
9156 ss << "added bucket " << name << " type " << typestr
9157 << " to crush map";
9158 } else {
9159 ss << "added bucket " << name << " type " << typestr
9160 << " to location " << loc;
9161 }
9162 goto update;
9163 } else if (prefix == "osd crush rename-bucket") {
9164 string srcname, dstname;
9165 cmd_getval(cct, cmdmap, "srcname", srcname);
9166 cmd_getval(cct, cmdmap, "dstname", dstname);
9167
9168 err = crush_rename_bucket(srcname, dstname, &ss);
9169 if (err == -EALREADY) // equivalent to success for idempotency
9170 err = 0;
9171 if (err)
9172 goto reply;
9173 else
9174 goto update;
9175 } else if (prefix == "osd crush weight-set create" ||
9176 prefix == "osd crush weight-set create-compat") {
9177 CrushWrapper newcrush;
9178 _get_pending_crush(newcrush);
9179 int64_t pool;
9180 int positions;
9181 if (newcrush.has_non_straw2_buckets()) {
9182 ss << "crush map contains one or more bucket(s) that are not straw2";
9183 err = -EPERM;
9184 goto reply;
9185 }
9186 if (prefix == "osd crush weight-set create") {
9187 if (osdmap.require_min_compat_client > 0 &&
9188 osdmap.require_min_compat_client < CEPH_RELEASE_LUMINOUS) {
9189 ss << "require_min_compat_client "
9190 << ceph_release_name(osdmap.require_min_compat_client)
9191 << " < luminous, which is required for per-pool weight-sets. "
9192 << "Try 'ceph osd set-require-min-compat-client luminous' "
9193 << "before using the new interface";
9194 err = -EPERM;
9195 goto reply;
9196 }
9197 string poolname, mode;
9198 cmd_getval(cct, cmdmap, "pool", poolname);
9199 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
9200 if (pool < 0) {
9201 ss << "pool '" << poolname << "' not found";
9202 err = -ENOENT;
9203 goto reply;
9204 }
9205 cmd_getval(cct, cmdmap, "mode", mode);
9206 if (mode != "flat" && mode != "positional") {
9207 ss << "unrecognized weight-set mode '" << mode << "'";
9208 err = -EINVAL;
9209 goto reply;
9210 }
9211 positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
9212 } else {
9213 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
9214 positions = 1;
9215 }
9216 if (!newcrush.create_choose_args(pool, positions)) {
9217 if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
9218 ss << "compat weight-set already created";
9219 } else {
9220 ss << "weight-set for pool '" << osdmap.get_pool_name(pool)
9221 << "' already created";
9222 }
9223 goto reply;
9224 }
9225 pending_inc.crush.clear();
9226 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9227 goto update;
9228
9229 } else if (prefix == "osd crush weight-set rm" ||
9230 prefix == "osd crush weight-set rm-compat") {
9231 CrushWrapper newcrush;
9232 _get_pending_crush(newcrush);
9233 int64_t pool;
9234 if (prefix == "osd crush weight-set rm") {
9235 string poolname;
9236 cmd_getval(cct, cmdmap, "pool", poolname);
9237 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
9238 if (pool < 0) {
9239 ss << "pool '" << poolname << "' not found";
9240 err = -ENOENT;
9241 goto reply;
9242 }
9243 } else {
9244 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
9245 }
9246 newcrush.rm_choose_args(pool);
9247 pending_inc.crush.clear();
9248 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9249 goto update;
9250
9251 } else if (prefix == "osd crush weight-set reweight" ||
9252 prefix == "osd crush weight-set reweight-compat") {
9253 string poolname, item;
9254 vector<double> weight;
9255 cmd_getval(cct, cmdmap, "pool", poolname);
9256 cmd_getval(cct, cmdmap, "item", item);
9257 cmd_getval(cct, cmdmap, "weight", weight);
9258 CrushWrapper newcrush;
9259 _get_pending_crush(newcrush);
9260 int64_t pool;
9261 if (prefix == "osd crush weight-set reweight") {
9262 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
9263 if (pool < 0) {
9264 ss << "pool '" << poolname << "' not found";
9265 err = -ENOENT;
9266 goto reply;
9267 }
9268 if (!newcrush.have_choose_args(pool)) {
9269 ss << "no weight-set for pool '" << poolname << "'";
9270 err = -ENOENT;
9271 goto reply;
9272 }
9273 auto arg_map = newcrush.choose_args_get(pool);
9274 int positions = newcrush.get_choose_args_positions(arg_map);
9275 if (weight.size() != (size_t)positions) {
9276 ss << "must specify exact " << positions << " weight values";
9277 err = -EINVAL;
9278 goto reply;
9279 }
9280 } else {
9281 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
9282 if (!newcrush.have_choose_args(pool)) {
9283 ss << "no backward-compatible weight-set";
9284 err = -ENOENT;
9285 goto reply;
9286 }
9287 }
9288 if (!newcrush.name_exists(item)) {
9289 ss << "item '" << item << "' does not exist";
9290 err = -ENOENT;
9291 goto reply;
9292 }
9293 err = newcrush.choose_args_adjust_item_weightf(
9294 cct,
9295 newcrush.choose_args_get(pool),
9296 newcrush.get_item_id(item),
9297 weight,
9298 &ss);
9299 if (err < 0) {
9300 goto reply;
9301 }
9302 err = 0;
9303 pending_inc.crush.clear();
9304 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9305 goto update;
9306 } else if (osdid_present &&
9307 (prefix == "osd crush set" || prefix == "osd crush add")) {
9308 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
9309 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
9310 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
9311
9312 if (!osdmap.exists(osdid)) {
9313 err = -ENOENT;
9314 ss << osd_name
9315 << " does not exist. Create it before updating the crush map";
9316 goto reply;
9317 }
9318
9319 double weight;
9320 if (!cmd_getval(cct, cmdmap, "weight", weight)) {
9321 ss << "unable to parse weight value '"
9322 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
9323 err = -EINVAL;
9324 goto reply;
9325 }
9326
9327 string args;
9328 vector<string> argvec;
9329 cmd_getval(cct, cmdmap, "args", argvec);
9330 map<string,string> loc;
9331 CrushWrapper::parse_loc_map(argvec, &loc);
9332
9333 if (prefix == "osd crush set"
9334 && !_get_stable_crush().item_exists(osdid)) {
9335 err = -ENOENT;
9336 ss << "unable to set item id " << osdid << " name '" << osd_name
9337 << "' weight " << weight << " at location " << loc
9338 << ": does not exist";
9339 goto reply;
9340 }
9341
9342 dout(5) << "adding/updating crush item id " << osdid << " name '"
9343 << osd_name << "' weight " << weight << " at location "
9344 << loc << dendl;
9345 CrushWrapper newcrush;
9346 _get_pending_crush(newcrush);
9347
9348 string action;
9349 if (prefix == "osd crush set" ||
9350 newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) {
9351 action = "set";
9352 err = newcrush.update_item(cct, osdid, weight, osd_name, loc);
9353 } else {
9354 action = "add";
9355 err = newcrush.insert_item(cct, osdid, weight, osd_name, loc);
9356 if (err == 0)
9357 err = 1;
9358 }
9359
9360 if (err < 0)
9361 goto reply;
9362
9363 if (err == 0 && !_have_pending_crush()) {
9364 ss << action << " item id " << osdid << " name '" << osd_name
9365 << "' weight " << weight << " at location " << loc << ": no change";
9366 goto reply;
9367 }
9368
9369 pending_inc.crush.clear();
9370 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9371 ss << action << " item id " << osdid << " name '" << osd_name << "' weight "
9372 << weight << " at location " << loc << " to crush map";
9373 getline(ss, rs);
9374 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9375 get_last_committed() + 1));
9376 return true;
9377
9378 } else if (prefix == "osd crush create-or-move") {
9379 do {
9380 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
9381 if (!osdmap.exists(osdid)) {
9382 err = -ENOENT;
9383 ss << osd_name
9384 << " does not exist. create it before updating the crush map";
9385 goto reply;
9386 }
9387
9388 double weight;
9389 if (!cmd_getval(cct, cmdmap, "weight", weight)) {
9390 ss << "unable to parse weight value '"
9391 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
9392 err = -EINVAL;
9393 goto reply;
9394 }
9395
9396 string args;
9397 vector<string> argvec;
9398 cmd_getval(cct, cmdmap, "args", argvec);
9399 map<string,string> loc;
9400 CrushWrapper::parse_loc_map(argvec, &loc);
9401
9402 dout(0) << "create-or-move crush item name '" << osd_name
9403 << "' initial_weight " << weight << " at location " << loc
9404 << dendl;
9405
9406 CrushWrapper newcrush;
9407 _get_pending_crush(newcrush);
9408
9409 err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc,
9410 g_conf()->osd_crush_update_weight_set);
9411 if (err == 0) {
9412 ss << "create-or-move updated item name '" << osd_name
9413 << "' weight " << weight
9414 << " at location " << loc << " to crush map";
9415 break;
9416 }
9417 if (err > 0) {
9418 pending_inc.crush.clear();
9419 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9420 ss << "create-or-move updating item name '" << osd_name
9421 << "' weight " << weight
9422 << " at location " << loc << " to crush map";
9423 getline(ss, rs);
9424 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9425 get_last_committed() + 1));
9426 return true;
9427 }
9428 } while (false);
9429
9430 } else if (prefix == "osd crush move") {
9431 do {
9432 // osd crush move <name> <loc1> [<loc2> ...]
9433 string name;
9434 vector<string> argvec;
9435 cmd_getval(cct, cmdmap, "name", name);
9436 cmd_getval(cct, cmdmap, "args", argvec);
9437 map<string,string> loc;
9438 CrushWrapper::parse_loc_map(argvec, &loc);
9439
9440 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
9441 CrushWrapper newcrush;
9442 _get_pending_crush(newcrush);
9443
9444 if (!newcrush.name_exists(name)) {
9445 err = -ENOENT;
9446 ss << "item " << name << " does not exist";
9447 break;
9448 }
9449 int id = newcrush.get_item_id(name);
9450
9451 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
9452 if (id >= 0) {
9453 err = newcrush.create_or_move_item(
9454 cct, id, 0, name, loc,
9455 g_conf()->osd_crush_update_weight_set);
9456 } else {
9457 err = newcrush.move_bucket(cct, id, loc);
9458 }
9459 if (err >= 0) {
9460 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
9461 pending_inc.crush.clear();
9462 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9463 getline(ss, rs);
9464 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9465 get_last_committed() + 1));
9466 return true;
9467 }
9468 } else {
9469 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
9470 err = 0;
9471 }
9472 } while (false);
9473 } else if (prefix == "osd crush swap-bucket") {
9474 string source, dest;
9475 cmd_getval(cct, cmdmap, "source", source);
9476 cmd_getval(cct, cmdmap, "dest", dest);
9477
9478 bool force = false;
9479 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", force);
9480
9481 CrushWrapper newcrush;
9482 _get_pending_crush(newcrush);
9483 if (!newcrush.name_exists(source)) {
9484 ss << "source item " << source << " does not exist";
9485 err = -ENOENT;
9486 goto reply;
9487 }
9488 if (!newcrush.name_exists(dest)) {
9489 ss << "dest item " << dest << " does not exist";
9490 err = -ENOENT;
9491 goto reply;
9492 }
9493 int sid = newcrush.get_item_id(source);
9494 int did = newcrush.get_item_id(dest);
9495 int sparent;
9496 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) {
9497 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
9498 err = -EPERM;
9499 goto reply;
9500 }
9501 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
9502 !force) {
9503 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
9504 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
9505 << "; pass --yes-i-really-mean-it to proceed anyway";
9506 err = -EPERM;
9507 goto reply;
9508 }
9509 int r = newcrush.swap_bucket(cct, sid, did);
9510 if (r < 0) {
9511 ss << "failed to swap bucket contents: " << cpp_strerror(r);
9512 err = r;
9513 goto reply;
9514 }
9515 ss << "swapped bucket of " << source << " to " << dest;
9516 pending_inc.crush.clear();
9517 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9518 wait_for_finished_proposal(op,
9519 new Monitor::C_Command(mon, op, err, ss.str(),
9520 get_last_committed() + 1));
9521 return true;
9522 } else if (prefix == "osd crush link") {
9523 // osd crush link <name> <loc1> [<loc2> ...]
9524 string name;
9525 cmd_getval(cct, cmdmap, "name", name);
9526 vector<string> argvec;
9527 cmd_getval(cct, cmdmap, "args", argvec);
9528 map<string,string> loc;
9529 CrushWrapper::parse_loc_map(argvec, &loc);
9530
9531 // Need an explicit check for name_exists because get_item_id returns
9532 // 0 on unfound.
9533 int id = osdmap.crush->get_item_id(name);
9534 if (!osdmap.crush->name_exists(name)) {
9535 err = -ENOENT;
9536 ss << "item " << name << " does not exist";
9537 goto reply;
9538 } else {
9539 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
9540 }
9541 if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) {
9542 ss << "no need to move item id " << id << " name '" << name
9543 << "' to location " << loc << " in crush map";
9544 err = 0;
9545 goto reply;
9546 }
9547
9548 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
9549 CrushWrapper newcrush;
9550 _get_pending_crush(newcrush);
9551
9552 if (!newcrush.name_exists(name)) {
9553 err = -ENOENT;
9554 ss << "item " << name << " does not exist";
9555 goto reply;
9556 } else {
9557 int id = newcrush.get_item_id(name);
9558 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
9559 err = newcrush.link_bucket(cct, id, loc);
9560 if (err >= 0) {
9561 ss << "linked item id " << id << " name '" << name
9562 << "' to location " << loc << " in crush map";
9563 pending_inc.crush.clear();
9564 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9565 } else {
9566 ss << "cannot link item id " << id << " name '" << name
9567 << "' to location " << loc;
9568 goto reply;
9569 }
9570 } else {
9571 ss << "no need to move item id " << id << " name '" << name
9572 << "' to location " << loc << " in crush map";
9573 err = 0;
9574 }
9575 }
9576 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
9577 get_last_committed() + 1));
9578 return true;
9579 } else if (prefix == "osd crush rm" ||
9580 prefix == "osd crush remove" ||
9581 prefix == "osd crush unlink") {
9582 do {
9583 // osd crush rm <id> [ancestor]
9584 CrushWrapper newcrush;
9585 _get_pending_crush(newcrush);
9586
9587 string name;
9588 cmd_getval(cct, cmdmap, "name", name);
9589
9590 if (!osdmap.crush->name_exists(name)) {
9591 err = 0;
9592 ss << "device '" << name << "' does not appear in the crush map";
9593 break;
9594 }
9595 if (!newcrush.name_exists(name)) {
9596 err = 0;
9597 ss << "device '" << name << "' does not appear in the crush map";
9598 getline(ss, rs);
9599 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9600 get_last_committed() + 1));
9601 return true;
9602 }
9603 int id = newcrush.get_item_id(name);
9604 int ancestor = 0;
9605
9606 bool unlink_only = prefix == "osd crush unlink";
9607 string ancestor_str;
9608 if (cmd_getval(cct, cmdmap, "ancestor", ancestor_str)) {
9609 if (!newcrush.name_exists(ancestor_str)) {
9610 err = -ENOENT;
9611 ss << "ancestor item '" << ancestor_str
9612 << "' does not appear in the crush map";
9613 break;
9614 }
9615 ancestor = newcrush.get_item_id(ancestor_str);
9616 }
9617
9618 err = prepare_command_osd_crush_remove(
9619 newcrush,
9620 id, ancestor,
9621 (ancestor < 0), unlink_only);
9622
9623 if (err == -ENOENT) {
9624 ss << "item " << id << " does not appear in that position";
9625 err = 0;
9626 break;
9627 }
9628 if (err == 0) {
9629 if (!unlink_only)
9630 pending_inc.new_crush_node_flags[id] = 0;
9631 ss << "removed item id " << id << " name '" << name << "' from crush map";
9632 getline(ss, rs);
9633 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9634 get_last_committed() + 1));
9635 return true;
9636 }
9637 } while (false);
9638
9639 } else if (prefix == "osd crush reweight-all") {
9640 CrushWrapper newcrush;
9641 _get_pending_crush(newcrush);
9642
9643 newcrush.reweight(cct);
9644 pending_inc.crush.clear();
9645 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9646 ss << "reweighted crush hierarchy";
9647 getline(ss, rs);
9648 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9649 get_last_committed() + 1));
9650 return true;
9651 } else if (prefix == "osd crush reweight") {
9652 // osd crush reweight <name> <weight>
9653 CrushWrapper newcrush;
9654 _get_pending_crush(newcrush);
9655
9656 string name;
9657 cmd_getval(cct, cmdmap, "name", name);
9658 if (!newcrush.name_exists(name)) {
9659 err = -ENOENT;
9660 ss << "device '" << name << "' does not appear in the crush map";
9661 goto reply;
9662 }
9663
9664 int id = newcrush.get_item_id(name);
9665 if (id < 0) {
9666 ss << "device '" << name << "' is not a leaf in the crush map";
9667 err = -EINVAL;
9668 goto reply;
9669 }
9670 double w;
9671 if (!cmd_getval(cct, cmdmap, "weight", w)) {
9672 ss << "unable to parse weight value '"
9673 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
9674 err = -EINVAL;
9675 goto reply;
9676 }
9677
9678 err = newcrush.adjust_item_weightf(cct, id, w,
9679 g_conf()->osd_crush_update_weight_set);
9680 if (err < 0)
9681 goto reply;
9682 pending_inc.crush.clear();
9683 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9684 ss << "reweighted item id " << id << " name '" << name << "' to " << w
9685 << " in crush map";
9686 getline(ss, rs);
9687 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9688 get_last_committed() + 1));
9689 return true;
9690 } else if (prefix == "osd crush reweight-subtree") {
9691 // osd crush reweight-subtree <name> <weight>
9692 CrushWrapper newcrush;
9693 _get_pending_crush(newcrush);
9694
9695 string name;
9696 cmd_getval(cct, cmdmap, "name", name);
9697 if (!newcrush.name_exists(name)) {
9698 err = -ENOENT;
9699 ss << "device '" << name << "' does not appear in the crush map";
9700 goto reply;
9701 }
9702
9703 int id = newcrush.get_item_id(name);
9704 if (id >= 0) {
9705 ss << "device '" << name << "' is not a subtree in the crush map";
9706 err = -EINVAL;
9707 goto reply;
9708 }
9709 double w;
9710 if (!cmd_getval(cct, cmdmap, "weight", w)) {
9711 ss << "unable to parse weight value '"
9712 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
9713 err = -EINVAL;
9714 goto reply;
9715 }
9716
9717 err = newcrush.adjust_subtree_weightf(cct, id, w,
9718 g_conf()->osd_crush_update_weight_set);
9719 if (err < 0)
9720 goto reply;
9721 pending_inc.crush.clear();
9722 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9723 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
9724 << " in crush map";
9725 getline(ss, rs);
9726 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9727 get_last_committed() + 1));
9728 return true;
9729 } else if (prefix == "osd crush tunables") {
9730 CrushWrapper newcrush;
9731 _get_pending_crush(newcrush);
9732
9733 err = 0;
9734 string profile;
9735 cmd_getval(cct, cmdmap, "profile", profile);
9736 if (profile == "legacy" || profile == "argonaut") {
9737 newcrush.set_tunables_legacy();
9738 } else if (profile == "bobtail") {
9739 newcrush.set_tunables_bobtail();
9740 } else if (profile == "firefly") {
9741 newcrush.set_tunables_firefly();
9742 } else if (profile == "hammer") {
9743 newcrush.set_tunables_hammer();
9744 } else if (profile == "jewel") {
9745 newcrush.set_tunables_jewel();
9746 } else if (profile == "optimal") {
9747 newcrush.set_tunables_optimal();
9748 } else if (profile == "default") {
9749 newcrush.set_tunables_default();
9750 } else {
9751 ss << "unrecognized profile '" << profile << "'";
9752 err = -EINVAL;
9753 goto reply;
9754 }
9755
9756 if (!validate_crush_against_features(&newcrush, ss)) {
9757 err = -EINVAL;
9758 goto reply;
9759 }
9760
9761 pending_inc.crush.clear();
9762 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9763 ss << "adjusted tunables profile to " << profile;
9764 getline(ss, rs);
9765 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9766 get_last_committed() + 1));
9767 return true;
9768 } else if (prefix == "osd crush set-tunable") {
9769 CrushWrapper newcrush;
9770 _get_pending_crush(newcrush);
9771
9772 err = 0;
9773 string tunable;
9774 cmd_getval(cct, cmdmap, "tunable", tunable);
9775
9776 int64_t value = -1;
9777 if (!cmd_getval(cct, cmdmap, "value", value)) {
9778 err = -EINVAL;
9779 ss << "failed to parse integer value "
9780 << cmd_vartype_stringify(cmdmap.at("value"));
9781 goto reply;
9782 }
9783
9784 if (tunable == "straw_calc_version") {
9785 if (value != 0 && value != 1) {
9786 ss << "value must be 0 or 1; got " << value;
9787 err = -EINVAL;
9788 goto reply;
9789 }
9790 newcrush.set_straw_calc_version(value);
9791 } else {
9792 ss << "unrecognized tunable '" << tunable << "'";
9793 err = -EINVAL;
9794 goto reply;
9795 }
9796
9797 if (!validate_crush_against_features(&newcrush, ss)) {
9798 err = -EINVAL;
9799 goto reply;
9800 }
9801
9802 pending_inc.crush.clear();
9803 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9804 ss << "adjusted tunable " << tunable << " to " << value;
9805 getline(ss, rs);
9806 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9807 get_last_committed() + 1));
9808 return true;
9809
9810 } else if (prefix == "osd crush rule create-simple") {
9811 string name, root, type, mode;
9812 cmd_getval(cct, cmdmap, "name", name);
9813 cmd_getval(cct, cmdmap, "root", root);
9814 cmd_getval(cct, cmdmap, "type", type);
9815 cmd_getval(cct, cmdmap, "mode", mode);
9816 if (mode == "")
9817 mode = "firstn";
9818
9819 if (osdmap.crush->rule_exists(name)) {
9820 // The name is uniquely associated to a ruleid and the rule it contains
9821 // From the user point of view, the rule is more meaningfull.
9822 ss << "rule " << name << " already exists";
9823 err = 0;
9824 goto reply;
9825 }
9826
9827 CrushWrapper newcrush;
9828 _get_pending_crush(newcrush);
9829
9830 if (newcrush.rule_exists(name)) {
9831 // The name is uniquely associated to a ruleid and the rule it contains
9832 // From the user point of view, the rule is more meaningfull.
9833 ss << "rule " << name << " already exists";
9834 err = 0;
9835 } else {
9836 int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
9837 pg_pool_t::TYPE_REPLICATED, &ss);
9838 if (ruleno < 0) {
9839 err = ruleno;
9840 goto reply;
9841 }
9842
9843 pending_inc.crush.clear();
9844 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9845 }
9846 getline(ss, rs);
9847 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9848 get_last_committed() + 1));
9849 return true;
9850
9851 } else if (prefix == "osd crush rule create-replicated") {
9852 string name, root, type, device_class;
9853 cmd_getval(cct, cmdmap, "name", name);
9854 cmd_getval(cct, cmdmap, "root", root);
9855 cmd_getval(cct, cmdmap, "type", type);
9856 cmd_getval(cct, cmdmap, "class", device_class);
9857
9858 if (osdmap.crush->rule_exists(name)) {
9859 // The name is uniquely associated to a ruleid and the rule it contains
9860 // From the user point of view, the rule is more meaningfull.
9861 ss << "rule " << name << " already exists";
9862 err = 0;
9863 goto reply;
9864 }
9865
9866 CrushWrapper newcrush;
9867 _get_pending_crush(newcrush);
9868
9869 if (newcrush.rule_exists(name)) {
9870 // The name is uniquely associated to a ruleid and the rule it contains
9871 // From the user point of view, the rule is more meaningfull.
9872 ss << "rule " << name << " already exists";
9873 err = 0;
9874 } else {
9875 int ruleno = newcrush.add_simple_rule(
9876 name, root, type, device_class,
9877 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
9878 if (ruleno < 0) {
9879 err = ruleno;
9880 goto reply;
9881 }
9882
9883 pending_inc.crush.clear();
9884 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9885 }
9886 getline(ss, rs);
9887 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9888 get_last_committed() + 1));
9889 return true;
9890
9891 } else if (prefix == "osd erasure-code-profile rm") {
9892 string name;
9893 cmd_getval(cct, cmdmap, "name", name);
9894
9895 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
9896 goto wait;
9897
9898 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
9899 err = -EBUSY;
9900 goto reply;
9901 }
9902
9903 if (osdmap.has_erasure_code_profile(name) ||
9904 pending_inc.new_erasure_code_profiles.count(name)) {
9905 if (osdmap.has_erasure_code_profile(name)) {
9906 pending_inc.old_erasure_code_profiles.push_back(name);
9907 } else {
9908 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
9909 pending_inc.new_erasure_code_profiles.erase(name);
9910 }
9911
9912 getline(ss, rs);
9913 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9914 get_last_committed() + 1));
9915 return true;
9916 } else {
9917 ss << "erasure-code-profile " << name << " does not exist";
9918 err = 0;
9919 goto reply;
9920 }
9921
9922 } else if (prefix == "osd erasure-code-profile set") {
9923 string name;
9924 cmd_getval(cct, cmdmap, "name", name);
9925 vector<string> profile;
9926 cmd_getval(cct, cmdmap, "profile", profile);
9927
9928 bool force = false;
9929 cmd_getval(cct, cmdmap, "force", force);
9930
9931 map<string,string> profile_map;
9932 err = parse_erasure_code_profile(profile, &profile_map, &ss);
9933 if (err)
9934 goto reply;
9935 if (profile_map.find("plugin") == profile_map.end()) {
9936 ss << "erasure-code-profile " << profile_map
9937 << " must contain a plugin entry" << std::endl;
9938 err = -EINVAL;
9939 goto reply;
9940 }
9941 string plugin = profile_map["plugin"];
9942
9943 if (pending_inc.has_erasure_code_profile(name)) {
9944 dout(20) << "erasure code profile " << name << " try again" << dendl;
9945 goto wait;
9946 } else {
9947 err = normalize_profile(name, profile_map, force, &ss);
9948 if (err)
9949 goto reply;
9950
9951 if (osdmap.has_erasure_code_profile(name)) {
9952 ErasureCodeProfile existing_profile_map =
9953 osdmap.get_erasure_code_profile(name);
9954 err = normalize_profile(name, existing_profile_map, force, &ss);
9955 if (err)
9956 goto reply;
9957
9958 if (existing_profile_map == profile_map) {
9959 err = 0;
9960 goto reply;
9961 }
9962 if (!force) {
9963 err = -EPERM;
9964 ss << "will not override erasure code profile " << name
9965 << " because the existing profile "
9966 << existing_profile_map
9967 << " is different from the proposed profile "
9968 << profile_map;
9969 goto reply;
9970 }
9971 }
9972
9973 dout(20) << "erasure code profile set " << name << "="
9974 << profile_map << dendl;
9975 pending_inc.set_erasure_code_profile(name, profile_map);
9976 }
9977
9978 getline(ss, rs);
9979 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9980 get_last_committed() + 1));
9981 return true;
9982
9983 } else if (prefix == "osd crush rule create-erasure") {
9984 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
9985 if (err == -EAGAIN)
9986 goto wait;
9987 if (err)
9988 goto reply;
9989 string name, poolstr;
9990 cmd_getval(cct, cmdmap, "name", name);
9991 string profile;
9992 cmd_getval(cct, cmdmap, "profile", profile);
9993 if (profile == "")
9994 profile = "default";
9995 if (profile == "default") {
9996 if (!osdmap.has_erasure_code_profile(profile)) {
9997 if (pending_inc.has_erasure_code_profile(profile)) {
9998 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
9999 goto wait;
10000 }
10001
10002 map<string,string> profile_map;
10003 err = osdmap.get_erasure_code_profile_default(cct,
10004 profile_map,
10005 &ss);
10006 if (err)
10007 goto reply;
10008 err = normalize_profile(name, profile_map, true, &ss);
10009 if (err)
10010 goto reply;
10011 dout(20) << "erasure code profile set " << profile << "="
10012 << profile_map << dendl;
10013 pending_inc.set_erasure_code_profile(profile, profile_map);
10014 goto wait;
10015 }
10016 }
10017
10018 int rule;
10019 err = crush_rule_create_erasure(name, profile, &rule, &ss);
10020 if (err < 0) {
10021 switch(err) {
10022 case -EEXIST: // return immediately
10023 ss << "rule " << name << " already exists";
10024 err = 0;
10025 goto reply;
10026 break;
10027 case -EALREADY: // wait for pending to be proposed
10028 ss << "rule " << name << " already exists";
10029 err = 0;
10030 break;
10031 default: // non recoverable error
10032 goto reply;
10033 break;
10034 }
10035 } else {
10036 ss << "created rule " << name << " at " << rule;
10037 }
10038
10039 getline(ss, rs);
10040 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10041 get_last_committed() + 1));
10042 return true;
10043
10044 } else if (prefix == "osd crush rule rm") {
10045 string name;
10046 cmd_getval(cct, cmdmap, "name", name);
10047
10048 if (!osdmap.crush->rule_exists(name)) {
10049 ss << "rule " << name << " does not exist";
10050 err = 0;
10051 goto reply;
10052 }
10053
10054 CrushWrapper newcrush;
10055 _get_pending_crush(newcrush);
10056
10057 if (!newcrush.rule_exists(name)) {
10058 ss << "rule " << name << " does not exist";
10059 err = 0;
10060 } else {
10061 int ruleno = newcrush.get_rule_id(name);
10062 ceph_assert(ruleno >= 0);
10063
10064 // make sure it is not in use.
10065 // FIXME: this is ok in some situations, but let's not bother with that
10066 // complexity now.
10067 int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
10068 if (osdmap.crush_rule_in_use(ruleset)) {
10069 ss << "crush ruleset " << name << " " << ruleset << " is in use";
10070 err = -EBUSY;
10071 goto reply;
10072 }
10073
10074 err = newcrush.remove_rule(ruleno);
10075 if (err < 0) {
10076 goto reply;
10077 }
10078
10079 pending_inc.crush.clear();
10080 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10081 }
10082 getline(ss, rs);
10083 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10084 get_last_committed() + 1));
10085 return true;
10086
10087 } else if (prefix == "osd crush rule rename") {
10088 string srcname;
10089 string dstname;
10090 cmd_getval(cct, cmdmap, "srcname", srcname);
10091 cmd_getval(cct, cmdmap, "dstname", dstname);
10092 if (srcname.empty() || dstname.empty()) {
10093 ss << "must specify both source rule name and destination rule name";
10094 err = -EINVAL;
10095 goto reply;
10096 }
10097 if (srcname == dstname) {
10098 ss << "destination rule name is equal to source rule name";
10099 err = 0;
10100 goto reply;
10101 }
10102
10103 CrushWrapper newcrush;
10104 _get_pending_crush(newcrush);
10105 if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
10106 // srcname does not exist and dstname already exists
10107 // suppose this is a replay and return success
10108 // (so this command is idempotent)
10109 ss << "already renamed to '" << dstname << "'";
10110 err = 0;
10111 goto reply;
10112 }
10113
10114 err = newcrush.rename_rule(srcname, dstname, &ss);
10115 if (err < 0) {
10116 // ss has reason for failure
10117 goto reply;
10118 }
10119 pending_inc.crush.clear();
10120 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10121 getline(ss, rs);
10122 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10123 get_last_committed() + 1));
10124 return true;
10125
10126 } else if (prefix == "osd setmaxosd") {
10127 int64_t newmax;
10128 if (!cmd_getval(cct, cmdmap, "newmax", newmax)) {
10129 ss << "unable to parse 'newmax' value '"
10130 << cmd_vartype_stringify(cmdmap.at("newmax")) << "'";
10131 err = -EINVAL;
10132 goto reply;
10133 }
10134
10135 if (newmax > g_conf()->mon_max_osd) {
10136 err = -ERANGE;
10137 ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
10138 << g_conf()->mon_max_osd << ")";
10139 goto reply;
10140 }
10141
10142 // Don't allow shrinking OSD number as this will cause data loss
10143 // and may cause kernel crashes.
10144 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
10145 if (newmax < osdmap.get_max_osd()) {
10146 // Check if the OSDs exist between current max and new value.
10147 // If there are any OSDs exist, then don't allow shrinking number
10148 // of OSDs.
10149 for (int i = newmax; i < osdmap.get_max_osd(); i++) {
10150 if (osdmap.exists(i)) {
10151 err = -EBUSY;
10152 ss << "cannot shrink max_osd to " << newmax
10153 << " because osd." << i << " (and possibly others) still in use";
10154 goto reply;
10155 }
10156 }
10157 }
10158
10159 pending_inc.new_max_osd = newmax;
10160 ss << "set new max_osd = " << pending_inc.new_max_osd;
10161 getline(ss, rs);
10162 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10163 get_last_committed() + 1));
10164 return true;
10165
10166 } else if (prefix == "osd set-full-ratio" ||
10167 prefix == "osd set-backfillfull-ratio" ||
10168 prefix == "osd set-nearfull-ratio") {
10169 double n;
10170 if (!cmd_getval(cct, cmdmap, "ratio", n)) {
10171 ss << "unable to parse 'ratio' value '"
10172 << cmd_vartype_stringify(cmdmap.at("ratio")) << "'";
10173 err = -EINVAL;
10174 goto reply;
10175 }
10176 if (prefix == "osd set-full-ratio")
10177 pending_inc.new_full_ratio = n;
10178 else if (prefix == "osd set-backfillfull-ratio")
10179 pending_inc.new_backfillfull_ratio = n;
10180 else if (prefix == "osd set-nearfull-ratio")
10181 pending_inc.new_nearfull_ratio = n;
10182 ss << prefix << " " << n;
10183 getline(ss, rs);
10184 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10185 get_last_committed() + 1));
10186 return true;
10187 } else if (prefix == "osd set-require-min-compat-client") {
10188 string v;
10189 cmd_getval(cct, cmdmap, "version", v);
10190 int vno = ceph_release_from_name(v.c_str());
10191 if (vno <= 0) {
10192 ss << "version " << v << " is not recognized";
10193 err = -EINVAL;
10194 goto reply;
10195 }
10196 OSDMap newmap;
10197 newmap.deepish_copy_from(osdmap);
10198 newmap.apply_incremental(pending_inc);
10199 newmap.require_min_compat_client = vno;
10200 auto mvno = newmap.get_min_compat_client();
10201 if (vno < mvno) {
10202 ss << "osdmap current utilizes features that require "
10203 << ceph_release_name(mvno)
10204 << "; cannot set require_min_compat_client below that to "
10205 << ceph_release_name(vno);
10206 err = -EPERM;
10207 goto reply;
10208 }
10209 bool sure = false;
10210 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", sure);
10211 if (!sure) {
10212 FeatureMap m;
10213 mon->get_combined_feature_map(&m);
10214 uint64_t features = ceph_release_features(vno);
10215 bool first = true;
10216 bool ok = true;
10217 for (int type : {
10218 CEPH_ENTITY_TYPE_CLIENT,
10219 CEPH_ENTITY_TYPE_MDS,
10220 CEPH_ENTITY_TYPE_MGR }) {
10221 auto p = m.m.find(type);
10222 if (p == m.m.end()) {
10223 continue;
10224 }
10225 for (auto& q : p->second) {
10226 uint64_t missing = ~q.first & features;
10227 if (missing) {
10228 if (first) {
10229 ss << "cannot set require_min_compat_client to " << v << ": ";
10230 } else {
10231 ss << "; ";
10232 }
10233 first = false;
10234 ss << q.second << " connected " << ceph_entity_type_name(type)
10235 << "(s) look like " << ceph_release_name(
10236 ceph_release_from_features(q.first))
10237 << " (missing 0x" << std::hex << missing << std::dec << ")";
10238 ok = false;
10239 }
10240 }
10241 }
10242 if (!ok) {
10243 ss << "; add --yes-i-really-mean-it to do it anyway";
10244 err = -EPERM;
10245 goto reply;
10246 }
10247 }
10248 ss << "set require_min_compat_client to " << ceph_release_name(vno);
10249 pending_inc.new_require_min_compat_client = vno;
10250 getline(ss, rs);
10251 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10252 get_last_committed() + 1));
10253 return true;
10254 } else if (prefix == "osd pause") {
10255 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
10256
10257 } else if (prefix == "osd unpause") {
10258 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
10259
10260 } else if (prefix == "osd set") {
10261 bool sure = false;
10262 cmd_getval(g_ceph_context, cmdmap, "yes_i_really_mean_it", sure);
10263
10264 string key;
10265 cmd_getval(cct, cmdmap, "key", key);
10266 if (key == "full")
10267 return prepare_set_flag(op, CEPH_OSDMAP_FULL);
10268 else if (key == "pause")
10269 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
10270 else if (key == "noup")
10271 return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
10272 else if (key == "nodown")
10273 return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
10274 else if (key == "noout")
10275 return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
10276 else if (key == "noin")
10277 return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
10278 else if (key == "nobackfill")
10279 return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
10280 else if (key == "norebalance")
10281 return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
10282 else if (key == "norecover")
10283 return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
10284 else if (key == "noscrub")
10285 return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
10286 else if (key == "nodeep-scrub")
10287 return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
10288 else if (key == "notieragent")
10289 return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
10290 else if (key == "nosnaptrim")
10291 return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
10292 else if (key == "pglog_hardlimit") {
10293 if (!osdmap.get_num_up_osds() && !sure) {
10294 ss << "Not advisable to continue since no OSDs are up. Pass "
10295 << "--yes-i-really-mean-it if you really wish to continue.";
10296 err = -EPERM;
10297 goto reply;
10298 }
10299 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
10300 // we are reusing a jewel feature bit that was retired in luminous.
10301 if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
10302 (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
10303 || sure)) {
10304 return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
10305 } else {
10306 ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
10307 err = -EPERM;
10308 goto reply;
10309 }
10310 } else {
10311 ss << "unrecognized flag '" << key << "'";
10312 err = -EINVAL;
10313 }
10314
10315 } else if (prefix == "osd unset") {
10316 string key;
10317 cmd_getval(cct, cmdmap, "key", key);
10318 if (key == "full")
10319 return prepare_unset_flag(op, CEPH_OSDMAP_FULL);
10320 else if (key == "pause")
10321 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
10322 else if (key == "noup")
10323 return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
10324 else if (key == "nodown")
10325 return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
10326 else if (key == "noout")
10327 return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
10328 else if (key == "noin")
10329 return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
10330 else if (key == "nobackfill")
10331 return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
10332 else if (key == "norebalance")
10333 return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
10334 else if (key == "norecover")
10335 return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
10336 else if (key == "noscrub")
10337 return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
10338 else if (key == "nodeep-scrub")
10339 return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
10340 else if (key == "notieragent")
10341 return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
10342 else if (key == "nosnaptrim")
10343 return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
10344 else {
10345 ss << "unrecognized flag '" << key << "'";
10346 err = -EINVAL;
10347 }
10348
10349 } else if (prefix == "osd require-osd-release") {
10350 string release;
10351 cmd_getval(cct, cmdmap, "release", release);
10352 bool sure = false;
10353 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", sure);
10354 int rel = ceph_release_from_name(release.c_str());
10355 if (rel <= 0) {
10356 ss << "unrecognized release " << release;
10357 err = -EINVAL;
10358 goto reply;
10359 }
10360 if (rel == osdmap.require_osd_release) {
10361 // idempotent
10362 err = 0;
10363 goto reply;
10364 }
10365 ceph_assert(osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS);
10366 if (!osdmap.get_num_up_osds() && !sure) {
10367 ss << "Not advisable to continue since no OSDs are up. Pass "
10368 << "--yes-i-really-mean-it if you really wish to continue.";
10369 err = -EPERM;
10370 goto reply;
10371 }
10372 if (rel == CEPH_RELEASE_MIMIC) {
10373 if (!mon->monmap->get_required_features().contains_all(
10374 ceph::features::mon::FEATURE_MIMIC)) {
10375 ss << "not all mons are mimic";
10376 err = -EPERM;
10377 goto reply;
10378 }
10379 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_MIMIC))
10380 && !sure) {
10381 ss << "not all up OSDs have CEPH_FEATURE_SERVER_MIMIC feature";
10382 err = -EPERM;
10383 goto reply;
10384 }
10385 } else if (rel == CEPH_RELEASE_NAUTILUS) {
10386 if (!mon->monmap->get_required_features().contains_all(
10387 ceph::features::mon::FEATURE_NAUTILUS)) {
10388 ss << "not all mons are nautilus";
10389 err = -EPERM;
10390 goto reply;
10391 }
10392 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_NAUTILUS))
10393 && !sure) {
10394 ss << "not all up OSDs have CEPH_FEATURE_SERVER_NAUTILUS feature";
10395 err = -EPERM;
10396 goto reply;
10397 }
10398 } else {
10399 ss << "not supported for this release yet";
10400 err = -EPERM;
10401 goto reply;
10402 }
10403 if (rel < osdmap.require_osd_release) {
10404 ss << "require_osd_release cannot be lowered once it has been set";
10405 err = -EPERM;
10406 goto reply;
10407 }
10408 pending_inc.new_require_osd_release = rel;
10409 goto update;
10410 } else if (prefix == "osd down" ||
10411 prefix == "osd out" ||
10412 prefix == "osd in" ||
10413 prefix == "osd rm") {
10414
10415 bool any = false;
10416 bool stop = false;
10417 bool verbose = true;
10418
10419 vector<string> idvec;
10420 cmd_getval(cct, cmdmap, "ids", idvec);
10421 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
10422 set<int> osds;
10423
10424 // wildcard?
10425 if (j == 0 &&
10426 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
10427 if (prefix == "osd in") {
10428 // touch out osds only
10429 osdmap.get_out_existing_osds(osds);
10430 } else {
10431 osdmap.get_all_osds(osds);
10432 }
10433 stop = true;
10434 verbose = false; // so the output is less noisy.
10435 } else {
10436 long osd = parse_osd_id(idvec[j].c_str(), &ss);
10437 if (osd < 0) {
10438 ss << "invalid osd id" << osd;
10439 err = -EINVAL;
10440 continue;
10441 } else if (!osdmap.exists(osd)) {
10442 ss << "osd." << osd << " does not exist. ";
10443 continue;
10444 }
10445
10446 osds.insert(osd);
10447 }
10448
10449 for (auto &osd : osds) {
10450 if (prefix == "osd down") {
10451 if (osdmap.is_down(osd)) {
10452 if (verbose)
10453 ss << "osd." << osd << " is already down. ";
10454 } else {
10455 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
10456 ss << "marked down osd." << osd << ". ";
10457 any = true;
10458 }
10459 } else if (prefix == "osd out") {
10460 if (osdmap.is_out(osd)) {
10461 if (verbose)
10462 ss << "osd." << osd << " is already out. ";
10463 } else {
10464 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
10465 if (osdmap.osd_weight[osd]) {
10466 if (pending_inc.new_xinfo.count(osd) == 0) {
10467 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
10468 }
10469 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
10470 }
10471 ss << "marked out osd." << osd << ". ";
10472 std::ostringstream msg;
10473 msg << "Client " << op->get_session()->entity_name
10474 << " marked osd." << osd << " out";
10475 if (osdmap.is_up(osd)) {
10476 msg << ", while it was still marked up";
10477 } else {
10478 auto period = ceph_clock_now() - down_pending_out[osd];
10479 msg << ", after it was down for " << int(period.sec())
10480 << " seconds";
10481 }
10482
10483 mon->clog->info() << msg.str();
10484 any = true;
10485 }
10486 } else if (prefix == "osd in") {
10487 if (osdmap.is_in(osd)) {
10488 if (verbose)
10489 ss << "osd." << osd << " is already in. ";
10490 } else {
10491 if (osdmap.osd_xinfo[osd].old_weight > 0) {
10492 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
10493 if (pending_inc.new_xinfo.count(osd) == 0) {
10494 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
10495 }
10496 pending_inc.new_xinfo[osd].old_weight = 0;
10497 } else {
10498 pending_inc.new_weight[osd] = CEPH_OSD_IN;
10499 }
10500 ss << "marked in osd." << osd << ". ";
10501 any = true;
10502 }
10503 } else if (prefix == "osd rm") {
10504 err = prepare_command_osd_remove(osd);
10505
10506 if (err == -EBUSY) {
10507 if (any)
10508 ss << ", ";
10509 ss << "osd." << osd << " is still up; must be down before removal. ";
10510 } else {
10511 ceph_assert(err == 0);
10512 if (any) {
10513 ss << ", osd." << osd;
10514 } else {
10515 ss << "removed osd." << osd;
10516 }
10517 any = true;
10518 }
10519 }
10520 }
10521 }
10522 if (any) {
10523 getline(ss, rs);
10524 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
10525 get_last_committed() + 1));
10526 return true;
10527 }
10528 } else if (prefix == "osd set-group" ||
10529 prefix == "osd unset-group" ||
10530 prefix == "osd add-noup" ||
10531 prefix == "osd add-nodown" ||
10532 prefix == "osd add-noin" ||
10533 prefix == "osd add-noout" ||
10534 prefix == "osd rm-noup" ||
10535 prefix == "osd rm-nodown" ||
10536 prefix == "osd rm-noin" ||
10537 prefix == "osd rm-noout") {
10538 bool do_set = prefix == "osd set-group" ||
10539 prefix.find("add") != string::npos;
10540 string flag_str;
10541 unsigned flags = 0;
10542 vector<string> who;
10543 if (prefix == "osd set-group" || prefix == "osd unset-group") {
10544 cmd_getval(cct, cmdmap, "flags", flag_str);
10545 cmd_getval(cct, cmdmap, "who", who);
10546 vector<string> raw_flags;
10547 boost::split(raw_flags, flag_str, boost::is_any_of(","));
10548 for (auto& f : raw_flags) {
10549 if (f == "noup")
10550 flags |= CEPH_OSD_NOUP;
10551 else if (f == "nodown")
10552 flags |= CEPH_OSD_NODOWN;
10553 else if (f == "noin")
10554 flags |= CEPH_OSD_NOIN;
10555 else if (f == "noout")
10556 flags |= CEPH_OSD_NOOUT;
10557 else {
10558 ss << "unrecognized flag '" << f << "', must be one of "
10559 << "{noup,nodown,noin,noout}";
10560 err = -EINVAL;
10561 goto reply;
10562 }
10563 }
10564 } else {
10565 cmd_getval(cct, cmdmap, "ids", who);
10566 if (prefix.find("noup") != string::npos)
10567 flags = CEPH_OSD_NOUP;
10568 else if (prefix.find("nodown") != string::npos)
10569 flags = CEPH_OSD_NODOWN;
10570 else if (prefix.find("noin") != string::npos)
10571 flags = CEPH_OSD_NOIN;
10572 else if (prefix.find("noout") != string::npos)
10573 flags = CEPH_OSD_NOOUT;
10574 else
10575 ceph_assert(0 == "Unreachable!");
10576 }
10577 if (flags == 0) {
10578 ss << "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
10579 err = -EINVAL;
10580 goto reply;
10581 }
10582 if (who.empty()) {
10583 ss << "must specify at least one or more targets to set/unset";
10584 err = -EINVAL;
10585 goto reply;
10586 }
10587 set<int> osds;
10588 set<int> crush_nodes;
10589 set<int> device_classes;
10590 for (auto& w : who) {
10591 if (w == "any" || w == "all" || w == "*") {
10592 osdmap.get_all_osds(osds);
10593 break;
10594 }
10595 std::stringstream ts;
10596 if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) {
10597 osds.insert(osd);
10598 } else if (osdmap.crush->name_exists(w)) {
10599 crush_nodes.insert(osdmap.crush->get_item_id(w));
10600 } else if (osdmap.crush->class_exists(w)) {
10601 device_classes.insert(osdmap.crush->get_class_id(w));
10602 } else {
10603 ss << "unable to parse osd id or crush node or device class: "
10604 << "\"" << w << "\". ";
10605 }
10606 }
10607 if (osds.empty() && crush_nodes.empty() && device_classes.empty()) {
10608 // ss has reason for failure
10609 err = -EINVAL;
10610 goto reply;
10611 }
10612 bool any = false;
10613 for (auto osd : osds) {
10614 if (!osdmap.exists(osd)) {
10615 ss << "osd." << osd << " does not exist. ";
10616 continue;
10617 }
10618 if (do_set) {
10619 if (flags & CEPH_OSD_NOUP) {
10620 any |= osdmap.is_noup_by_osd(osd) ?
10621 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) :
10622 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
10623 }
10624 if (flags & CEPH_OSD_NODOWN) {
10625 any |= osdmap.is_nodown_by_osd(osd) ?
10626 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) :
10627 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
10628 }
10629 if (flags & CEPH_OSD_NOIN) {
10630 any |= osdmap.is_noin_by_osd(osd) ?
10631 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) :
10632 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
10633 }
10634 if (flags & CEPH_OSD_NOOUT) {
10635 any |= osdmap.is_noout_by_osd(osd) ?
10636 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) :
10637 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
10638 }
10639 } else {
10640 if (flags & CEPH_OSD_NOUP) {
10641 any |= osdmap.is_noup_by_osd(osd) ?
10642 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) :
10643 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP);
10644 }
10645 if (flags & CEPH_OSD_NODOWN) {
10646 any |= osdmap.is_nodown_by_osd(osd) ?
10647 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) :
10648 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
10649 }
10650 if (flags & CEPH_OSD_NOIN) {
10651 any |= osdmap.is_noin_by_osd(osd) ?
10652 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) :
10653 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN);
10654 }
10655 if (flags & CEPH_OSD_NOOUT) {
10656 any |= osdmap.is_noout_by_osd(osd) ?
10657 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) :
10658 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
10659 }
10660 }
10661 }
10662 for (auto& id : crush_nodes) {
10663 auto old_flags = osdmap.get_crush_node_flags(id);
10664 auto& pending_flags = pending_inc.new_crush_node_flags[id];
10665 pending_flags |= old_flags; // adopt existing flags first!
10666 if (do_set) {
10667 pending_flags |= flags;
10668 } else {
10669 pending_flags &= ~flags;
10670 }
10671 any = true;
10672 }
10673 for (auto& id : device_classes) {
10674 auto old_flags = osdmap.get_device_class_flags(id);
10675 auto& pending_flags = pending_inc.new_device_class_flags[id];
10676 pending_flags |= old_flags;
10677 if (do_set) {
10678 pending_flags |= flags;
10679 } else {
10680 pending_flags &= ~flags;
10681 }
10682 any = true;
10683 }
10684 if (any) {
10685 getline(ss, rs);
10686 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
10687 get_last_committed() + 1));
10688 return true;
10689 }
10690 } else if (prefix == "osd pg-temp") {
10691 string pgidstr;
10692 if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
10693 ss << "unable to parse 'pgid' value '"
10694 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
10695 err = -EINVAL;
10696 goto reply;
10697 }
10698 pg_t pgid;
10699 if (!pgid.parse(pgidstr.c_str())) {
10700 ss << "invalid pgid '" << pgidstr << "'";
10701 err = -EINVAL;
10702 goto reply;
10703 }
10704 if (!osdmap.pg_exists(pgid)) {
10705 ss << "pg " << pgid << " does not exist";
10706 err = -ENOENT;
10707 goto reply;
10708 }
10709 if (pending_inc.new_pg_temp.count(pgid)) {
10710 dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
10711 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10712 return true;
10713 }
10714
10715 vector<int64_t> id_vec;
10716 vector<int32_t> new_pg_temp;
10717 cmd_getval(cct, cmdmap, "id", id_vec);
10718 if (id_vec.empty()) {
10719 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>();
10720 ss << "done cleaning up pg_temp of " << pgid;
10721 goto update;
10722 }
10723 for (auto osd : id_vec) {
10724 if (!osdmap.exists(osd)) {
10725 ss << "osd." << osd << " does not exist";
10726 err = -ENOENT;
10727 goto reply;
10728 }
10729 new_pg_temp.push_back(osd);
10730 }
10731
10732 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
10733 if ((int)new_pg_temp.size() < pool_min_size) {
10734 ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
10735 << pool_min_size << ")";
10736 err = -EINVAL;
10737 goto reply;
10738 }
10739
10740 int pool_size = osdmap.get_pg_pool_size(pgid);
10741 if ((int)new_pg_temp.size() > pool_size) {
10742 ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
10743 << pool_size << ")";
10744 err = -EINVAL;
10745 goto reply;
10746 }
10747
10748 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
10749 new_pg_temp.begin(), new_pg_temp.end());
10750 ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
10751 goto update;
  } else if (prefix == "osd primary-temp") {
    // Set an explicit temporary primary for a PG via pending_inc.
    // Validates the pgid, the osd id, and the cluster's client-compat level
    // before recording the mapping.
    string pgidstr;
    if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
         << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }

    int64_t osd;
    if (!cmd_getval(cct, cmdmap, "id", osd)) {
      ss << "unable to parse 'id' value '"
         << cmd_vartype_stringify(cmdmap.at("id")) << "'";
      err = -EINVAL;
      goto reply;
    }
    // An id of -1 is accepted without an existence check (presumably it
    // clears the primary_temp mapping — confirm against OSDMap semantics).
    if (osd != -1 && !osdmap.exists(osd)) {
      ss << "osd." << osd << " does not exist";
      err = -ENOENT;
      goto reply;
    }

    // primary_temp requires clients new enough to understand it (firefly+).
    if (osdmap.require_min_compat_client > 0 &&
        osdmap.require_min_compat_client < CEPH_RELEASE_FIREFLY) {
      ss << "require_min_compat_client "
         << ceph_release_name(osdmap.require_min_compat_client)
         << " < firefly, which is required for primary-temp";
      err = -EPERM;
      goto reply;
    }

    pending_inc.new_primary_temp[pgid] = osd;
    ss << "set " << pgid << " primary_temp mapping to " << osd;
    goto update;
  } else if (prefix == "pg repeer") {
    // Force a PG to re-peer by perturbing its pg_temp mapping; the OSDs will
    // then converge back to the mapping they actually want.
    pg_t pgid;
    string pgidstr;
    cmd_getval(cct, cmdmap, "pgid", pgidstr);
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg '" << pgidstr << "' does not exist";
      err = -ENOENT;
      goto reply;
    }
    vector<int> acting;
    int primary;
    osdmap.pg_to_acting_osds(pgid, &acting, &primary);
    if (primary < 0) {
      // No primary to perturb; ask the caller to retry later.
      err = -EAGAIN;
      ss << "pg currently has no primary";
      goto reply;
    }
    if (acting.size() > 1) {
      // map to just primary; it will map back to what it wants
      pending_inc.new_pg_temp[pgid] = { primary };
    } else {
      // hmm, pick another arbitrary osd to induce a change. Note
      // that this won't work if there is only one suitable OSD in the cluster.
      int i;
      bool done = false;
      for (i = 0; i < osdmap.get_max_osd(); ++i) {
        // Skip the current primary and any osd that is down or nonexistent.
        if (i == primary || !osdmap.is_up(i) || !osdmap.exists(i)) {
          continue;
        }
        pending_inc.new_pg_temp[pgid] = { primary, i };
        done = true;
        break;
      }
      if (!done) {
        err = -EAGAIN;
        ss << "not enough up OSDs in the cluster to force repeer";
        goto reply;
      }
    }
    goto update;
10842 } else if (prefix == "osd pg-upmap" ||
10843 prefix == "osd rm-pg-upmap" ||
10844 prefix == "osd pg-upmap-items" ||
10845 prefix == "osd rm-pg-upmap-items") {
10846 if (osdmap.require_min_compat_client < CEPH_RELEASE_LUMINOUS) {
10847 ss << "min_compat_client "
10848 << ceph_release_name(osdmap.require_min_compat_client)
10849 << " < luminous, which is required for pg-upmap. "
10850 << "Try 'ceph osd set-require-min-compat-client luminous' "
10851 << "before using the new interface";
10852 err = -EPERM;
10853 goto reply;
10854 }
10855 err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
10856 if (err == -EAGAIN)
10857 goto wait;
10858 if (err < 0)
10859 goto reply;
10860 string pgidstr;
10861 if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
10862 ss << "unable to parse 'pgid' value '"
10863 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
10864 err = -EINVAL;
10865 goto reply;
10866 }
10867 pg_t pgid;
10868 if (!pgid.parse(pgidstr.c_str())) {
10869 ss << "invalid pgid '" << pgidstr << "'";
10870 err = -EINVAL;
10871 goto reply;
10872 }
10873 if (!osdmap.pg_exists(pgid)) {
10874 ss << "pg " << pgid << " does not exist";
10875 err = -ENOENT;
10876 goto reply;
10877 }
10878 if (pending_inc.old_pools.count(pgid.pool())) {
10879 ss << "pool of " << pgid << " is pending removal";
10880 err = -ENOENT;
10881 getline(ss, rs);
10882 wait_for_finished_proposal(op,
10883 new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
10884 return true;
10885 }
10886
10887 enum {
10888 OP_PG_UPMAP,
10889 OP_RM_PG_UPMAP,
10890 OP_PG_UPMAP_ITEMS,
10891 OP_RM_PG_UPMAP_ITEMS,
10892 } option;
10893
10894 if (prefix == "osd pg-upmap") {
10895 option = OP_PG_UPMAP;
10896 } else if (prefix == "osd rm-pg-upmap") {
10897 option = OP_RM_PG_UPMAP;
10898 } else if (prefix == "osd pg-upmap-items") {
10899 option = OP_PG_UPMAP_ITEMS;
10900 } else {
10901 option = OP_RM_PG_UPMAP_ITEMS;
10902 }
10903
10904 // check pending upmap changes
10905 switch (option) {
10906 case OP_PG_UPMAP: // fall through
10907 case OP_RM_PG_UPMAP:
10908 if (pending_inc.new_pg_upmap.count(pgid) ||
10909 pending_inc.old_pg_upmap.count(pgid)) {
10910 dout(10) << __func__ << " waiting for pending update on "
10911 << pgid << dendl;
10912 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10913 return true;
10914 }
10915 break;
10916
10917 case OP_PG_UPMAP_ITEMS: // fall through
10918 case OP_RM_PG_UPMAP_ITEMS:
10919 if (pending_inc.new_pg_upmap_items.count(pgid) ||
10920 pending_inc.old_pg_upmap_items.count(pgid)) {
10921 dout(10) << __func__ << " waiting for pending update on "
10922 << pgid << dendl;
10923 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10924 return true;
10925 }
10926 break;
10927
10928 default:
10929 ceph_abort_msg("invalid option");
10930 }
10931
10932 switch (option) {
10933 case OP_PG_UPMAP:
10934 {
10935 vector<int64_t> id_vec;
10936 if (!cmd_getval(cct, cmdmap, "id", id_vec)) {
10937 ss << "unable to parse 'id' value(s) '"
10938 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
10939 err = -EINVAL;
10940 goto reply;
10941 }
10942
10943 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
10944 if ((int)id_vec.size() < pool_min_size) {
10945 ss << "num of osds (" << id_vec.size() <<") < pool min size ("
10946 << pool_min_size << ")";
10947 err = -EINVAL;
10948 goto reply;
10949 }
10950
10951 int pool_size = osdmap.get_pg_pool_size(pgid);
10952 if ((int)id_vec.size() > pool_size) {
10953 ss << "num of osds (" << id_vec.size() <<") > pool size ("
10954 << pool_size << ")";
10955 err = -EINVAL;
10956 goto reply;
10957 }
10958
10959 vector<int32_t> new_pg_upmap;
10960 for (auto osd : id_vec) {
10961 if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
10962 ss << "osd." << osd << " does not exist";
10963 err = -ENOENT;
10964 goto reply;
10965 }
10966 auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
10967 if (it != new_pg_upmap.end()) {
10968 ss << "osd." << osd << " already exists, ";
10969 continue;
10970 }
10971 new_pg_upmap.push_back(osd);
10972 }
10973
10974 if (new_pg_upmap.empty()) {
10975 ss << "no valid upmap items(pairs) is specified";
10976 err = -EINVAL;
10977 goto reply;
10978 }
10979
10980 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
10981 new_pg_upmap.begin(), new_pg_upmap.end());
10982 ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
10983 }
10984 break;
10985
10986 case OP_RM_PG_UPMAP:
10987 {
10988 pending_inc.old_pg_upmap.insert(pgid);
10989 ss << "clear " << pgid << " pg_upmap mapping";
10990 }
10991 break;
10992
10993 case OP_PG_UPMAP_ITEMS:
10994 {
10995 vector<int64_t> id_vec;
10996 if (!cmd_getval(cct, cmdmap, "id", id_vec)) {
10997 ss << "unable to parse 'id' value(s) '"
10998 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
10999 err = -EINVAL;
11000 goto reply;
11001 }
11002
11003 if (id_vec.size() % 2) {
11004 ss << "you must specify pairs of osd ids to be remapped";
11005 err = -EINVAL;
11006 goto reply;
11007 }
11008
11009 int pool_size = osdmap.get_pg_pool_size(pgid);
11010 if ((int)(id_vec.size() / 2) > pool_size) {
11011 ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
11012 << pool_size << ")";
11013 err = -EINVAL;
11014 goto reply;
11015 }
11016
11017 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
11018 ostringstream items;
11019 items << "[";
11020 for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
11021 int from = *p++;
11022 int to = *p;
11023 if (from == to) {
11024 ss << "from osd." << from << " == to osd." << to << ", ";
11025 continue;
11026 }
11027 if (!osdmap.exists(from)) {
11028 ss << "osd." << from << " does not exist";
11029 err = -ENOENT;
11030 goto reply;
11031 }
11032 if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
11033 ss << "osd." << to << " does not exist";
11034 err = -ENOENT;
11035 goto reply;
11036 }
11037 pair<int32_t,int32_t> entry = make_pair(from, to);
11038 auto it = std::find(new_pg_upmap_items.begin(),
11039 new_pg_upmap_items.end(), entry);
11040 if (it != new_pg_upmap_items.end()) {
11041 ss << "osd." << from << " -> osd." << to << " already exists, ";
11042 continue;
11043 }
11044 new_pg_upmap_items.push_back(entry);
11045 items << from << "->" << to << ",";
11046 }
11047 string out(items.str());
11048 out.resize(out.size() - 1); // drop last ','
11049 out += "]";
11050
11051 if (new_pg_upmap_items.empty()) {
11052 ss << "no valid upmap items(pairs) is specified";
11053 err = -EINVAL;
11054 goto reply;
11055 }
11056
11057 pending_inc.new_pg_upmap_items[pgid] =
11058 mempool::osdmap::vector<pair<int32_t,int32_t>>(
11059 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
11060 ss << "set " << pgid << " pg_upmap_items mapping to " << out;
11061 }
11062 break;
11063
11064 case OP_RM_PG_UPMAP_ITEMS:
11065 {
11066 pending_inc.old_pg_upmap_items.insert(pgid);
11067 ss << "clear " << pgid << " pg_upmap_items mapping";
11068 }
11069 break;
11070
11071 default:
11072 ceph_abort_msg("invalid option");
11073 }
11074
11075 goto update;
11076 } else if (prefix == "osd primary-affinity") {
11077 int64_t id;
11078 if (!cmd_getval(cct, cmdmap, "id", id)) {
11079 ss << "invalid osd id value '"
11080 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
11081 err = -EINVAL;
11082 goto reply;
11083 }
11084 double w;
11085 if (!cmd_getval(cct, cmdmap, "weight", w)) {
11086 ss << "unable to parse 'weight' value '"
11087 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
11088 err = -EINVAL;
11089 goto reply;
11090 }
11091 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
11092 if (ww < 0L) {
11093 ss << "weight must be >= 0";
11094 err = -EINVAL;
11095 goto reply;
11096 }
11097 if (osdmap.require_min_compat_client > 0 &&
11098 osdmap.require_min_compat_client < CEPH_RELEASE_FIREFLY) {
11099 ss << "require_min_compat_client "
11100 << ceph_release_name(osdmap.require_min_compat_client)
11101 << " < firefly, which is required for primary-affinity";
11102 err = -EPERM;
11103 goto reply;
11104 }
11105 if (osdmap.exists(id)) {
11106 pending_inc.new_primary_affinity[id] = ww;
11107 ss << "set osd." << id << " primary-affinity to " << w << " (" << ios::hex << ww << ios::dec << ")";
11108 getline(ss, rs);
11109 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11110 get_last_committed() + 1));
11111 return true;
11112 } else {
11113 ss << "osd." << id << " does not exist";
11114 err = -ENOENT;
11115 goto reply;
11116 }
  } else if (prefix == "osd reweight") {
    // Set an OSD's override reweight. The user-supplied floating-point
    // weight is stored as an integer scaled by CEPH_OSD_IN.
    int64_t id;
    if (!cmd_getval(cct, cmdmap, "id", id)) {
      ss << "unable to parse osd id value '"
         << cmd_vartype_stringify(cmdmap.at("id")) << "'";
      err = -EINVAL;
      goto reply;
    }
    double w;
    if (!cmd_getval(cct, cmdmap, "weight", w)) {
      ss << "unable to parse weight value '"
         << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
      err = -EINVAL;
      goto reply;
    }
    long ww = (int)((double)CEPH_OSD_IN*w);
    if (ww < 0L) {
      ss << "weight must be >= 0";
      err = -EINVAL;
      goto reply;
    }
    if (osdmap.exists(id)) {
      pending_inc.new_weight[id] = ww;
      // Report both the user's value and the raw stored value (in hex).
      ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                get_last_committed() + 1));
      return true;
    } else {
      ss << "osd." << id << " does not exist";
      err = -ENOENT;
      goto reply;
    }
11150 } else if (prefix == "osd reweightn") {
11151 map<int32_t, uint32_t> weights;
11152 err = parse_reweights(cct, cmdmap, osdmap, &weights);
11153 if (err) {
11154 ss << "unable to parse 'weights' value '"
11155 << cmd_vartype_stringify(cmdmap.at("weights")) << "'";
11156 goto reply;
11157 }
11158 pending_inc.new_weight.insert(weights.begin(), weights.end());
11159 wait_for_finished_proposal(
11160 op,
11161 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
11162 return true;
  } else if (prefix == "osd lost") {
    // Mark a down OSD as lost, recorded at the epoch it went down. Requires
    // --yes-i-really-mean-it since this can mean permanent data loss.
    int64_t id;
    if (!cmd_getval(cct, cmdmap, "id", id)) {
      ss << "unable to parse osd id value '"
         << cmd_vartype_stringify(cmdmap.at("id")) << "'";
      err = -EINVAL;
      goto reply;
    }
    bool sure = false;
    cmd_getval(g_ceph_context, cmdmap, "yes_i_really_mean_it", sure);
    if (!sure) {
      ss << "are you SURE? this might mean real, permanent data loss. pass "
            "--yes-i-really-mean-it if you really do.";
      err = -EPERM;
      goto reply;
    } else if (!osdmap.exists(id)) {
      ss << "osd." << id << " does not exist";
      err = -ENOENT;
      goto reply;
    } else if (!osdmap.is_down(id)) {
      // Only a down OSD can be declared lost.
      ss << "osd." << id << " is not down";
      err = -EBUSY;
      goto reply;
    } else {
      // Record the loss at the epoch the OSD was marked down.
      epoch_t e = osdmap.get_info(id).down_at;
      pending_inc.new_lost[id] = e;
      ss << "marked osd lost in epoch " << e;
      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                get_last_committed() + 1));
      return true;
    }
11195
11196 } else if (prefix == "osd destroy-actual" ||
11197 prefix == "osd purge-actual" ||
11198 prefix == "osd purge-new") {
11199 /* Destroying an OSD means that we don't expect to further make use of
11200 * the OSDs data (which may even become unreadable after this operation),
11201 * and that we are okay with scrubbing all its cephx keys and config-key
11202 * data (which may include lockbox keys, thus rendering the osd's data
11203 * unreadable).
11204 *
11205 * The OSD will not be removed. Instead, we will mark it as destroyed,
11206 * such that a subsequent call to `create` will not reuse the osd id.
11207 * This will play into being able to recreate the OSD, at the same
11208 * crush location, with minimal data movement.
11209 */
11210
11211 // make sure authmon is writeable.
11212 if (!mon->authmon()->is_writeable()) {
11213 dout(10) << __func__ << " waiting for auth mon to be writeable for "
11214 << "osd destroy" << dendl;
11215 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
11216 return false;
11217 }
11218
11219 int64_t id;
11220 if (!cmd_getval(cct, cmdmap, "id", id)) {
11221 auto p = cmdmap.find("id");
11222 if (p == cmdmap.end()) {
11223 ss << "no osd id specified";
11224 } else {
11225 ss << "unable to parse osd id value '"
11226 << cmd_vartype_stringify(cmdmap.at("id")) << "";
11227 }
11228 err = -EINVAL;
11229 goto reply;
11230 }
11231
11232 bool is_destroy = (prefix == "osd destroy-actual");
11233 if (!is_destroy) {
11234 ceph_assert("osd purge-actual" == prefix ||
11235 "osd purge-new" == prefix);
11236 }
11237
11238 bool sure = false;
11239 cmd_getval(g_ceph_context, cmdmap, "yes_i_really_mean_it", sure);
11240 if (!sure) {
11241 ss << "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
11242 << "This will mean real, permanent data loss, as well "
11243 << "as deletion of cephx and lockbox keys. "
11244 << "Pass --yes-i-really-mean-it if you really do.";
11245 err = -EPERM;
11246 goto reply;
11247 } else if (!osdmap.exists(id)) {
11248 ss << "osd." << id << " does not exist";
11249 err = 0; // idempotent
11250 goto reply;
11251 } else if (osdmap.is_up(id)) {
11252 ss << "osd." << id << " is not `down`.";
11253 err = -EBUSY;
11254 goto reply;
11255 } else if (is_destroy && osdmap.is_destroyed(id)) {
11256 ss << "destroyed osd." << id;
11257 err = 0;
11258 goto reply;
11259 }
11260
11261 if (prefix == "osd purge-new" &&
11262 (osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
11263 ss << "osd." << id << " is not new";
11264 err = -EPERM;
11265 goto reply;
11266 }
11267
11268 bool goto_reply = false;
11269
11270 paxos->plug();
11271 if (is_destroy) {
11272 err = prepare_command_osd_destroy(id, ss);
11273 // we checked above that it should exist.
11274 ceph_assert(err != -ENOENT);
11275 } else {
11276 err = prepare_command_osd_purge(id, ss);
11277 if (err == -ENOENT) {
11278 err = 0;
11279 ss << "osd." << id << " does not exist.";
11280 goto_reply = true;
11281 }
11282 }
11283 paxos->unplug();
11284
11285 if (err < 0 || goto_reply) {
11286 goto reply;
11287 }
11288
11289 if (is_destroy) {
11290 ss << "destroyed osd." << id;
11291 } else {
11292 ss << "purged osd." << id;
11293 }
11294
11295 getline(ss, rs);
11296 wait_for_finished_proposal(op,
11297 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
11298 force_immediate_propose();
11299 return true;
11300
11301 } else if (prefix == "osd new") {
11302
11303 // make sure authmon is writeable.
11304 if (!mon->authmon()->is_writeable()) {
11305 dout(10) << __func__ << " waiting for auth mon to be writeable for "
11306 << "osd new" << dendl;
11307 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
11308 return false;
11309 }
11310
11311 map<string,string> param_map;
11312
11313 bufferlist bl = m->get_data();
11314 string param_json = bl.to_str();
11315 dout(20) << __func__ << " osd new json = " << param_json << dendl;
11316
11317 err = get_json_str_map(param_json, ss, &param_map);
11318 if (err < 0)
11319 goto reply;
11320
11321 dout(20) << __func__ << " osd new params " << param_map << dendl;
11322
11323 paxos->plug();
11324 err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
11325 paxos->unplug();
11326
11327 if (err < 0) {
11328 goto reply;
11329 }
11330
11331 if (f) {
11332 f->flush(rdata);
11333 } else {
11334 rdata.append(ss);
11335 }
11336
11337 if (err == EEXIST) {
11338 // idempotent operation
11339 err = 0;
11340 goto reply;
11341 }
11342
11343 wait_for_finished_proposal(op,
11344 new Monitor::C_Command(mon, op, 0, rs, rdata,
11345 get_last_committed() + 1));
11346 force_immediate_propose();
11347 return true;
11348
11349 } else if (prefix == "osd create") {
11350
11351 // optional id provided?
11352 int64_t id = -1, cmd_id = -1;
11353 if (cmd_getval(cct, cmdmap, "id", cmd_id)) {
11354 if (cmd_id < 0) {
11355 ss << "invalid osd id value '" << cmd_id << "'";
11356 err = -EINVAL;
11357 goto reply;
11358 }
11359 dout(10) << " osd create got id " << cmd_id << dendl;
11360 }
11361
11362 uuid_d uuid;
11363 string uuidstr;
11364 if (cmd_getval(cct, cmdmap, "uuid", uuidstr)) {
11365 if (!uuid.parse(uuidstr.c_str())) {
11366 ss << "invalid uuid value '" << uuidstr << "'";
11367 err = -EINVAL;
11368 goto reply;
11369 }
11370 // we only care about the id if we also have the uuid, to
11371 // ensure the operation's idempotency.
11372 id = cmd_id;
11373 }
11374
11375 int32_t new_id = -1;
11376 err = prepare_command_osd_create(id, uuid, &new_id, ss);
11377 if (err < 0) {
11378 if (err == -EAGAIN) {
11379 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11380 return true;
11381 }
11382 // a check has failed; reply to the user.
11383 goto reply;
11384
11385 } else if (err == EEXIST) {
11386 // this is an idempotent operation; we can go ahead and reply.
11387 if (f) {
11388 f->open_object_section("created_osd");
11389 f->dump_int("osdid", new_id);
11390 f->close_section();
11391 f->flush(rdata);
11392 } else {
11393 ss << new_id;
11394 rdata.append(ss);
11395 }
11396 err = 0;
11397 goto reply;
11398 }
11399
11400 string empty_device_class;
11401 do_osd_create(id, uuid, empty_device_class, &new_id);
11402
11403 if (f) {
11404 f->open_object_section("created_osd");
11405 f->dump_int("osdid", new_id);
11406 f->close_section();
11407 f->flush(rdata);
11408 } else {
11409 ss << new_id;
11410 rdata.append(ss);
11411 }
11412 wait_for_finished_proposal(op,
11413 new Monitor::C_Command(mon, op, 0, rs, rdata,
11414 get_last_committed() + 1));
11415 return true;
11416
  } else if (prefix == "osd blacklist clear") {
    // Remove every blacklist entry: drop any pending additions, then queue
    // removal of each entry currently in the committed osdmap.
    pending_inc.new_blacklist.clear();
    std::list<std::pair<entity_addr_t,utime_t > > blacklist;
    osdmap.get_blacklist(&blacklist);
    for (const auto &entry : blacklist) {
      pending_inc.old_blacklist.push_back(entry.first);
    }
    ss << " removed all blacklist entries";
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                              get_last_committed() + 1));
    return true;
11429 } else if (prefix == "osd blacklist") {
11430 string addrstr;
11431 cmd_getval(cct, cmdmap, "addr", addrstr);
11432 entity_addr_t addr;
11433 if (!addr.parse(addrstr.c_str(), 0)) {
11434 ss << "unable to parse address " << addrstr;
11435 err = -EINVAL;
11436 goto reply;
11437 }
11438 else {
11439 if (osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS) {
11440 // always blacklist type ANY
11441 addr.set_type(entity_addr_t::TYPE_ANY);
11442 } else {
11443 addr.set_type(entity_addr_t::TYPE_LEGACY);
11444 }
11445
11446 string blacklistop;
11447 cmd_getval(cct, cmdmap, "blacklistop", blacklistop);
11448 if (blacklistop == "add") {
11449 utime_t expires = ceph_clock_now();
11450 double d;
11451 // default one hour
11452 cmd_getval(cct, cmdmap, "expire", d,
11453 g_conf()->mon_osd_blacklist_default_expire);
11454 expires += d;
11455
11456 pending_inc.new_blacklist[addr] = expires;
11457
11458 {
11459 // cancel any pending un-blacklisting request too
11460 auto it = std::find(pending_inc.old_blacklist.begin(),
11461 pending_inc.old_blacklist.end(), addr);
11462 if (it != pending_inc.old_blacklist.end()) {
11463 pending_inc.old_blacklist.erase(it);
11464 }
11465 }
11466
11467 ss << "blacklisting " << addr << " until " << expires << " (" << d << " sec)";
11468 getline(ss, rs);
11469 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11470 get_last_committed() + 1));
11471 return true;
11472 } else if (blacklistop == "rm") {
11473 if (osdmap.is_blacklisted(addr) ||
11474 pending_inc.new_blacklist.count(addr)) {
11475 if (osdmap.is_blacklisted(addr))
11476 pending_inc.old_blacklist.push_back(addr);
11477 else
11478 pending_inc.new_blacklist.erase(addr);
11479 ss << "un-blacklisting " << addr;
11480 getline(ss, rs);
11481 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11482 get_last_committed() + 1));
11483 return true;
11484 }
11485 ss << addr << " isn't blacklisted";
11486 err = 0;
11487 goto reply;
11488 }
11489 }
  } else if (prefix == "osd pool mksnap") {
    // Create a pool snapshot. Rejected for pools in unmanaged-snaps mode and
    // for cache-tier pools; creating an already-existing snap is idempotent.
    string poolstr;
    cmd_getval(cct, cmdmap, "pool", poolstr);
    int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
    if (pool < 0) {
      ss << "unrecognized pool '" << poolstr << "'";
      err = -ENOENT;
      goto reply;
    }
    string snapname;
    cmd_getval(cct, cmdmap, "snap", snapname);
    const pg_pool_t *p = osdmap.get_pg_pool(pool);
    if (p->is_unmanaged_snaps_mode()) {
      ss << "pool " << poolstr << " is in unmanaged snaps mode";
      err = -EINVAL;
      goto reply;
    } else if (p->snap_exists(snapname.c_str())) {
      // Already present in the committed map: succeed without changes.
      ss << "pool " << poolstr << " snap " << snapname << " already exists";
      err = 0;
      goto reply;
    } else if (p->is_tier()) {
      ss << "pool " << poolstr << " is a cache tier";
      err = -EINVAL;
      goto reply;
    }
    // Operate on the pending copy of the pool if this proposal already has
    // one; otherwise seed a pending copy from the committed pool. (The
    // count() check avoids treating an operator[]-created default as a
    // pre-existing pending entry.)
    pg_pool_t *pp = 0;
    if (pending_inc.new_pools.count(pool))
      pp = &pending_inc.new_pools[pool];
    if (!pp) {
      pp = &pending_inc.new_pools[pool];
      *pp = *p;
    }
    // Re-check against the pending copy, which may already carry the snap
    // from an earlier command in this same proposal.
    if (pp->snap_exists(snapname.c_str())) {
      ss << "pool " << poolstr << " snap " << snapname << " already exists";
    } else {
      pp->add_snap(snapname.c_str(), ceph_clock_now());
      pp->set_snap_epoch(pending_inc.epoch);
      ss << "created pool " << poolstr << " snap " << snapname;
    }
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                              get_last_committed() + 1));
    return true;
11533 } else if (prefix == "osd pool rmsnap") {
11534 string poolstr;
11535 cmd_getval(cct, cmdmap, "pool", poolstr);
11536 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
11537 if (pool < 0) {
11538 ss << "unrecognized pool '" << poolstr << "'";
11539 err = -ENOENT;
11540 goto reply;
11541 }
11542 string snapname;
11543 cmd_getval(cct, cmdmap, "snap", snapname);
11544 const pg_pool_t *p = osdmap.get_pg_pool(pool);
11545 if (p->is_unmanaged_snaps_mode()) {
11546 ss << "pool " << poolstr << " is in unmanaged snaps mode";
11547 err = -EINVAL;
11548 goto reply;
11549 } else if (!p->snap_exists(snapname.c_str())) {
11550 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
11551 err = 0;
11552 goto reply;
11553 }
11554 pg_pool_t *pp = 0;
11555 if (pending_inc.new_pools.count(pool))
11556 pp = &pending_inc.new_pools[pool];
11557 if (!pp) {
11558 pp = &pending_inc.new_pools[pool];
11559 *pp = *p;
11560 }
11561 snapid_t sn = pp->snap_exists(snapname.c_str());
11562 if (sn) {
11563 pp->remove_snap(sn);
11564 pp->set_snap_epoch(pending_inc.epoch);
11565 ss << "removed pool " << poolstr << " snap " << snapname;
11566 } else {
11567 ss << "already removed pool " << poolstr << " snap " << snapname;
11568 }
11569 getline(ss, rs);
11570 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11571 get_last_committed() + 1));
11572 return true;
11573 } else if (prefix == "osd pool create") {
11574 int64_t pg_num, pg_num_min;
11575 int64_t pgp_num;
11576 cmd_getval(cct, cmdmap, "pg_num", pg_num, int64_t(0));
11577 cmd_getval(cct, cmdmap, "pgp_num", pgp_num, pg_num);
11578 cmd_getval(cct, cmdmap, "pg_num_min", pg_num_min, int64_t(0));
11579
11580 string pool_type_str;
11581 cmd_getval(cct, cmdmap, "pool_type", pool_type_str);
11582 if (pool_type_str.empty())
11583 pool_type_str = g_conf().get_val<string>("osd_pool_default_type");
11584
11585 string poolstr;
11586 cmd_getval(cct, cmdmap, "pool", poolstr);
11587 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11588 if (pool_id >= 0) {
11589 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11590 if (pool_type_str != p->get_type_name()) {
11591 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
11592 err = -EINVAL;
11593 } else {
11594 ss << "pool '" << poolstr << "' already exists";
11595 err = 0;
11596 }
11597 goto reply;
11598 }
11599
11600 int pool_type;
11601 if (pool_type_str == "replicated") {
11602 pool_type = pg_pool_t::TYPE_REPLICATED;
11603 } else if (pool_type_str == "erasure") {
11604 pool_type = pg_pool_t::TYPE_ERASURE;
11605 } else {
11606 ss << "unknown pool type '" << pool_type_str << "'";
11607 err = -EINVAL;
11608 goto reply;
11609 }
11610
11611 bool implicit_rule_creation = false;
11612 int64_t expected_num_objects = 0;
11613 string rule_name;
11614 cmd_getval(cct, cmdmap, "rule", rule_name);
11615 string erasure_code_profile;
11616 cmd_getval(cct, cmdmap, "erasure_code_profile", erasure_code_profile);
11617
11618 if (pool_type == pg_pool_t::TYPE_ERASURE) {
11619 if (erasure_code_profile == "")
11620 erasure_code_profile = "default";
11621 //handle the erasure code profile
11622 if (erasure_code_profile == "default") {
11623 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
11624 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
11625 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
11626 goto wait;
11627 }
11628
11629 map<string,string> profile_map;
11630 err = osdmap.get_erasure_code_profile_default(cct,
11631 profile_map,
11632 &ss);
11633 if (err)
11634 goto reply;
11635 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
11636 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
11637 goto wait;
11638 }
11639 }
11640 if (rule_name == "") {
11641 implicit_rule_creation = true;
11642 if (erasure_code_profile == "default") {
11643 rule_name = "erasure-code";
11644 } else {
11645 dout(1) << "implicitly use rule named after the pool: "
11646 << poolstr << dendl;
11647 rule_name = poolstr;
11648 }
11649 }
11650 cmd_getval(g_ceph_context, cmdmap, "expected_num_objects",
11651 expected_num_objects, int64_t(0));
11652 } else {
11653 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
11654 // and put expected_num_objects to rule field
11655 if (erasure_code_profile != "") { // cmd is from CLI
11656 if (rule_name != "") {
11657 string interr;
11658 expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
11659 if (interr.length()) {
11660 ss << "error parsing integer value '" << rule_name << "': " << interr;
11661 err = -EINVAL;
11662 goto reply;
11663 }
11664 }
11665 rule_name = erasure_code_profile;
11666 } else { // cmd is well-formed
11667 cmd_getval(g_ceph_context, cmdmap, "expected_num_objects",
11668 expected_num_objects, int64_t(0));
11669 }
11670 }
11671
11672 if (!implicit_rule_creation && rule_name != "") {
11673 int rule;
11674 err = get_crush_rule(rule_name, &rule, &ss);
11675 if (err == -EAGAIN) {
11676 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11677 return true;
11678 }
11679 if (err)
11680 goto reply;
11681 }
11682
11683 if (expected_num_objects < 0) {
11684 ss << "'expected_num_objects' must be non-negative";
11685 err = -EINVAL;
11686 goto reply;
11687 }
11688
11689 if (expected_num_objects > 0 &&
11690 cct->_conf->osd_objectstore == "filestore" &&
11691 cct->_conf->filestore_merge_threshold > 0) {
11692 ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
11693 err = -EINVAL;
11694 goto reply;
11695 }
11696
11697 if (expected_num_objects == 0 &&
11698 cct->_conf->osd_objectstore == "filestore" &&
11699 cct->_conf->filestore_merge_threshold < 0) {
11700 int osds = osdmap.get_num_osds();
11701 if (osds && (pg_num >= 1024 || pg_num / osds >= 100)) {
11702 ss << "For better initial performance on pools expected to store a "
11703 << "large number of objects, consider supplying the "
11704 << "expected_num_objects parameter when creating the pool.\n";
11705 }
11706 }
11707
11708 int64_t fast_read_param;
11709 cmd_getval(cct, cmdmap, "fast_read", fast_read_param, int64_t(-1));
11710 FastReadType fast_read = FAST_READ_DEFAULT;
11711 if (fast_read_param == 0)
11712 fast_read = FAST_READ_OFF;
11713 else if (fast_read_param > 0)
11714 fast_read = FAST_READ_ON;
11715
11716 int64_t repl_size = 0;
11717 cmd_getval(cct, cmdmap, "size", repl_size);
11718 int64_t target_size_bytes = 0;
11719 double target_size_ratio = 0.0;
11720 cmd_getval(cct, cmdmap, "target_size_bytes", target_size_bytes);
11721 cmd_getval(cct, cmdmap, "target_size_ratio", target_size_ratio);
11722
11723 err = prepare_new_pool(poolstr,
11724 -1, // default crush rule
11725 rule_name,
11726 pg_num, pgp_num, pg_num_min,
11727 repl_size, target_size_bytes, target_size_ratio,
11728 erasure_code_profile, pool_type,
11729 (uint64_t)expected_num_objects,
11730 fast_read,
11731 &ss);
11732 if (err < 0) {
11733 switch(err) {
11734 case -EEXIST:
11735 ss << "pool '" << poolstr << "' already exists";
11736 break;
11737 case -EAGAIN:
11738 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11739 return true;
11740 case -ERANGE:
11741 goto reply;
11742 default:
11743 goto reply;
11744 break;
11745 }
11746 } else {
11747 ss << "pool '" << poolstr << "' created";
11748 }
11749 getline(ss, rs);
11750 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11751 get_last_committed() + 1));
11752 return true;
11753
11754 } else if (prefix == "osd pool delete" ||
11755 prefix == "osd pool rm") {
11756 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
11757 string poolstr, poolstr2, sure;
11758 cmd_getval(cct, cmdmap, "pool", poolstr);
11759 cmd_getval(cct, cmdmap, "pool2", poolstr2);
11760 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
11761 if (pool < 0) {
11762 ss << "pool '" << poolstr << "' does not exist";
11763 err = 0;
11764 goto reply;
11765 }
11766
11767 bool force_no_fake = false;
11768 cmd_getval(cct, cmdmap, "yes_i_really_really_mean_it", force_no_fake);
11769 bool force = false;
11770 cmd_getval(cct, cmdmap, "yes_i_really_really_mean_it_not_faking", force);
11771 if (poolstr2 != poolstr ||
11772 (!force && !force_no_fake)) {
11773 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
11774 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
11775 << "followed by --yes-i-really-really-mean-it.";
11776 err = -EPERM;
11777 goto reply;
11778 }
11779 err = _prepare_remove_pool(pool, &ss, force_no_fake);
11780 if (err == -EAGAIN) {
11781 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11782 return true;
11783 }
11784 if (err < 0)
11785 goto reply;
11786 goto update;
11787 } else if (prefix == "osd pool rename") {
11788 string srcpoolstr, destpoolstr;
11789 cmd_getval(cct, cmdmap, "srcpool", srcpoolstr);
11790 cmd_getval(cct, cmdmap, "destpool", destpoolstr);
11791 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
11792 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
11793
11794 if (pool_src < 0) {
11795 if (pool_dst >= 0) {
11796 // src pool doesn't exist, dst pool does exist: to ensure idempotency
11797 // of operations, assume this rename succeeded, as it is not changing
11798 // the current state. Make sure we output something understandable
11799 // for whoever is issuing the command, if they are paying attention,
11800 // in case it was not intentional; or to avoid a "wtf?" and a bug
11801 // report in case it was intentional, while expecting a failure.
11802 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
11803 << destpoolstr << "' does -- assuming successful rename";
11804 err = 0;
11805 } else {
11806 ss << "unrecognized pool '" << srcpoolstr << "'";
11807 err = -ENOENT;
11808 }
11809 goto reply;
11810 } else if (pool_dst >= 0) {
11811 // source pool exists and so does the destination pool
11812 ss << "pool '" << destpoolstr << "' already exists";
11813 err = -EEXIST;
11814 goto reply;
11815 }
11816
11817 int ret = _prepare_rename_pool(pool_src, destpoolstr);
11818 if (ret == 0) {
11819 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
11820 } else {
11821 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
11822 << cpp_strerror(ret);
11823 }
11824 getline(ss, rs);
11825 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
11826 get_last_committed() + 1));
11827 return true;
11828
11829 } else if (prefix == "osd pool set") {
11830 err = prepare_command_pool_set(cmdmap, ss);
11831 if (err == -EAGAIN)
11832 goto wait;
11833 if (err < 0)
11834 goto reply;
11835
11836 getline(ss, rs);
11837 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11838 get_last_committed() + 1));
11839 return true;
11840 } else if (prefix == "osd tier add") {
11841 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
11842 if (err == -EAGAIN)
11843 goto wait;
11844 if (err)
11845 goto reply;
11846 string poolstr;
11847 cmd_getval(cct, cmdmap, "pool", poolstr);
11848 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11849 if (pool_id < 0) {
11850 ss << "unrecognized pool '" << poolstr << "'";
11851 err = -ENOENT;
11852 goto reply;
11853 }
11854 string tierpoolstr;
11855 cmd_getval(cct, cmdmap, "tierpool", tierpoolstr);
11856 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
11857 if (tierpool_id < 0) {
11858 ss << "unrecognized pool '" << tierpoolstr << "'";
11859 err = -ENOENT;
11860 goto reply;
11861 }
11862 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11863 ceph_assert(p);
11864 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11865 ceph_assert(tp);
11866
11867 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
11868 goto reply;
11869 }
11870
11871 // make sure new tier is empty
11872 string force_nonempty;
11873 cmd_getval(cct, cmdmap, "force_nonempty", force_nonempty);
11874 const pool_stat_t *pstats = mon->mgrstatmon()->get_pool_stat(tierpool_id);
11875 if (pstats && pstats->stats.sum.num_objects != 0 &&
11876 force_nonempty != "--force-nonempty") {
11877 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
11878 err = -ENOTEMPTY;
11879 goto reply;
11880 }
11881 if (tp->is_erasure()) {
11882 ss << "tier pool '" << tierpoolstr
11883 << "' is an ec pool, which cannot be a tier";
11884 err = -ENOTSUP;
11885 goto reply;
11886 }
11887 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
11888 ((force_nonempty != "--force-nonempty") ||
11889 (!g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps))) {
11890 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
11891 err = -ENOTEMPTY;
11892 goto reply;
11893 }
11894 // go
11895 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
11896 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
11897 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
11898 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11899 return true;
11900 }
11901 np->tiers.insert(tierpool_id);
11902 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
11903 ntp->tier_of = pool_id;
11904 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
11905 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
11906 get_last_committed() + 1));
11907 return true;
11908 } else if (prefix == "osd tier remove" ||
11909 prefix == "osd tier rm") {
11910 string poolstr;
11911 cmd_getval(cct, cmdmap, "pool", poolstr);
11912 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11913 if (pool_id < 0) {
11914 ss << "unrecognized pool '" << poolstr << "'";
11915 err = -ENOENT;
11916 goto reply;
11917 }
11918 string tierpoolstr;
11919 cmd_getval(cct, cmdmap, "tierpool", tierpoolstr);
11920 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
11921 if (tierpool_id < 0) {
11922 ss << "unrecognized pool '" << tierpoolstr << "'";
11923 err = -ENOENT;
11924 goto reply;
11925 }
11926 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11927 ceph_assert(p);
11928 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11929 ceph_assert(tp);
11930
11931 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
11932 goto reply;
11933 }
11934
11935 if (p->tiers.count(tierpool_id) == 0) {
11936 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
11937 err = 0;
11938 goto reply;
11939 }
11940 if (tp->tier_of != pool_id) {
11941 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
11942 << osdmap.get_pool_name(tp->tier_of) << "': "
11943 // be scary about it; this is an inconsistency and bells must go off
11944 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
11945 err = -EINVAL;
11946 goto reply;
11947 }
11948 if (p->read_tier == tierpool_id) {
11949 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
11950 err = -EBUSY;
11951 goto reply;
11952 }
11953 // go
11954 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
11955 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
11956 if (np->tiers.count(tierpool_id) == 0 ||
11957 ntp->tier_of != pool_id ||
11958 np->read_tier == tierpool_id) {
11959 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11960 return true;
11961 }
11962 np->tiers.erase(tierpool_id);
11963 ntp->clear_tier();
11964 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
11965 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
11966 get_last_committed() + 1));
11967 return true;
11968 } else if (prefix == "osd tier set-overlay") {
11969 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
11970 if (err == -EAGAIN)
11971 goto wait;
11972 if (err)
11973 goto reply;
11974 string poolstr;
11975 cmd_getval(cct, cmdmap, "pool", poolstr);
11976 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11977 if (pool_id < 0) {
11978 ss << "unrecognized pool '" << poolstr << "'";
11979 err = -ENOENT;
11980 goto reply;
11981 }
11982 string overlaypoolstr;
11983 cmd_getval(cct, cmdmap, "overlaypool", overlaypoolstr);
11984 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
11985 if (overlaypool_id < 0) {
11986 ss << "unrecognized pool '" << overlaypoolstr << "'";
11987 err = -ENOENT;
11988 goto reply;
11989 }
11990 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11991 ceph_assert(p);
11992 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
11993 ceph_assert(overlay_p);
11994 if (p->tiers.count(overlaypool_id) == 0) {
11995 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
11996 err = -EINVAL;
11997 goto reply;
11998 }
11999 if (p->read_tier == overlaypool_id) {
12000 err = 0;
12001 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
12002 goto reply;
12003 }
12004 if (p->has_read_tier()) {
12005 ss << "pool '" << poolstr << "' has overlay '"
12006 << osdmap.get_pool_name(p->read_tier)
12007 << "'; please remove-overlay first";
12008 err = -EINVAL;
12009 goto reply;
12010 }
12011
12012 // go
12013 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12014 np->read_tier = overlaypool_id;
12015 np->write_tier = overlaypool_id;
12016 np->set_last_force_op_resend(pending_inc.epoch);
12017 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
12018 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
12019 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
12020 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
12021 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
12022 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12023 get_last_committed() + 1));
12024 return true;
12025 } else if (prefix == "osd tier remove-overlay" ||
12026 prefix == "osd tier rm-overlay") {
12027 string poolstr;
12028 cmd_getval(cct, cmdmap, "pool", poolstr);
12029 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12030 if (pool_id < 0) {
12031 ss << "unrecognized pool '" << poolstr << "'";
12032 err = -ENOENT;
12033 goto reply;
12034 }
12035 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12036 ceph_assert(p);
12037 if (!p->has_read_tier()) {
12038 err = 0;
12039 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
12040 goto reply;
12041 }
12042
12043 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
12044 goto reply;
12045 }
12046
12047 // go
12048 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12049 if (np->has_read_tier()) {
12050 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
12051 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
12052 nop->set_last_force_op_resend(pending_inc.epoch);
12053 }
12054 if (np->has_write_tier()) {
12055 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
12056 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
12057 nop->set_last_force_op_resend(pending_inc.epoch);
12058 }
12059 np->clear_read_tier();
12060 np->clear_write_tier();
12061 np->set_last_force_op_resend(pending_inc.epoch);
12062 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
12063 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12064 get_last_committed() + 1));
12065 return true;
12066 } else if (prefix == "osd tier cache-mode") {
12067 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12068 if (err == -EAGAIN)
12069 goto wait;
12070 if (err)
12071 goto reply;
12072 string poolstr;
12073 cmd_getval(cct, cmdmap, "pool", poolstr);
12074 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12075 if (pool_id < 0) {
12076 ss << "unrecognized pool '" << poolstr << "'";
12077 err = -ENOENT;
12078 goto reply;
12079 }
12080 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12081 ceph_assert(p);
12082 if (!p->is_tier()) {
12083 ss << "pool '" << poolstr << "' is not a tier";
12084 err = -EINVAL;
12085 goto reply;
12086 }
12087 string modestr;
12088 cmd_getval(cct, cmdmap, "mode", modestr);
12089 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
12090 if (mode < 0) {
12091 ss << "'" << modestr << "' is not a valid cache mode";
12092 err = -EINVAL;
12093 goto reply;
12094 }
12095
12096 bool sure = false;
12097 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", sure);
12098
12099 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12100 mode != pg_pool_t::CACHEMODE_NONE &&
12101 mode != pg_pool_t::CACHEMODE_PROXY &&
12102 mode != pg_pool_t::CACHEMODE_READPROXY) &&
12103 !sure) {
12104 ss << "'" << modestr << "' is not a well-supported cache mode and may "
12105 << "corrupt your data. pass --yes-i-really-mean-it to force.";
12106 err = -EPERM;
12107 goto reply;
12108 }
12109
12110 // pool already has this cache-mode set and there are no pending changes
12111 if (p->cache_mode == mode &&
12112 (pending_inc.new_pools.count(pool_id) == 0 ||
12113 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
12114 ss << "set cache-mode for pool '" << poolstr << "'"
12115 << " to " << pg_pool_t::get_cache_mode_name(mode);
12116 err = 0;
12117 goto reply;
12118 }
12119
12120 /* Mode description:
12121 *
12122 * none: No cache-mode defined
12123 * forward: Forward all reads and writes to base pool
12124 * writeback: Cache writes, promote reads from base pool
12125 * readonly: Forward writes to base pool
12126 * readforward: Writes are in writeback mode, Reads are in forward mode
12127 * proxy: Proxy all reads and writes to base pool
12128 * readproxy: Writes are in writeback mode, Reads are in proxy mode
12129 *
12130 * Hence, these are the allowed transitions:
12131 *
12132 * none -> any
12133 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
12134 * proxy -> forward || readforward || readproxy || writeback || any IF num_objects_dirty == 0
12135 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
12136 * readproxy -> forward || proxy || readforward || writeback || any IF num_objects_dirty == 0
12137 * writeback -> readforward || readproxy || forward || proxy
12138 * readonly -> any
12139 */
12140
12141 // We check if the transition is valid against the current pool mode, as
12142 // it is the only committed state thus far. We will blantly squash
12143 // whatever mode is on the pending state.
12144
12145 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
12146 (mode != pg_pool_t::CACHEMODE_FORWARD &&
12147 mode != pg_pool_t::CACHEMODE_PROXY &&
12148 mode != pg_pool_t::CACHEMODE_READFORWARD &&
12149 mode != pg_pool_t::CACHEMODE_READPROXY)) {
12150 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
12151 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
12152 << "' pool; only '"
12153 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_FORWARD)
12154 << "','"
12155 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY)
12156 << "','"
12157 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READFORWARD)
12158 << "','"
12159 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
12160 << "' allowed.";
12161 err = -EINVAL;
12162 goto reply;
12163 }
12164 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
12165 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12166 mode != pg_pool_t::CACHEMODE_FORWARD &&
12167 mode != pg_pool_t::CACHEMODE_PROXY &&
12168 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
12169
12170 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
12171 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12172 mode != pg_pool_t::CACHEMODE_FORWARD &&
12173 mode != pg_pool_t::CACHEMODE_READFORWARD &&
12174 mode != pg_pool_t::CACHEMODE_PROXY)) ||
12175
12176 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
12177 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12178 mode != pg_pool_t::CACHEMODE_FORWARD &&
12179 mode != pg_pool_t::CACHEMODE_READFORWARD &&
12180 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
12181
12182 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
12183 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12184 mode != pg_pool_t::CACHEMODE_READFORWARD &&
12185 mode != pg_pool_t::CACHEMODE_PROXY &&
12186 mode != pg_pool_t::CACHEMODE_READPROXY))) {
12187
12188 const pool_stat_t* pstats =
12189 mon->mgrstatmon()->get_pool_stat(pool_id);
12190
12191 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
12192 ss << "unable to set cache-mode '"
12193 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
12194 << "': dirty objects found";
12195 err = -EBUSY;
12196 goto reply;
12197 }
12198 }
12199 // go
12200 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12201 np->cache_mode = mode;
12202 // set this both when moving to and from cache_mode NONE. this is to
12203 // capture legacy pools that were set up before this flag existed.
12204 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
12205 ss << "set cache-mode for pool '" << poolstr
12206 << "' to " << pg_pool_t::get_cache_mode_name(mode);
12207 if (mode == pg_pool_t::CACHEMODE_NONE) {
12208 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
12209 ceph_assert(base_pool);
12210 if (base_pool->read_tier == pool_id ||
12211 base_pool->write_tier == pool_id)
12212 ss <<" (WARNING: pool is still configured as read or write tier)";
12213 }
12214 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12215 get_last_committed() + 1));
12216 return true;
12217 } else if (prefix == "osd tier add-cache") {
12218 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12219 if (err == -EAGAIN)
12220 goto wait;
12221 if (err)
12222 goto reply;
12223 string poolstr;
12224 cmd_getval(cct, cmdmap, "pool", poolstr);
12225 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12226 if (pool_id < 0) {
12227 ss << "unrecognized pool '" << poolstr << "'";
12228 err = -ENOENT;
12229 goto reply;
12230 }
12231 string tierpoolstr;
12232 cmd_getval(cct, cmdmap, "tierpool", tierpoolstr);
12233 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
12234 if (tierpool_id < 0) {
12235 ss << "unrecognized pool '" << tierpoolstr << "'";
12236 err = -ENOENT;
12237 goto reply;
12238 }
12239 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12240 ceph_assert(p);
12241 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
12242 ceph_assert(tp);
12243
12244 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
12245 goto reply;
12246 }
12247
12248 int64_t size = 0;
12249 if (!cmd_getval(cct, cmdmap, "size", size)) {
12250 ss << "unable to parse 'size' value '"
12251 << cmd_vartype_stringify(cmdmap.at("size")) << "'";
12252 err = -EINVAL;
12253 goto reply;
12254 }
12255 // make sure new tier is empty
12256 const pool_stat_t *pstats =
12257 mon->mgrstatmon()->get_pool_stat(tierpool_id);
12258 if (pstats && pstats->stats.sum.num_objects != 0) {
12259 ss << "tier pool '" << tierpoolstr << "' is not empty";
12260 err = -ENOTEMPTY;
12261 goto reply;
12262 }
12263 auto& modestr = g_conf().get_val<string>("osd_tier_default_cache_mode");
12264 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
12265 if (mode < 0) {
12266 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
12267 err = -EINVAL;
12268 goto reply;
12269 }
12270 HitSet::Params hsp;
12271 auto& cache_hit_set_type =
12272 g_conf().get_val<string>("osd_tier_default_cache_hit_set_type");
12273 if (cache_hit_set_type == "bloom") {
12274 BloomHitSet::Params *bsp = new BloomHitSet::Params;
12275 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
12276 hsp = HitSet::Params(bsp);
12277 } else if (cache_hit_set_type == "explicit_hash") {
12278 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
12279 } else if (cache_hit_set_type == "explicit_object") {
12280 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
12281 } else {
12282 ss << "osd tier cache default hit set type '"
12283 << cache_hit_set_type << "' is not a known type";
12284 err = -EINVAL;
12285 goto reply;
12286 }
12287 // go
12288 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12289 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
12290 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
12291 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12292 return true;
12293 }
12294 np->tiers.insert(tierpool_id);
12295 np->read_tier = np->write_tier = tierpool_id;
12296 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
12297 np->set_last_force_op_resend(pending_inc.epoch);
12298 ntp->set_last_force_op_resend(pending_inc.epoch);
12299 ntp->tier_of = pool_id;
12300 ntp->cache_mode = mode;
12301 ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count");
12302 ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period");
12303 ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
12304 ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
12305 ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
12306 ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
12307 ntp->hit_set_params = hsp;
12308 ntp->target_max_bytes = size;
12309 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
12310 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12311 get_last_committed() + 1));
12312 return true;
12313 } else if (prefix == "osd pool set-quota") {
12314 string poolstr;
12315 cmd_getval(cct, cmdmap, "pool", poolstr);
12316 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12317 if (pool_id < 0) {
12318 ss << "unrecognized pool '" << poolstr << "'";
12319 err = -ENOENT;
12320 goto reply;
12321 }
12322
12323 string field;
12324 cmd_getval(cct, cmdmap, "field", field);
12325 if (field != "max_objects" && field != "max_bytes") {
12326 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
12327 err = -EINVAL;
12328 goto reply;
12329 }
12330
12331 // val could contain unit designations, so we treat as a string
12332 string val;
12333 cmd_getval(cct, cmdmap, "val", val);
12334 string tss;
12335 int64_t value;
12336 if (field == "max_objects") {
12337 value = strict_sistrtoll(val.c_str(), &tss);
12338 } else if (field == "max_bytes") {
12339 value = strict_iecstrtoll(val.c_str(), &tss);
12340 } else {
12341 ceph_abort_msg("unrecognized option");
12342 }
12343 if (!tss.empty()) {
12344 ss << "error parsing value '" << val << "': " << tss;
12345 err = -EINVAL;
12346 goto reply;
12347 }
12348
12349 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
12350 if (field == "max_objects") {
12351 pi->quota_max_objects = value;
12352 } else if (field == "max_bytes") {
12353 pi->quota_max_bytes = value;
12354 } else {
12355 ceph_abort_msg("unrecognized option");
12356 }
12357 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
12358 rs = ss.str();
12359 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12360 get_last_committed() + 1));
12361 return true;
12362 } else if (prefix == "osd pool application enable" ||
12363 prefix == "osd pool application disable" ||
12364 prefix == "osd pool application set" ||
12365 prefix == "osd pool application rm") {
12366 err = prepare_command_pool_application(prefix, cmdmap, ss);
12367 if (err == -EAGAIN) {
12368 goto wait;
12369 } else if (err < 0) {
12370 goto reply;
12371 } else {
12372 goto update;
12373 }
12374 } else if (prefix == "osd force-create-pg") {
12375 pg_t pgid;
12376 string pgidstr;
12377 cmd_getval(cct, cmdmap, "pgid", pgidstr);
12378 if (!pgid.parse(pgidstr.c_str())) {
12379 ss << "invalid pgid '" << pgidstr << "'";
12380 err = -EINVAL;
12381 goto reply;
12382 }
12383 if (!osdmap.pg_exists(pgid)) {
12384 ss << "pg " << pgid << " should not exist";
12385 err = -ENOENT;
12386 goto reply;
12387 }
12388 bool sure = false;
12389 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", sure);
12390 if (!sure) {
12391 ss << "This command will recreate a lost (as in data lost) PG with data in it, such "
12392 << "that the cluster will give up ever trying to recover the lost data. Do this "
12393 << "only if you are certain that all copies of the PG are in fact lost and you are "
12394 << "willing to accept that the data is permanently destroyed. Pass "
12395 << "--yes-i-really-mean-it to proceed.";
12396 err = -EPERM;
12397 goto reply;
12398 }
12399 bool creating_now;
12400 {
12401 std::lock_guard<std::mutex> l(creating_pgs_lock);
12402 auto emplaced = creating_pgs.pgs.emplace(pgid,
12403 make_pair(osdmap.get_epoch(),
12404 ceph_clock_now()));
12405 creating_now = emplaced.second;
12406 }
12407 if (creating_now) {
12408 ss << "pg " << pgidstr << " now creating, ok";
12409 // set the pool's CREATING flag so that (1) the osd won't ignore our
12410 // create message and (2) we won't propose any future pg_num changes
12411 // until after the PG has been instantiated.
12412 if (pending_inc.new_pools.count(pgid.pool()) == 0) {
12413 pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool());
12414 }
12415 pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING;
12416 err = 0;
12417 goto update;
12418 } else {
12419 ss << "pg " << pgid << " already creating";
12420 err = 0;
12421 goto reply;
12422 }
12423 } else {
12424 err = -EINVAL;
12425 }
12426
12427 reply:
12428 getline(ss, rs);
12429 if (err < 0 && rs.length() == 0)
12430 rs = cpp_strerror(err);
12431 mon->reply_command(op, err, rs, rdata, get_last_committed());
12432 return ret;
12433
12434 update:
12435 getline(ss, rs);
12436 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12437 get_last_committed() + 1));
12438 return true;
12439
12440 wait:
12441 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12442 return true;
12443 }
12444
12445 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
12446 {
12447 op->mark_osdmon_event(__func__);
12448
12449 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
12450 MonSession *session = op->get_session();
12451 if (!session) {
12452 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
12453 return true;
12454 }
12455
12456 switch (m->op) {
12457 case POOL_OP_CREATE_UNMANAGED_SNAP:
12458 case POOL_OP_DELETE_UNMANAGED_SNAP:
12459 {
12460 const std::string* pool_name = nullptr;
12461 const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
12462 if (pg_pool != nullptr) {
12463 pool_name = &osdmap.get_pool_name(m->pool);
12464 }
12465
12466 if (!is_unmanaged_snap_op_permitted(cct, mon->key_server,
12467 session->entity_name, session->caps,
12468 session->get_peer_socket_addr(),
12469 pool_name)) {
12470 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
12471 << "privileges. message: " << *m << std::endl
12472 << "caps: " << session->caps << dendl;
12473 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
12474 return true;
12475 }
12476 }
12477 break;
12478 default:
12479 if (!session->is_capable("osd", MON_CAP_W)) {
12480 dout(0) << "got pool op from entity with insufficient privileges. "
12481 << "message: " << *m << std::endl
12482 << "caps: " << session->caps << dendl;
12483 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
12484 return true;
12485 }
12486 break;
12487 }
12488
12489 return false;
12490 }
12491
12492 bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
12493 {
12494 op->mark_osdmon_event(__func__);
12495 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
12496
12497 if (enforce_pool_op_caps(op)) {
12498 return true;
12499 }
12500
12501 if (m->fsid != mon->monmap->fsid) {
12502 dout(0) << __func__ << " drop message on fsid " << m->fsid
12503 << " != " << mon->monmap->fsid << " for " << *m << dendl;
12504 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
12505 return true;
12506 }
12507
12508 if (m->op == POOL_OP_CREATE)
12509 return preprocess_pool_op_create(op);
12510
12511 const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
12512 if (p == nullptr) {
12513 dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
12514 if (m->op == POOL_OP_DELETE) {
12515 _pool_op_reply(op, 0, osdmap.get_epoch());
12516 } else {
12517 _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
12518 }
12519 return true;
12520 }
12521
12522 // check if the snap and snapname exist
12523 bool snap_exists = false;
12524 if (p->snap_exists(m->name.c_str()))
12525 snap_exists = true;
12526
12527 switch (m->op) {
12528 case POOL_OP_CREATE_SNAP:
12529 if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
12530 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
12531 return true;
12532 }
12533 if (snap_exists) {
12534 _pool_op_reply(op, 0, osdmap.get_epoch());
12535 return true;
12536 }
12537 return false;
12538 case POOL_OP_CREATE_UNMANAGED_SNAP:
12539 if (p->is_pool_snaps_mode()) {
12540 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
12541 return true;
12542 }
12543 return false;
12544 case POOL_OP_DELETE_SNAP:
12545 if (p->is_unmanaged_snaps_mode()) {
12546 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
12547 return true;
12548 }
12549 if (!snap_exists) {
12550 _pool_op_reply(op, 0, osdmap.get_epoch());
12551 return true;
12552 }
12553 return false;
12554 case POOL_OP_DELETE_UNMANAGED_SNAP:
12555 if (p->is_pool_snaps_mode()) {
12556 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
12557 return true;
12558 }
12559 if (p->is_removed_snap(m->snapid)) {
12560 _pool_op_reply(op, 0, osdmap.get_epoch());
12561 return true;
12562 }
12563 return false;
12564 case POOL_OP_DELETE:
12565 if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
12566 _pool_op_reply(op, 0, osdmap.get_epoch());
12567 return true;
12568 }
12569 return false;
12570 case POOL_OP_AUID_CHANGE:
12571 return false;
12572 default:
12573 ceph_abort();
12574 break;
12575 }
12576
12577 return false;
12578 }
12579
12580 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
12581 {
12582 op->mark_osdmon_event(__func__);
12583 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
12584 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
12585 if (pool >= 0) {
12586 _pool_op_reply(op, 0, osdmap.get_epoch());
12587 return true;
12588 }
12589
12590 return false;
12591 }
12592
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  // Handle a pool op that mutates the map (snapshot create/delete, etc.).
  // Returns true if a proposal was queued (the reply is sent once it
  // commits), false if we replied immediately without changing anything.
  op->mark_osdmon_event(__func__);
  MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
  dout(10) << "prepare_pool_op " << *m << dendl;
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  // First validate against the *committed* map so idempotent or invalid
  // requests can be answered right away, without waiting for a proposal.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // Pool snapshots are not allowed on cache tiers.
    if (pool->is_tier()) {
      ret = -EINVAL;
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;
    } // else, fall through
  case POOL_OP_DELETE_SNAP:
    if (!pool->is_unmanaged_snaps_mode()) {
      // Idempotency: creating an already-existing snap, or deleting a
      // non-existent one, is a successful no-op.
      bool snap_exists = pool->snap_exists(m->name.c_str());
      if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
	|| (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
	ret = 0;
      } else {
	break;  // a real map change is needed; fall out of the switch
      }
    } else {
      // Pool snaps are mutually exclusive with unmanaged snaps mode.
      ret = -EINVAL;
    }
    _pool_op_reply(op, ret, osdmap.get_epoch());
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // we won't allow removal of an unmanaged snapshot from a pool
    // not in unmanaged snaps mode.
    if (!pool->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
      return false;
    }
    /* fall-thru */
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // but we will allow creating an unmanaged snapshot on any pool
    // as long as it is not in 'pool' snaps mode.
    if (pool->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return false;
    }
  }

  // projected pool info: the committed pool overlaid with any changes
  // already queued in this pending increment.
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // pool snaps vs unmanaged snaps are mutually exclusive
  // (re-checked against the projected state, which may differ from the
  // committed map validated above)
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // Apply the op to the projected pool.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name
	       << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
	pp.remove_snap(s);
	pending_inc.new_removed_snaps[m->pool].insert(s);
	changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      uint64_t snapid;
      pp.add_unmanaged_snap(snapid);  // fills in the newly issued snapid
      encode(snapid, reply_data);     // returned to the client in the reply
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (!pp.is_removed_snap(m->snapid)) {
      if (m->snapid > pp.get_snap_seq()) {
	// snapid was never issued for this pool; nothing to delete
	_pool_op_reply(op, -ENOENT, osdmap.get_epoch());
	return false;
      }
      pp.remove_unmanaged_snap(m->snapid);
      pending_inc.new_removed_snaps[m->pool].insert(m->snapid);
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    // auid support has been removed; reject
    _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
    return false;

  default:
    ceph_abort();
    break;
  }

  if (changed) {
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
12740
12741 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
12742 {
12743 op->mark_osdmon_event(__func__);
12744 int err = prepare_new_pool(op);
12745 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
12746 return true;
12747 }
12748
12749 int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
12750 ostream *ss)
12751 {
12752 const string& poolstr = osdmap.get_pool_name(pool_id);
12753
12754 // If the Pool is in use by CephFS, refuse to delete it
12755 FSMap const &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
12756 if (pending_fsmap.pool_in_use(pool_id)) {
12757 *ss << "pool '" << poolstr << "' is in use by CephFS";
12758 return -EBUSY;
12759 }
12760
12761 if (pool.tier_of >= 0) {
12762 *ss << "pool '" << poolstr << "' is a tier of '"
12763 << osdmap.get_pool_name(pool.tier_of) << "'";
12764 return -EBUSY;
12765 }
12766 if (!pool.tiers.empty()) {
12767 *ss << "pool '" << poolstr << "' has tiers";
12768 for(auto tier : pool.tiers) {
12769 *ss << " " << osdmap.get_pool_name(tier);
12770 }
12771 return -EBUSY;
12772 }
12773
12774 if (!g_conf()->mon_allow_pool_delete) {
12775 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
12776 return -EPERM;
12777 }
12778
12779 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
12780 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
12781 return -EPERM;
12782 }
12783
12784 *ss << "pool '" << poolstr << "' removed";
12785 return 0;
12786 }
12787
12788 /**
12789 * Check if it is safe to add a tier to a base pool
12790 *
12791 * @return
12792 * True if the operation should proceed, false if we should abort here
12793 * (abort doesn't necessarily mean error, could be idempotency)
12794 */
12795 bool OSDMonitor::_check_become_tier(
12796 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
12797 const int64_t base_pool_id, const pg_pool_t *base_pool,
12798 int *err,
12799 ostream *ss) const
12800 {
12801 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
12802 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
12803
12804 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
12805 if (pending_fsmap.pool_in_use(tier_pool_id)) {
12806 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
12807 *err = -EBUSY;
12808 return false;
12809 }
12810
12811 if (base_pool->tiers.count(tier_pool_id)) {
12812 ceph_assert(tier_pool->tier_of == base_pool_id);
12813 *err = 0;
12814 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
12815 << base_pool_name << "'";
12816 return false;
12817 }
12818
12819 if (base_pool->is_tier()) {
12820 *ss << "pool '" << base_pool_name << "' is already a tier of '"
12821 << osdmap.get_pool_name(base_pool->tier_of) << "', "
12822 << "multiple tiers are not yet supported.";
12823 *err = -EINVAL;
12824 return false;
12825 }
12826
12827 if (tier_pool->has_tiers()) {
12828 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
12829 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
12830 it != tier_pool->tiers.end(); ++it)
12831 *ss << "'" << osdmap.get_pool_name(*it) << "',";
12832 *ss << " multiple tiers are not yet supported.";
12833 *err = -EINVAL;
12834 return false;
12835 }
12836
12837 if (tier_pool->is_tier()) {
12838 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
12839 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
12840 *err = -EINVAL;
12841 return false;
12842 }
12843
12844 *err = 0;
12845 return true;
12846 }
12847
12848
12849 /**
12850 * Check if it is safe to remove a tier from this base pool
12851 *
12852 * @return
12853 * True if the operation should proceed, false if we should abort here
12854 * (abort doesn't necessarily mean error, could be idempotency)
12855 */
12856 bool OSDMonitor::_check_remove_tier(
12857 const int64_t base_pool_id, const pg_pool_t *base_pool,
12858 const pg_pool_t *tier_pool,
12859 int *err, ostream *ss) const
12860 {
12861 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
12862
12863 // Apply CephFS-specific checks
12864 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
12865 if (pending_fsmap.pool_in_use(base_pool_id)) {
12866 if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
12867 // If the underlying pool is erasure coded and does not allow EC
12868 // overwrites, we can't permit the removal of the replicated tier that
12869 // CephFS relies on to access it
12870 *ss << "pool '" << base_pool_name <<
12871 "' does not allow EC overwrites and is in use by CephFS"
12872 " via its tier";
12873 *err = -EBUSY;
12874 return false;
12875 }
12876
12877 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
12878 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
12879 "tier is still in use as a writeback cache. Change the cache "
12880 "mode and flush the cache before removing it";
12881 *err = -EBUSY;
12882 return false;
12883 }
12884 }
12885
12886 *err = 0;
12887 return true;
12888 }
12889
int OSDMonitor::_prepare_remove_pool(
  int64_t pool, ostream *ss, bool no_fake)
{
  // Queue deletion of a pool in the pending incremental map, after
  // verifying it is safe to delete, and clean up all auxiliary per-pg
  // state that references it (pg_temp, primary_temp, upmaps, crush
  // choose_args).  Returns 0 on success (including the already-pending
  // and fake-delete cases) or a negative errno.
  dout(10) << __func__ << " " << pool << dendl;
  const pg_pool_t *p = osdmap.get_pg_pool(pool);
  int r = _check_remove_pool(pool, *p, ss);
  if (r < 0)
    return r;

  // Also re-check against the projected (pending) version of the pool,
  // if one exists.
  auto new_pool = pending_inc.new_pools.find(pool);
  if (new_pool != pending_inc.new_pools.end()) {
    // if there is a problem with the pending info, wait and retry
    // this op.
    const auto& p = new_pool->second;
    int r = _check_remove_pool(pool, p, ss);
    if (r < 0)
      return -EAGAIN;
  }

  // Idempotency: removal is already queued in this increment.
  if (pending_inc.old_pools.count(pool)) {
    dout(10) << __func__ << " " << pool << " already pending removal"
	     << dendl;
    return 0;
  }

  // "Fake" deletion: rename the pool out of the way instead of destroying
  // any data, so an accidental delete remains recoverable.
  if (g_conf()->mon_fake_pool_delete && !no_fake) {
    string old_name = osdmap.get_pool_name(pool);
    string new_name = old_name + "." + stringify(pool) + ".DELETED";
    dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
	    << old_name << " -> " << new_name << dendl;
    pending_inc.new_pool_names[pool] = new_name;
    return 0;
  }

  // remove
  pending_inc.old_pools.insert(pool);

  // remove any pg_temp mappings for this pool
  for (auto p = osdmap.pg_temp->begin();
       p != osdmap.pg_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
	       << p->first << dendl;
      pending_inc.new_pg_temp[p->first].clear();
    }
  }
  // remove any primary_temp mappings for this pool
  for (auto p = osdmap.primary_temp->begin();
       p != osdmap.primary_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool
	       << " removing obsolete primary_temp" << p->first << dendl;
      pending_inc.new_primary_temp[p->first] = -1;
    }
  }
  // remove any pg_upmap mappings for this pool (committed map)
  for (auto& p : osdmap.pg_upmap) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
	       << " removing obsolete pg_upmap "
	       << p.first << dendl;
      pending_inc.old_pg_upmap.insert(p.first);
    }
  }
  // remove any pending pg_upmap mappings for this pool (not yet committed)
  {
    auto it = pending_inc.new_pg_upmap.begin();
    while (it != pending_inc.new_pg_upmap.end()) {
      if (it->first.pool() == pool) {
	dout(10) << __func__ << " " << pool
		 << " removing pending pg_upmap "
		 << it->first << dendl;
	it = pending_inc.new_pg_upmap.erase(it);
      } else {
	it++;
      }
    }
  }
  // remove any pg_upmap_items mappings for this pool (committed map)
  for (auto& p : osdmap.pg_upmap_items) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
	       << " removing obsolete pg_upmap_items " << p.first
	       << dendl;
      pending_inc.old_pg_upmap_items.insert(p.first);
    }
  }
  // remove any pending pg_upmap_items mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap_items.begin();
    while (it != pending_inc.new_pg_upmap_items.end()) {
      if (it->first.pool() == pool) {
	dout(10) << __func__ << " " << pool
		 << " removing pending pg_upmap_items "
		 << it->first << dendl;
	it = pending_inc.new_pg_upmap_items.erase(it);
      } else {
	it++;
      }
    }
  }

  // remove any choose_args for this pool
  CrushWrapper newcrush;
  _get_pending_crush(newcrush);
  if (newcrush.have_choose_args(pool)) {
    dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
    newcrush.rm_choose_args(pool);
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
  }
  return 0;
}
13005
13006 int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
13007 {
13008 dout(10) << "_prepare_rename_pool " << pool << dendl;
13009 if (pending_inc.old_pools.count(pool)) {
13010 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
13011 return -ENOENT;
13012 }
13013 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
13014 p != pending_inc.new_pool_names.end();
13015 ++p) {
13016 if (p->second == newname && p->first != pool) {
13017 return -EEXIST;
13018 }
13019 }
13020
13021 pending_inc.new_pool_names[pool] = newname;
13022 return 0;
13023 }
13024
13025 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
13026 {
13027 op->mark_osdmon_event(__func__);
13028 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
13029 ostringstream ss;
13030 int ret = _prepare_remove_pool(m->pool, &ss, false);
13031 if (ret == -EAGAIN) {
13032 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13033 return true;
13034 }
13035 if (ret < 0)
13036 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
13037 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
13038 pending_inc.epoch));
13039 return true;
13040 }
13041
13042 void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
13043 int ret, epoch_t epoch, bufferlist *blp)
13044 {
13045 op->mark_osdmon_event(__func__);
13046 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
13047 dout(20) << "_pool_op_reply " << ret << dendl;
13048 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
13049 ret, epoch, get_last_committed(), blp);
13050 mon->send_reply(op, reply);
13051 }
13052
void OSDMonitor::convert_pool_priorities(void)
{
  // Rescale per-pool "recovery_priority" values so they all fit inside
  // [OSD_POOL_PRIORITY_MIN, OSD_POOL_PRIORITY_MAX], preserving each
  // pool's priority relative to the most extreme value in use.
  pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key;
  int64_t max_prio = 0;
  int64_t min_prio = 0;
  // First pass: find the extreme priorities currently set on any pool.
  for (const auto &i : osdmap.get_pools()) {
    const auto &pool = i.second;

    if (pool.opts.is_set(key)) {
      int64_t prio;
      pool.opts.get(key, &prio);
      if (prio > max_prio)
	max_prio = prio;
      if (prio < min_prio)
	min_prio = prio;
    }
  }
  if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) {
    dout(20) << __func__ << " nothing to fix" << dendl;
    return;
  }
  // Current pool priorities exceeds new maximum
  // Second pass: scale each out-of-range priority proportionally.
  for (const auto &i : osdmap.get_pools()) {
    const auto pool_id = i.first;
    pg_pool_t pool = i.second;

    int64_t prio = 0;
    pool.opts.get(key, &prio);  // prio stays 0 if the option is unset
    int64_t n;

    if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario
      // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
      n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX;
    } else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) {
      // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
      n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN;
    } else {
      // In-range (or unset) priority needs no adjustment.
      continue;
    }
    if (n == 0) {
      // A priority that scales to zero is equivalent to the default;
      // drop the option entirely.
      pool.opts.unset(key);
    } else {
      pool.opts.set(key, static_cast<int64_t>(n));
    }
    dout(10) << __func__ << " pool " << pool_id
	     << " recovery_priority adjusted "
	     << prio << " to " << n << dendl;
    pool.last_change = pending_inc.epoch;
    pending_inc.new_pools[pool_id] = pool;
  }
}