// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2017 OVH
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include "acconfig.h"

#include <cctype>
#include <fstream>
#include <iostream>
#include <iterator>

#include <unistd.h>
#include <sys/stat.h>
#include <signal.h>
#include <time.h>
#include <boost/range/adaptor/reversed.hpp>

#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif

#ifdef HAVE_SYS_MOUNT_H
#include <sys/mount.h>
#endif

#include "osd/PG.h"
#include "osd/scrubber/scrub_machine.h"
#include "osd/scrubber/pg_scrubber.h"

#include "include/types.h"
#include "include/compat.h"
#include "include/random.h"
#include "include/scope_guard.h"

#include "OSD.h"
#include "OSDMap.h"
#include "Watch.h"
#include "osdc/Objecter.h"

#include "common/errno.h"
#include "common/ceph_argparse.h"
#include "common/ceph_releases.h"
#include "common/ceph_time.h"
#include "common/version.h"
#include "common/async/blocked_completion.h"
#include "common/pick_address.h"
#include "common/blkdev.h"
#include "common/numa.h"

#include "os/ObjectStore.h"
#ifdef HAVE_LIBFUSE
#include "os/FuseStore.h"
#endif

#include "PrimaryLogPG.h"

#include "msg/Messenger.h"
#include "msg/Message.h"

#include "mon/MonClient.h"

#include "messages/MLog.h"

#include "messages/MGenericMessage.h"
#include "messages/MOSDPing.h"
#include "messages/MOSDFailure.h"
#include "messages/MOSDMarkMeDown.h"
#include "messages/MOSDMarkMeDead.h"
#include "messages/MOSDFull.h"
#include "messages/MOSDOp.h"
#include "messages/MOSDOpReply.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDBeacon.h"
#include "messages/MOSDRepOp.h"
#include "messages/MOSDRepOpReply.h"
#include "messages/MOSDBoot.h"
#include "messages/MOSDPGTemp.h"
#include "messages/MOSDPGReadyToMerge.h"

#include "messages/MOSDMap.h"
#include "messages/MMonGetOSDMap.h"
#include "messages/MOSDPGNotify.h"
#include "messages/MOSDPGNotify2.h"
#include "messages/MOSDPGQuery2.h"
#include "messages/MOSDPGLog.h"
#include "messages/MOSDPGRemove.h"
#include "messages/MOSDPGInfo.h"
#include "messages/MOSDPGInfo2.h"
#include "messages/MOSDPGCreate.h"
#include "messages/MOSDPGCreate2.h"
#include "messages/MBackfillReserve.h"
#include "messages/MRecoveryReserve.h"
#include "messages/MOSDForceRecovery.h"
#include "messages/MOSDECSubOpWrite.h"
#include "messages/MOSDECSubOpWriteReply.h"
#include "messages/MOSDECSubOpRead.h"
#include "messages/MOSDECSubOpReadReply.h"
#include "messages/MOSDPGCreated.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"

#include "messages/MOSDPeeringOp.h"

#include "messages/MOSDAlive.h"

#include "messages/MOSDScrub.h"
#include "messages/MOSDScrub2.h"

#include "messages/MCommand.h"
#include "messages/MCommandReply.h"

#include "messages/MPGStats.h"

#include "messages/MMonGetPurgedSnaps.h"
#include "messages/MMonGetPurgedSnapsReply.h"

#include "common/perf_counters.h"
#include "common/Timer.h"
#include "common/LogClient.h"
#include "common/AsyncReserver.h"
#include "common/HeartbeatMap.h"
#include "common/admin_socket.h"
#include "common/ceph_context.h"

#include "global/signal_handler.h"
#include "global/pidfile.h"

#include "include/color.h"
#include "perfglue/cpu_profiler.h"
#include "perfglue/heap_profiler.h"

#include "osd/ClassHandler.h"
#include "osd/OpRequest.h"

#include "auth/AuthAuthorizeHandler.h"
#include "auth/RotatingKeyRing.h"

#include "objclass/objclass.h"

#include "common/cmdparse.h"
#include "include/str_list.h"
#include "include/util.h"

#include "include/ceph_assert.h"
#include "common/config.h"
#include "common/EventTrace.h"

#include "json_spirit/json_spirit_reader.h"
#include "json_spirit/json_spirit_writer.h"

#ifdef WITH_LTTNG
#define TRACEPOINT_DEFINE
#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#include "tracing/osd.h"
#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#undef TRACEPOINT_DEFINE
#else
#define tracepoint(...)
#endif

#include "osd_tracer.h"


#define dout_context cct
#define dout_subsys ceph_subsys_osd
#undef dout_prefix
#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())

using std::deque;
using std::list;
using std::lock_guard;
using std::make_pair;
using std::make_tuple;
using std::make_unique;
using std::map;
using std::ostream;
using std::ostringstream;
using std::pair;
using std::set;
using std::string;
using std::stringstream;
using std::to_string;
using std::unique_ptr;
using std::vector;

using ceph::bufferlist;
using ceph::bufferptr;
using ceph::decode;
using ceph::encode;
using ceph::fixed_u_to_string;
using ceph::Formatter;
using ceph::heartbeat_handle_d;
using ceph::make_mutex;

using namespace ceph::osd::scheduler;
using TOPNSPC::common::cmd_getval;
using TOPNSPC::common::cmd_getval_or;

static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
  return *_dout << "osd." << whoami << " " << epoch << " ";
}


// Initial features in new superblock.
// Features here are also automatically upgraded
CompatSet OSD::get_osd_initial_compat_set() {
  CompatSet::FeatureSet ceph_osd_feature_compat;
  CompatSet::FeatureSet ceph_osd_feature_ro_compat;
  CompatSet::FeatureSet ceph_osd_feature_incompat;
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2);
  return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
                   ceph_osd_feature_incompat);
}

// Features are added here that this OSD supports.
CompatSet OSD::get_osd_compat_set() {
  CompatSet compat = get_osd_initial_compat_set();
  // Any features here can be set in code, but not in initial superblock
  compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
  return compat;
}

OSDService::OSDService(OSD *osd, ceph::async::io_context_pool& poolctx) :
  osd(osd),
  cct(osd->cct),
  whoami(osd->whoami), store(osd->store.get()),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  osd_max_object_size(cct->_conf, "osd_max_object_size"),
  osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
  publish_lock{ceph::make_mutex("OSDService::publish_lock")},
  pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
  max_oldest_map(0),
  m_scrub_queue{cct, *this},
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  poolctx(poolctx),
  objecter(make_unique<Objecter>(osd->client_messenger->cct,
                                 osd->objecter_messenger,
                                 osd->monc, poolctx)),
  m_objecter_finishers(cct->_conf->osd_objecter_finishers),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_timer(cct, recovery_request_lock, false),
  sleep_timer(cct, sleep_lock, false),
  reserver_finisher(cct),
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
                 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
                  cct->_conf->osd_min_recovery_priority),
  snap_reserver(cct, &reserver_finisher,
                cct->_conf->osd_max_trimming_pgs),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  cur_state(NONE),
  cur_ratio(0), physical_ratio(0),
  boot_epoch(0), up_epoch(0), bind_epoch(0)
{
  objecter->init();

  for (int i = 0; i < m_objecter_finishers; i++) {
    ostringstream str;
    str << "objecter-finisher-" << i;
    auto fin = make_unique<Finisher>(osd->client_messenger->cct, str.str(), "finisher");
    objecter_finishers.push_back(std::move(fin));
  }
}

#ifdef PG_DEBUG_REFS
void OSDService::add_pgid(spg_t pgid, PG *pg) {
  std::lock_guard l(pgid_lock);
  if (!pgid_tracker.count(pgid)) {
    live_pgs[pgid] = pg;
  }
  pgid_tracker[pgid]++;
}
void OSDService::remove_pgid(spg_t pgid, PG *pg)
{
  std::lock_guard l(pgid_lock);
  ceph_assert(pgid_tracker.count(pgid));
  ceph_assert(pgid_tracker[pgid] > 0);
  pgid_tracker[pgid]--;
  if (pgid_tracker[pgid] == 0) {
    pgid_tracker.erase(pgid);
    live_pgs.erase(pgid);
  }
}
void OSDService::dump_live_pgids()
{
  std::lock_guard l(pgid_lock);
  derr << "live pgids:" << dendl;
  for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
       i != pgid_tracker.cend();
       ++i) {
    derr << "\t" << *i << dendl;
    live_pgs[i->first]->dump_live_ids();
  }
}
#endif


ceph::signedspan OSDService::get_mnow()
{
  return ceph::mono_clock::now() - osd->startup_time;
}

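// Scan the recorded pg_num history for this PG's pool across
// (old_map, new_map] and collect every split child and merge
// participant we may need to instantiate. Illustrative example (not
// taken from the code): with pg_num 4 -> 8, pg x.1 gains split child
// x.5; with pg_num 8 -> 4, x.5 is a merge source and x.1 its target.
// The BFS over 'queue' re-scans parents/children discovered along the
// way so that intervening split/merge sequences are not missed.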
void OSDService::identify_splits_and_merges(
  OSDMapRef old_map,
  OSDMapRef new_map,
  spg_t pgid,
  set<pair<spg_t,epoch_t>> *split_children,
  set<pair<spg_t,epoch_t>> *merge_pgs)
{
  if (!old_map->have_pg_pool(pgid.pool())) {
    return;
  }
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
  if (p == osd->pg_num_history.pg_nums.end()) {
    return;
  }
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
           << " to e" << new_map->get_epoch()
           << " pg_nums " << p->second << dendl;
  deque<spg_t> queue;
  queue.push_back(pgid);
  set<spg_t> did;
  while (!queue.empty()) {
    auto cur = queue.front();
    queue.pop_front();
    did.insert(cur);
    unsigned pgnum = old_pgnum;
    for (auto q = p->second.lower_bound(old_map->get_epoch());
         q != p->second.end() &&
           q->first <= new_map->get_epoch();
         ++q) {
      if (pgnum < q->second) {
        // split?
        if (cur.ps() < pgnum) {
          set<spg_t> children;
          if (cur.is_split(pgnum, q->second, &children)) {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " children " << children << dendl;
            for (auto i : children) {
              split_children->insert(make_pair(i, q->first));
              if (!did.count(i))
                queue.push_back(i);
            }
          }
        } else if (cur.ps() < q->second) {
          dout(20) << __func__ << " " << cur << " e" << q->first
                   << " pg_num " << pgnum << " -> " << q->second
                   << " is a child" << dendl;
          // normally we'd capture this from the parent, but it's
          // possible the parent doesn't exist yet (it will be
          // fabricated to allow an intervening merge). note this PG
          // as a split child here to be sure we catch it.
          split_children->insert(make_pair(cur, q->first));
        } else {
          dout(20) << __func__ << " " << cur << " e" << q->first
                   << " pg_num " << pgnum << " -> " << q->second
                   << " is post-split, skipping" << dendl;
        }
      } else if (merge_pgs) {
        // merge?
        if (cur.ps() >= q->second) {
          if (cur.ps() < pgnum) {
            spg_t parent;
            if (cur.is_merge_source(pgnum, q->second, &parent)) {
              set<spg_t> children;
              parent.is_split(q->second, pgnum, &children);
              dout(20) << __func__ << " " << cur << " e" << q->first
                       << " pg_num " << pgnum << " -> " << q->second
                       << " is merge source, target " << parent
                       << ", source(s) " << children << dendl;
              merge_pgs->insert(make_pair(parent, q->first));
              if (!did.count(parent)) {
                // queue (and re-scan) parent in case it might not exist yet
                // and there are some future splits pending on it
                queue.push_back(parent);
              }
              for (auto c : children) {
                merge_pgs->insert(make_pair(c, q->first));
                if (!did.count(c))
                  queue.push_back(c);
              }
            }
          } else {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " is beyond old pgnum, skipping" << dendl;
          }
        } else {
          set<spg_t> children;
          if (cur.is_split(q->second, pgnum, &children)) {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " is merge target, source " << children << dendl;
            for (auto c : children) {
              merge_pgs->insert(make_pair(c, q->first));
              if (!did.count(c))
                queue.push_back(c);
            }
            merge_pgs->insert(make_pair(cur, q->first));
          }
        }
      }
      pgnum = q->second;
    }
  }
}

void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}

HeartbeatStampsRef OSDService::get_hb_stamps(unsigned peer)
{
  std::lock_guard l(hb_stamp_lock);
  if (peer >= hb_stamps.size()) {
    hb_stamps.resize(peer + 1);
  }
  if (!hb_stamps[peer]) {
    hb_stamps[peer] = ceph::make_ref<HeartbeatStamps>(peer);
  }
  return hb_stamps[peer];
}

void OSDService::queue_renew_lease(epoch_t epoch, spg_t spgid)
{
  osd->enqueue_peering_evt(
    spgid,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
        epoch, epoch,
        RenewLease())));
}

void OSDService::start_shutdown()
{
  {
    std::lock_guard l(agent_timer_lock);
    agent_timer.shutdown();
  }

  {
    std::lock_guard l(sleep_lock);
    sleep_timer.shutdown();
  }

  {
    std::lock_guard l(recovery_request_lock);
    recovery_request_timer.shutdown();
  }
}

void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}

void OSDService::shutdown()
{
  mono_timer.suspend();

  {
    std::lock_guard l(watch_lock);
    watch_timer.shutdown();
  }

  objecter->shutdown();
  for (auto& f : objecter_finishers) {
    f->wait_for_empty();
    f->stop();
  }

  publish_map(OSDMapRef());
  next_osdmap = OSDMapRef();
}

void OSDService::init()
{
  reserver_finisher.start();
  for (auto& f : objecter_finishers) {
    f->start();
  }
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();
  mono_timer.resume();

  agent_thread.create("osd_srv_agent");

  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}

void OSDService::final_init()
{
  objecter->start(osdmap.get());
}

void OSDService::activate_map()
{
  // wake/unwake the tiering agent
  std::lock_guard l{agent_lock};
  agent_active =
    !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
    osd->is_active();
  agent_cond.notify_all();
}

void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}


class AgentTimeoutCB : public Context {
  PGRef pg;
public:
  explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
  void finish(int) override {
    pg->agent_choose_mode_restart();
  }
};

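// Tiering agent main loop: repeatedly pick the highest "evict effort"
// tier from agent_queue and let one of its PGs do flush/evict work,
// sleeping on agent_cond whenever the queue is empty, the op budget is
// exhausted, or the agent has been deactivated (e.g. via the
// CEPH_OSDMAP_NOTIERAGENT flag; see activate_map() above).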
void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock agent_locker{agent_lock};

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.wait(agent_locker);
      continue;
    }
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
             << " tiers " << agent_queue.size()
             << ", top is " << level
             << " with pgs " << top.size()
             << ", ops " << agent_ops << "/"
             << cct->_conf->osd_agent_max_ops
             << (agent_active ? " active" : " NOT ACTIVE")
             << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.wait(agent_locker);
      continue;
    }

    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
             << " agent_ops " << agent_ops
             << " flush_quota " << agent_flush_quota << dendl;
    agent_locker.unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->pg_id
               << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
               << " seconds" << dendl;

      logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg after
      // osd_agent_delay_time seconds
      std::lock_guard timer_locker{agent_timer_lock};
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
    }
    agent_locker.lock();
  }
  dout(10) << __func__ << " finish" << dendl;
}

void OSDService::agent_stop()
{
  {
    std::lock_guard l(agent_lock);

    // By this time all ops should be cancelled
    ceph_assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
      ceph_abort_msg("agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.notify_all();
  }
  agent_thread.join();
}

// -------------------------------------

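// Recompute promote_probability_millis (promotion probability in
// thousandths) from the configured object/byte rate targets. A rough
// worked example with illustrative numbers: for
// osd_tier_promote_max_objects_sec = 100, dur = 1s and 1000 promote
// attempts in the interval, po = 100 * 1 * 1000 / 1000 = 100, i.e. a
// 10% promotion probability before skew correction and smoothing.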
void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
           << obj << " objects and " << byte_u_t(bytes) << "; target "
           << target_obj_sec << " obj/sec or "
           << byte_u_t(target_bytes_sec) << "/sec"
           << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = std::max<uint64_t>(bytes / obj, 1);
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
             << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = std::min(po, pb);
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;
  } else {
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = std::max(new_prob, min_prob);
  new_prob = std::min(new_prob, 1000u);

  // adjust
  prob = (prob + new_prob) / 2;
  prob = std::max(prob, min_prob);
  prob = std::min(prob, 1000u);
  dout(10) << __func__ << " actual " << actual
           << ", actual/prob ratio " << ratio
           << ", adjusted new_prob " << new_prob
           << ", prob " << promote_probability_millis << " -> " << prob
           << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
}

// -------------------------------------

float OSDService::get_failsafe_full_ratio()
{
  float full_ratio = cct->_conf->osd_failsafe_full_ratio;
  if (full_ratio > 1.0) full_ratio /= 100.0;
  return full_ratio;
}

OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
{
  // The OSDMap ratios take precedence. So if the failsafe is .95 and
  // the admin sets the cluster full to .96, the failsafe moves up to .96
  // too. (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the cluster appears full.)
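  // Illustrative example of the clamping below: with a failsafe of .97
  // and OSDMap ratios full=.95, backfillfull=.90, nearfull=.85, we end
  // up with nearfull <= backfillfull <= full <= failsafe; raising any
  // OSDMap ratio above the failsafe drags the failsafe up with it.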
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
    return NONE;
  }
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < ceph_release_t::luminous) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
             backfillfull_ratio <= 0 ||
             nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use failsafe flag. ick. the monitor did something wrong or the user
    // did something stupid.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  }

  if (injectfull_state > NONE && injectfull) {
    inject = "(Injected)";
    return injectfull_state;
  } else if (pratio > failsafe_ratio) {
    return FAILSAFE;
  } else if (ratio > full_ratio) {
    return FULL;
  } else if (ratio > backfillfull_ratio) {
    return BACKFILLFULL;
  } else if (pratio > nearfull_ratio) {
    return NEARFULL;
  }
  return NONE;
}

void OSDService::check_full_status(float ratio, float pratio)
{
  std::lock_guard l(full_status_lock);

  cur_ratio = ratio;
  physical_ratio = pratio;

  string inject;
  s_names new_state;
  new_state = recalc_full_state(ratio, pratio, inject);

  dout(20) << __func__ << " cur ratio " << ratio
           << ", physical ratio " << pratio
           << ", new state " << get_full_state_name(new_state)
           << " " << inject
           << dendl;

  // warn
  if (cur_state != new_state) {
    dout(10) << __func__ << " " << get_full_state_name(cur_state)
             << " -> " << get_full_state_name(new_state) << dendl;
    if (new_state == FAILSAFE) {
      clog->error() << "full status failsafe engaged, dropping updates, now "
                    << (int)roundf(ratio * 100) << "% full";
    } else if (cur_state == FAILSAFE) {
      clog->error() << "full status failsafe disengaged, no longer dropping "
                    << "updates, now " << (int)roundf(ratio * 100) << "% full";
    }
    cur_state = new_state;
  }
}

bool OSDService::need_fullness_update()
{
  OSDMapRef osdmap = get_osdmap();
  s_names cur = NONE;
  if (osdmap->exists(whoami)) {
    if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
      cur = FULL;
    } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
      cur = BACKFILLFULL;
    } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
      cur = NEARFULL;
    }
  }
  s_names want = NONE;
  if (is_full())
    want = FULL;
  else if (is_backfillfull())
    want = BACKFILLFULL;
  else if (is_nearfull())
    want = NEARFULL;
  return want != cur;
}

bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
{
  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to return
    // failsafe full, or -1 to always return full
    if (injectfull > 0)
      --injectfull;
    ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
                       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
                       << dendl;
    return true;
  }
  return false;
}

bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
{
  std::lock_guard l(full_status_lock);

  if (_check_inject_full(dpp, type))
    return true;

  if (cur_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
                       << " physical " << physical_ratio << dendl;

  return cur_state >= type;
}

bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
{
  ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
  {
    std::lock_guard l(full_status_lock);
    if (_check_inject_full(dpp, type)) {
      return true;
    }
  }

  float pratio;
  float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);

  string notused;
  s_names tentative_state = recalc_full_state(ratio, pratio, notused);

  if (tentative_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;

  return tentative_state >= type;
}

bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FAILSAFE);
}

bool OSDService::check_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FULL);
}

bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
{
  return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
}

bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, BACKFILLFULL);
}

bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, NEARFULL);
}

bool OSDService::is_failsafe_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state == FAILSAFE;
}

bool OSDService::is_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= FULL;
}

bool OSDService::is_backfillfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}

bool OSDService::is_nearfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= NEARFULL;
}

void OSDService::set_injectfull(s_names type, int64_t count)
{
  std::lock_guard l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}

void OSDService::set_statfs(const struct store_statfs_t &stbuf,
                            osd_alert_list_t& alerts)
{
  uint64_t bytes = stbuf.total;
  uint64_t avail = stbuf.available;
  uint64_t used = stbuf.get_used_raw();

  // For testing, fake statfs values so it doesn't matter whether all
  // OSDs are using the same partition.
  if (cct->_conf->fake_statfs_for_testing) {
    uint64_t total_num_bytes = 0;
    vector<PGRef> pgs;
    osd->_get_pgs(&pgs);
    for (auto p : pgs) {
      total_num_bytes += p->get_stats_num_bytes();
    }
    bytes = cct->_conf->fake_statfs_for_testing;
    if (total_num_bytes < bytes)
      avail = bytes - total_num_bytes;
    else
      avail = 0;
    dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
            << " adjust available " << avail
            << dendl;
    used = bytes - avail;
  }

  logger->set(l_osd_stat_bytes, bytes);
  logger->set(l_osd_stat_bytes_used, used);
  logger->set(l_osd_stat_bytes_avail, avail);

  std::lock_guard l(stat_lock);
  osd_stat.statfs = stbuf;
  osd_stat.os_alerts.clear();
  osd_stat.os_alerts[whoami].swap(alerts);
  if (cct->_conf->fake_statfs_for_testing) {
    osd_stat.statfs.total = bytes;
    osd_stat.statfs.available = avail;
    // For testing we don't want 'used' to go negative, so clear reserved
    osd_stat.statfs.internally_reserved = 0;
  }
}

osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
                                    int num_pgs)
{
  utime_t now = ceph_clock_now();
  auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
  std::lock_guard l(stat_lock);
  osd_stat.hb_peers.swap(hb_peers);
  osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
  osd_stat.num_pgs = num_pgs;
  // Clean entries that aren't updated
  // This is called often enough that we can just remove 1 at a time
  for (auto i: osd_stat.hb_pingtime) {
    if (i.second.last_update == 0)
      continue;
    if (stale_time && now.sec() - i.second.last_update > stale_time) {
      dout(20) << __func__ << " time out heartbeat for osd " << i.first
               << " last_update " << i.second.last_update << dendl;
      osd_stat.hb_pingtime.erase(i.first);
      break;
    }
  }
  return osd_stat;
}

void OSDService::inc_osd_stat_repaired()
{
  std::lock_guard l(stat_lock);
  osd_stat.num_shards_repaired++;
  return;
}

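// Return used/total as it would look after consuming adjust_used
// additional bytes and after accounting for data still expected from
// pending backfills (each PG may bump the stats via pg_stat_adjust).
// *pratio reports the unadjusted, physical usage ratio.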
float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
                                         uint64_t adjust_used)
{
  *pratio =
    ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);

  if (adjust_used) {
    dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
    if (new_stat.statfs.available > adjust_used)
      new_stat.statfs.available -= adjust_used;
    else
      new_stat.statfs.available = 0;
    dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
  }

  // Check all pgs and adjust kb_used to include all pending backfill data
  int backfill_adjusted = 0;
  vector<PGRef> pgs;
  osd->_get_pgs(&pgs);
  for (auto p : pgs) {
    backfill_adjusted += p->pg_stat_adjust(&new_stat);
  }
  if (backfill_adjusted) {
    dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
  }
  return ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);
}

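// Send a message to a peer OSD using the reserved "next" map, dropping
// it instead if that map shows the peer as down or as having restarted
// after from_epoch.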
void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    m->put();
    release_map(next_map);
    return;
  }
  ConnectionRef peer_con;
  if (peer == whoami) {
    peer_con = osd->cluster_messenger->get_loopback_connection();
  } else {
    peer_con = osd->cluster_messenger->connect_to_osd(
      next_map->get_cluster_addrs(peer), false, true);
  }
  maybe_share_map(peer_con.get(), next_map);
  peer_con->send_message(m);
  release_map(next_map);
}

void OSDService::send_message_osd_cluster(std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  for (auto& iter : messages) {
    if (next_map->is_down(iter.first) ||
        next_map->get_info(iter.first).up_from > from_epoch) {
      iter.second->put();
      continue;
    }
    ConnectionRef peer_con;
    if (iter.first == whoami) {
      peer_con = osd->cluster_messenger->get_loopback_connection();
    } else {
      peer_con = osd->cluster_messenger->connect_to_osd(
        next_map->get_cluster_addrs(iter.first), false, true);
    }
    maybe_share_map(peer_con.get(), next_map);
    peer_con->send_message(iter.second);
  }
  release_map(next_map);
}

ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return NULL;
  }
  ConnectionRef con;
  if (peer == whoami) {
    con = osd->cluster_messenger->get_loopback_connection();
  } else {
    con = osd->cluster_messenger->connect_to_osd(
      next_map->get_cluster_addrs(peer), false, true);
  }
  release_map(next_map);
  return con;
}

pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  pair<ConnectionRef,ConnectionRef> ret;
  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return ret;
  }
  ret.first = osd->hb_back_client_messenger->connect_to_osd(
    next_map->get_hb_back_addrs(peer));
  ret.second = osd->hb_front_client_messenger->connect_to_osd(
    next_map->get_hb_front_addrs(peer));
  release_map(next_map);
  return ret;
}

entity_name_t OSDService::get_cluster_msgr_name() const
{
  return cluster_messenger->get_myname();
}

void OSDService::queue_want_pg_temp(pg_t pgid,
                                    const vector<int>& want,
                                    bool forced)
{
  std::lock_guard l(pg_temp_lock);
  auto p = pg_temp_pending.find(pgid);
  if (p == pg_temp_pending.end() ||
      p->second.acting != want ||
      forced) {
    pg_temp_wanted[pgid] = {want, forced};
  }
}

void OSDService::remove_want_pg_temp(pg_t pgid)
{
  std::lock_guard l(pg_temp_lock);
  pg_temp_wanted.erase(pgid);
  pg_temp_pending.erase(pgid);
}

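// Move all wanted pg_temp entries into the pending set. Note that
// std::map::merge only transfers keys not already present in
// pg_temp_pending; any duplicates left behind in pg_temp_wanted are
// then discarded by clear(), keeping the previously pending value.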
void OSDService::_sent_pg_temp()
{
#ifdef HAVE_STDLIB_MAP_SPLICING
  pg_temp_pending.merge(pg_temp_wanted);
#else
  pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
                         make_move_iterator(end(pg_temp_wanted)));
#endif
  pg_temp_wanted.clear();
}

void OSDService::requeue_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  // wanted overrides pending. note that remove_want_pg_temp
  // clears the item out of both.
  unsigned old_wanted = pg_temp_wanted.size();
  unsigned old_pending = pg_temp_pending.size();
  _sent_pg_temp();
  pg_temp_wanted.swap(pg_temp_pending);
  dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
           << pg_temp_wanted.size() << dendl;
}

std::ostream& operator<<(std::ostream& out,
                         const OSDService::pg_temp_t& pg_temp)
{
  out << pg_temp.acting;
  if (pg_temp.forced) {
    out << " (forced)";
  }
  return out;
}

void OSDService::send_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  if (pg_temp_wanted.empty())
    return;
  dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
  MOSDPGTemp *ms[2] = {nullptr, nullptr};
  for (auto& [pgid, pg_temp] : pg_temp_wanted) {
    auto& m = ms[pg_temp.forced];
    if (!m) {
      m = new MOSDPGTemp(osdmap->get_epoch());
      m->forced = pg_temp.forced;
    }
    m->pg_temp.emplace(pgid, pg_temp.acting);
  }
  for (auto m : ms) {
    if (m) {
      monc->send_mon_message(m);
    }
  }
  _sent_pg_temp();
}

void OSDService::send_pg_created(pg_t pgid)
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  if (o->require_osd_release >= ceph_release_t::luminous) {
    pg_created.insert(pgid);
    monc->send_mon_message(new MOSDPGCreated(pgid));
  }
}

void OSDService::send_pg_created()
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  if (o->require_osd_release >= ceph_release_t::luminous) {
    for (auto pgid : pg_created) {
      monc->send_mon_message(new MOSDPGCreated(pgid));
    }
  }
}

void OSDService::prune_pg_created()
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  auto i = pg_created.begin();
  while (i != pg_created.end()) {
    auto p = o->get_pg_pool(i->pool());
    if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
      dout(20) << __func__ << " pruning " << *i << dendl;
      i = pg_created.erase(i);
    } else {
      dout(20) << __func__ << " keeping " << *i << dendl;
      ++i;
    }
  }
}


// --------------------------------------
// dispatch

void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
                                 epoch_t *_bind_epoch) const
{
  std::lock_guard l(epoch_lock);
  if (_boot_epoch)
    *_boot_epoch = boot_epoch;
  if (_up_epoch)
    *_up_epoch = up_epoch;
  if (_bind_epoch)
    *_bind_epoch = bind_epoch;
}

void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
                            const epoch_t *_bind_epoch)
{
  std::lock_guard l(epoch_lock);
  if (_boot_epoch) {
    ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
    boot_epoch = *_boot_epoch;
  }
  if (_up_epoch) {
    ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
    up_epoch = *_up_epoch;
  }
  if (_bind_epoch) {
    ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
    bind_epoch = *_bind_epoch;
  }
}

bool OSDService::prepare_to_stop()
{
  std::unique_lock l(is_stopping_lock);
  if (get_state() != NOT_STOPPING)
    return false;

  OSDMapRef osdmap = get_osdmap();
  if (osdmap && osdmap->is_up(whoami)) {
    dout(0) << __func__ << " telling mon we are shutting down and dead " << dendl;
    set_state(PREPARING_TO_STOP);
    monc->send_mon_message(
      new MOSDMarkMeDown(
        monc->get_fsid(),
        whoami,
        osdmap->get_addrs(whoami),
        osdmap->get_epoch(),
        true,  // request ack
        true   // mark as down and dead
        ));
    const auto timeout = ceph::make_timespan(cct->_conf->osd_mon_shutdown_timeout);
    is_stopping_cond.wait_for(l, timeout,
                              [this] { return get_state() == STOPPING; });
  }

  dout(0) << __func__ << " starting shutdown" << dendl;
  set_state(STOPPING);
  return true;
}

void OSDService::got_stop_ack()
{
  std::scoped_lock l(is_stopping_lock);
  if (get_state() == PREPARING_TO_STOP) {
    dout(0) << __func__ << " starting shutdown" << dendl;
    set_state(STOPPING);
    is_stopping_cond.notify_all();
  } else {
    dout(10) << __func__ << " ignoring msg" << dendl;
  }
}

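// Build an MOSDMap covering (since, to]: incremental maps where we
// have them, falling back to a full map when an incremental is
// missing, bounded by osd_map_message_max / osd_map_message_max_bytes.
// If nothing at all can be loaded, we still try to send the newest map
// so the peer can make progress.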
MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
                                               OSDSuperblock& sblock)
{
  MOSDMap *m = new MOSDMap(monc->get_fsid(),
                           osdmap->get_encoding_features());
  m->oldest_map = max_oldest_map;
  m->newest_map = sblock.newest_map;

  int max = cct->_conf->osd_map_message_max;
  ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;

  if (since < m->oldest_map) {
    // we don't have the next map the target wants, so start with a
    // full map.
    bufferlist bl;
    dout(10) << __func__ << " oldest map " << max_oldest_map << " > since "
             << since << ", starting with full map" << dendl;
    since = m->oldest_map;
    if (!get_map_bl(since, bl)) {
      derr << __func__ << " missing full map " << since << dendl;
      goto panic;
    }
    max--;
    max_bytes -= bl.length();
    m->maps[since] = std::move(bl);
  }
  for (epoch_t e = since + 1; e <= to; ++e) {
    bufferlist bl;
    if (get_inc_map_bl(e, bl)) {
      m->incremental_maps[e] = std::move(bl);
    } else {
      dout(10) << __func__ << " missing incremental map " << e << dendl;
      if (!get_map_bl(e, bl)) {
        derr << __func__ << " also missing full map " << e << dendl;
        goto panic;
      }
      m->maps[e] = std::move(bl);
    }
    max--;
    max_bytes -= bl.length();
    if (max <= 0 || max_bytes <= 0) {
      break;
    }
  }
  return m;

 panic:
  if (!m->maps.empty() ||
      !m->incremental_maps.empty()) {
    // send what we have so far
    return m;
  }
  // send something
  bufferlist bl;
  if (get_inc_map_bl(m->newest_map, bl)) {
    m->incremental_maps[m->newest_map] = std::move(bl);
  } else {
    derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
    if (!get_map_bl(m->newest_map, bl)) {
      derr << __func__ << " unable to load latest full map " << m->newest_map
           << dendl;
      ceph_abort();
    }
    m->maps[m->newest_map] = std::move(bl);
  }
  return m;
}

void OSDService::send_map(MOSDMap *m, Connection *con)
{
  con->send_message(m);
}

void OSDService::send_incremental_map(epoch_t since, Connection *con,
                                      const OSDMapRef& osdmap)
{
  epoch_t to = osdmap->get_epoch();
  dout(10) << "send_incremental_map " << since << " -> " << to
           << " to " << con << " " << con->get_peer_addr() << dendl;

  MOSDMap *m = NULL;
  while (!m) {
    OSDSuperblock sblock(get_superblock());
    if (since < sblock.oldest_map) {
      // just send latest full map
      MOSDMap *m = new MOSDMap(monc->get_fsid(),
                               osdmap->get_encoding_features());
      m->oldest_map = max_oldest_map;
      m->newest_map = sblock.newest_map;
      get_map_bl(to, m->maps[to]);
      send_map(m, con);
      return;
    }

    if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
      dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
               << ", only sending most recent" << dendl;
      since = to - cct->_conf->osd_map_share_max_epochs;
    }

    m = build_incremental_map_msg(since, to, sblock);
  }
  send_map(m, con);
}

bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
{
  bool found = map_bl_cache.lookup(e, &bl);
  if (found) {
    logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
                      OSD::get_osdmap_pobject_name(e), 0, 0, bl,
                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_bl(e, bl);
  }
  return found;
}

bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
{
  std::lock_guard l(map_cache_lock);
  bool found = map_bl_inc_cache.lookup(e, &bl);
  if (found) {
    logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
                      OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_inc_bl(e, bl);
  }
  return found;
}

void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_cache.add(e, bl);
}

void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_inc_cache.add(e, bl);
}

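// Insert a decoded OSDMap into the map cache, optionally deduplicating
// shared state against a cached map at a nearby epoch to save memory.
// If another caller raced us, the already-cached map wins and our copy
// is deleted.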
OSDMapRef OSDService::_add_map(OSDMap *o)
{
  epoch_t e = o->get_epoch();

  if (cct->_conf->osd_map_dedup) {
    // Dedup against an existing map at a nearby epoch
    OSDMapRef for_dedup = map_cache.lower_bound(e);
    if (for_dedup) {
      OSDMap::dedup(for_dedup.get(), o);
    }
  }
  bool existed;
  OSDMapRef l = map_cache.add(e, o, &existed);
  if (existed) {
    delete o;
  }
  return l;
}

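// Look an epoch up in the map cache, loading and decoding it from the
// store on a miss; returns a null ref if the full map cannot be read.
// Misses below the cache's lower bound are counted separately so the
// perf counters show how far back callers are reaching.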
OSDMapRef OSDService::try_get_map(epoch_t epoch)
{
  std::lock_guard l(map_cache_lock);
  OSDMapRef retval = map_cache.lookup(epoch);
  if (retval) {
    dout(30) << "get_map " << epoch << " -cached" << dendl;
    logger->inc(l_osd_map_cache_hit);
    return retval;
  }
  {
    logger->inc(l_osd_map_cache_miss);
    epoch_t lb = map_cache.cached_key_lower_bound();
    if (epoch < lb) {
      dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
      logger->inc(l_osd_map_cache_miss_low);
      logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
    }
  }

  OSDMap *map = new OSDMap;
  if (epoch > 0) {
    dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
    bufferlist bl;
    if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
      derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
      delete map;
      return OSDMapRef();
    }
    map->decode(bl);
  } else {
    dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
  }
  return _add_map(map);
}

// ops


void OSDService::reply_op_error(OpRequestRef op, int err)
{
  reply_op_error(op, err, eversion_t(), 0, {});
}

void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
                                version_t uv,
                                vector<pg_log_op_return_item_t> op_returns)
{
  auto m = op->get_req<MOSDOp>();
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
  int flags;
  flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);

  MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
                                       !m->has_flag(CEPH_OSD_FLAG_RETURNVEC));
  reply->set_reply_versions(v, uv);
  reply->set_op_returns(op_returns);
  m->get_connection()->send_message(reply);
}

void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
{
  if (!cct->_conf->osd_debug_misdirected_ops) {
    return;
  }

  auto m = op->get_req<MOSDOp>();
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);

  ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);

  if (pg->is_ec_pg()) {
    /**
     * OSD recomputes op target based on current OSDMap. With an EC pg, we
     * can get this result:
     * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
     *    [CRUSH_ITEM_NONE, 2, 3]/3
     * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
     *    [3, 2, 3]/3
     * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
     *    -- misdirected op
     * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
     *    it and fulfils it
     *
     * We can't compute the op target based on the sending map epoch due to
     * splitting. The simplest thing is to detect such cases here and drop
     * them without an error (the client will resend anyway).
     */
    ceph_assert(m->get_map_epoch() <= superblock.newest_map);
    OSDMapRef opmap = try_get_map(m->get_map_epoch());
    if (!opmap) {
      dout(7) << __func__ << ": " << *pg << " no longer have map for "
              << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
    pg_t _pgid = m->get_raw_pg();
    spg_t pgid;
    if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
      _pgid = opmap->raw_pg_to_pg(_pgid);
    if (opmap->get_primary_shard(_pgid, &pgid) &&
        pgid.shard != pg->pg_id.shard) {
      dout(7) << __func__ << ": " << *pg << " primary changed since "
              << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
  }

  dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
  clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
               << " pg " << m->get_raw_pg()
               << " to osd." << whoami
               << " not " << pg->get_acting()
               << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
}

void OSDService::enqueue_back(OpSchedulerItem&& qi)
{
  osd->op_shardedwq.queue(std::move(qi));
}

void OSDService::enqueue_front(OpSchedulerItem&& qi)
{
  osd->op_shardedwq.queue_front(std::move(qi));
}

void OSDService::queue_recovery_context(
  PG *pg,
  GenContext<ThreadPool::TPHandle&> *c)
{
  epoch_t e = get_osdmap_epoch();
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
        new PGRecoveryContext(pg->get_pgid(), c, e)),
      cct->_conf->osd_recovery_cost,
      cct->_conf->osd_recovery_priority,
      ceph_clock_now(),
      0,
      e));
}

void OSDService::queue_for_snap_trim(PG *pg)
{
  dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
        new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
      cct->_conf->osd_snap_trim_cost,
      cct->_conf->osd_snap_trim_priority,
      ceph_clock_now(),
      0,
      pg->get_osdmap_epoch()));
}

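// Shared helper behind the queue_scrub_*() wrappers below: build a
// scrub-FSM event message of type MSG_TYPE for this PG and enqueue it
// on the op scheduler at the priority implied by with_priority.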
1670 template <class MSG_TYPE>
1671 void OSDService::queue_scrub_event_msg(PG* pg,
1672 Scrub::scrub_prio_t with_priority,
1673 unsigned int qu_priority,
1674 Scrub::act_token_t act_token)
1675 {
1676 const auto epoch = pg->get_osdmap_epoch();
1677 auto msg = new MSG_TYPE(pg->get_pgid(), epoch, act_token);
1678 dout(15) << "queue a scrub event (" << *msg << ") for " << *pg
1679 << ". Epoch: " << epoch << " token: " << act_token << dendl;
1680
1681 enqueue_back(OpSchedulerItem(
1682 unique_ptr<OpSchedulerItem::OpQueueable>(msg), cct->_conf->osd_scrub_cost,
1683 pg->scrub_requeue_priority(with_priority, qu_priority), ceph_clock_now(), 0, epoch));
1684 }
1685
1686 template <class MSG_TYPE>
1687 void OSDService::queue_scrub_event_msg(PG* pg,
1688 Scrub::scrub_prio_t with_priority)
1689 {
1690 const auto epoch = pg->get_osdmap_epoch();
1691 auto msg = new MSG_TYPE(pg->get_pgid(), epoch);
1692 dout(15) << "queue a scrub event (" << *msg << ") for " << *pg << ". Epoch: " << epoch << dendl;
1693
1694 enqueue_back(OpSchedulerItem(
1695 unique_ptr<OpSchedulerItem::OpQueueable>(msg), cct->_conf->osd_scrub_cost,
1696 pg->scrub_requeue_priority(with_priority), ceph_clock_now(), 0, epoch));
1697 }
1698
1699 void OSDService::queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority)
1700 {
1701 queue_scrub_event_msg<PGScrub>(pg, with_priority);
1702 }
1703
1704 void OSDService::queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority)
1705 {
1706 queue_scrub_event_msg<PGScrubAfterRepair>(pg, with_priority);
1707 }
1708
1709 void OSDService::queue_for_rep_scrub(PG* pg,
1710 Scrub::scrub_prio_t with_priority,
1711 unsigned int qu_priority,
1712 Scrub::act_token_t act_token)
1713 {
1714 queue_scrub_event_msg<PGRepScrub>(pg, with_priority, qu_priority, act_token);
1715 }
1716
1717 void OSDService::queue_for_rep_scrub_resched(PG* pg,
1718 Scrub::scrub_prio_t with_priority,
1719 unsigned int qu_priority,
1720 Scrub::act_token_t act_token)
1721 {
1722 // Resulting scrub event: 'SchedReplica'
1723 queue_scrub_event_msg<PGRepScrubResched>(pg, with_priority, qu_priority,
1724 act_token);
1725 }
1726
1727 void OSDService::queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority)
1728 {
1729 // Resulting scrub event: 'RemotesReserved'
1730 queue_scrub_event_msg<PGScrubResourcesOK>(pg, with_priority);
1731 }
1732
1733 void OSDService::queue_for_scrub_denied(PG* pg, Scrub::scrub_prio_t with_priority)
1734 {
1735 // Resulting scrub event: 'ReservationFailure'
1736 queue_scrub_event_msg<PGScrubDenied>(pg, with_priority);
1737 }
1738
1739 void OSDService::queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority)
1740 {
1741 // Resulting scrub event: 'InternalSchedScrub'
1742 queue_scrub_event_msg<PGScrubResched>(pg, with_priority);
1743 }
1744
1745 void OSDService::queue_scrub_pushes_update(PG* pg, Scrub::scrub_prio_t with_priority)
1746 {
1747 // Resulting scrub event: 'ActivePushesUpd'
1748 queue_scrub_event_msg<PGScrubPushesUpdate>(pg, with_priority);
1749 }
1750
1751 void OSDService::queue_scrub_chunk_free(PG* pg, Scrub::scrub_prio_t with_priority)
1752 {
1753 // Resulting scrub event: 'SelectedChunkFree'
1754 queue_scrub_event_msg<PGScrubChunkIsFree>(pg, with_priority);
1755 }
1756
1757 void OSDService::queue_scrub_chunk_busy(PG* pg, Scrub::scrub_prio_t with_priority)
1758 {
1759 // Resulting scrub event: 'ChunkIsBusy'
1760 queue_scrub_event_msg<PGScrubChunkIsBusy>(pg, with_priority);
1761 }
1762
1763 void OSDService::queue_scrub_applied_update(PG* pg, Scrub::scrub_prio_t with_priority)
1764 {
1765 queue_scrub_event_msg<PGScrubAppliedUpdate>(pg, with_priority);
1766 }
1767
1768 void OSDService::queue_scrub_unblocking(PG* pg, Scrub::scrub_prio_t with_priority)
1769 {
1770 // Resulting scrub event: 'Unblocked'
1771 queue_scrub_event_msg<PGScrubUnblocked>(pg, with_priority);
1772 }
1773
1774 void OSDService::queue_scrub_digest_update(PG* pg, Scrub::scrub_prio_t with_priority)
1775 {
1776 // Resulting scrub event: 'DigestUpdate'
1777 queue_scrub_event_msg<PGScrubDigestUpdate>(pg, with_priority);
1778 }
1779
1780 void OSDService::queue_scrub_got_local_map(PG* pg, Scrub::scrub_prio_t with_priority)
1781 {
1782 // Resulting scrub event: 'IntLocalMapDone'
1783 queue_scrub_event_msg<PGScrubGotLocalMap>(pg, with_priority);
1784 }
1785
1786 void OSDService::queue_scrub_got_repl_maps(PG* pg, Scrub::scrub_prio_t with_priority)
1787 {
1788 // Resulting scrub event: 'GotReplicas'
1789 queue_scrub_event_msg<PGScrubGotReplMaps>(pg, with_priority);
1790 }
1791
1792 void OSDService::queue_scrub_maps_compared(PG* pg, Scrub::scrub_prio_t with_priority)
1793 {
1794 // Resulting scrub event: 'MapsCompared'
1795 queue_scrub_event_msg<PGScrubMapsCompared>(pg, with_priority);
1796 }
1797
1798 void OSDService::queue_scrub_replica_pushes(PG *pg, Scrub::scrub_prio_t with_priority)
1799 {
1800 // Resulting scrub event: 'ReplicaPushesUpd'
1801 queue_scrub_event_msg<PGScrubReplicaPushes>(pg, with_priority);
1802 }
1803
1804 void OSDService::queue_scrub_is_finished(PG *pg)
1805 {
1806 // Resulting scrub event: 'ScrubFinished'
1807 queue_scrub_event_msg<PGScrubScrubFinished>(pg, Scrub::scrub_prio_t::high_priority);
1808 }
1809
1810 void OSDService::queue_scrub_next_chunk(PG *pg, Scrub::scrub_prio_t with_priority)
1811 {
1812 // Resulting scrub event: 'NextChunk'
1813 queue_scrub_event_msg<PGScrubGetNextChunk>(pg, with_priority);
1814 }
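// All of the queue_*scrub* wrappers in this block funnel through
// queue_scrub_event_msg<T>() (defined earlier in this file), which wraps
// the typed message in an OpSchedulerItem and enqueues it on the op queue;
// the per-wrapper comments name the scrub-FSM event the dequeued item
// delivers to the PG's scrubber. A condensed sketch of that template's
// shape, for orientation only (not a verbatim copy):
//
//   template <class MSG_TYPE>
//   void OSDService::queue_scrub_event_msg(PG* pg,
//                                          Scrub::scrub_prio_t with_priority)
//   {
//     const auto epoch = pg->get_osdmap_epoch();
//     enqueue_back(OpSchedulerItem(
//       std::make_unique<MSG_TYPE>(pg->get_pgid(), epoch),
//       cct->_conf->osd_scrub_cost,
//       pg->scrub_requeue_priority(with_priority),
//       ceph_clock_now(), 0, epoch));
//   }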
1815
1816 void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
1817 {
1818 dout(10) << __func__ << " on " << pgid << " e " << e << dendl;
1819 enqueue_back(
1820 OpSchedulerItem(
1821 unique_ptr<OpSchedulerItem::OpQueueable>(
1822 new PGDelete(pgid, e)),
1823 cct->_conf->osd_pg_delete_cost,
1824 cct->_conf->osd_pg_delete_priority,
1825 ceph_clock_now(),
1826 0,
1827 e));
1828 }
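// For orientation: the trailing OpSchedulerItem constructor arguments
// above are, in order, cost, priority, submit time, owner (client
// instance id; 0 for OSD-internal work such as PG deletion), and the map
// epoch at which the item was queued.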
1829
1830 bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
1831 {
1832 return osd->try_finish_pg_delete(pg, old_pg_num);
1833 }
1834
1835 // ---
1836
1837 void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
1838 {
1839 std::lock_guard l(merge_lock);
1840 dout(10) << __func__ << " " << pg->pg_id << dendl;
1841 ready_to_merge_source[pg->pg_id.pgid] = version;
1842 assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
1843 _send_ready_to_merge();
1844 }
1845
1846 void OSDService::set_ready_to_merge_target(PG *pg,
1847 eversion_t version,
1848 epoch_t last_epoch_started,
1849 epoch_t last_epoch_clean)
1850 {
1851 std::lock_guard l(merge_lock);
1852 dout(10) << __func__ << " " << pg->pg_id << dendl;
1853 ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
1854 make_tuple(version,
1855 last_epoch_started,
1856 last_epoch_clean)));
1857 assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
1858 _send_ready_to_merge();
1859 }
1860
1861 void OSDService::set_not_ready_to_merge_source(pg_t source)
1862 {
1863 std::lock_guard l(merge_lock);
1864 dout(10) << __func__ << " " << source << dendl;
1865 not_ready_to_merge_source.insert(source);
1866 assert(ready_to_merge_source.count(source) == 0);
1867 _send_ready_to_merge();
1868 }
1869
1870 void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
1871 {
1872 std::lock_guard l(merge_lock);
1873 dout(10) << __func__ << " " << target << " source " << source << dendl;
1874 not_ready_to_merge_target[target] = source;
1875 assert(ready_to_merge_target.count(target) == 0);
1876 _send_ready_to_merge();
1877 }
1878
1879 void OSDService::send_ready_to_merge()
1880 {
1881 std::lock_guard l(merge_lock);
1882 _send_ready_to_merge();
1883 }
1884
1885 void OSDService::_send_ready_to_merge()
1886 {
1887 dout(20) << __func__
1888 << " ready_to_merge_source " << ready_to_merge_source
1889 << " not_ready_to_merge_source " << not_ready_to_merge_source
1890 << " ready_to_merge_target " << ready_to_merge_target
1891 << " not_ready_to_merge_target " << not_ready_to_merge_target
1892 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1893 << dendl;
1894 for (auto src : not_ready_to_merge_source) {
1895 if (sent_ready_to_merge_source.count(src) == 0) {
1896 monc->send_mon_message(new MOSDPGReadyToMerge(
1897 src,
1898 {}, {}, 0, 0,
1899 false,
1900 osdmap->get_epoch()));
1901 sent_ready_to_merge_source.insert(src);
1902 }
1903 }
1904 for (auto p : not_ready_to_merge_target) {
1905 if (sent_ready_to_merge_source.count(p.second) == 0) {
1906 monc->send_mon_message(new MOSDPGReadyToMerge(
1907 p.second,
1908 {}, {}, 0, 0,
1909 false,
1910 osdmap->get_epoch()));
1911 sent_ready_to_merge_source.insert(p.second);
1912 }
1913 }
1914 for (auto src : ready_to_merge_source) {
1915 if (not_ready_to_merge_source.count(src.first) ||
1916 not_ready_to_merge_target.count(src.first.get_parent())) {
1917 continue;
1918 }
1919 auto p = ready_to_merge_target.find(src.first.get_parent());
1920 if (p != ready_to_merge_target.end() &&
1921 sent_ready_to_merge_source.count(src.first) == 0) {
1922 monc->send_mon_message(new MOSDPGReadyToMerge(
1923 src.first, // source pgid
1924 src.second, // src version
1925 std::get<0>(p->second), // target version
1926 std::get<1>(p->second), // PG's last_epoch_started
1927 std::get<2>(p->second), // PG's last_epoch_clean
1928 true,
1929 osdmap->get_epoch()));
1930 sent_ready_to_merge_source.insert(src.first);
1931 }
1932 }
1933 }
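// _send_ready_to_merge() makes three passes over the merge state, all
// under merge_lock:
//   1) NAK every merge whose source PG has reported itself not ready;
//   2) NAK every merge whose target PG is not ready, keyed by the paired
//      source pgid (the mon tracks readiness per source);
//   3) ACK a merge only once the source and its parent target have both
//      reported ready and neither appears in a not-ready set.
// sent_ready_to_merge_source de-duplicates, so each source pgid is
// reported to the mon at most once until it is cleared or pruned.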
1934
1935 void OSDService::clear_ready_to_merge(PG *pg)
1936 {
1937 std::lock_guard l(merge_lock);
1938 dout(10) << __func__ << " " << pg->pg_id << dendl;
1939 ready_to_merge_source.erase(pg->pg_id.pgid);
1940 ready_to_merge_target.erase(pg->pg_id.pgid);
1941 not_ready_to_merge_source.erase(pg->pg_id.pgid);
1942 not_ready_to_merge_target.erase(pg->pg_id.pgid);
1943 sent_ready_to_merge_source.erase(pg->pg_id.pgid);
1944 }
1945
1946 void OSDService::clear_sent_ready_to_merge()
1947 {
1948 std::lock_guard l(merge_lock);
1949 sent_ready_to_merge_source.clear();
1950 }
1951
1952 void OSDService::prune_sent_ready_to_merge(const OSDMapRef& osdmap)
1953 {
1954 std::lock_guard l(merge_lock);
1955 auto i = sent_ready_to_merge_source.begin();
1956 while (i != sent_ready_to_merge_source.end()) {
1957 if (!osdmap->pg_exists(*i)) {
1958 dout(10) << __func__ << " " << *i << dendl;
1959 i = sent_ready_to_merge_source.erase(i);
1960 } else {
1961 ++i;
1962 }
1963 }
1964 }
1965
1966 // ---
1967
1968 void OSDService::_queue_for_recovery(
1969 std::pair<epoch_t, PGRef> p,
1970 uint64_t reserved_pushes)
1971 {
1972 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
1973 enqueue_back(
1974 OpSchedulerItem(
1975 unique_ptr<OpSchedulerItem::OpQueueable>(
1976 new PGRecovery(
1977 p.second->get_pgid(), p.first, reserved_pushes)),
1978 cct->_conf->osd_recovery_cost,
1979 cct->_conf->osd_recovery_priority,
1980 ceph_clock_now(),
1981 0,
1982 p.first));
1983 }
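// The PGRecovery item above carries the epoch at which recovery was
// queued together with the number of reserved pushes; the dequeue path
// (OSD::do_recovery) is expected to release those reserved pushes once
// the recovery attempt finishes. The assert documents that callers
// queue while already holding recovery_lock.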
1984
1985 // ====================================================================
1986 // OSD
1987
1988 #undef dout_prefix
1989 #define dout_prefix *_dout
1990
1991 // Commands shared between OSD's console and admin console:
1992 namespace ceph::osd_cmds {
1993
1994 int heap(CephContext& cct,
1995 const cmdmap_t& cmdmap,
1996 std::ostream& outos,
1997 std::ostream& erros);
1998
1999 } // namespace ceph::osd_cmds
2000
2001 int OSD::mkfs(CephContext *cct,
2002 std::unique_ptr<ObjectStore> store,
2003 uuid_d fsid,
2004 int whoami,
2005 string osdspec_affinity)
2006 {
2007 int ret;
2008
2009 OSDSuperblock sb;
2010 bufferlist sbbl;
2011 // if we are fed a uuid for this osd, use it.
2012 store->set_fsid(cct->_conf->osd_uuid);
2013
2014 ret = store->mkfs();
2015 if (ret) {
2016 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
2017 << cpp_strerror(ret) << dendl;
2018 return ret;
2019 }
2020
2021 store->set_cache_shards(1); // doesn't matter for mkfs!
2022
2023 ret = store->mount();
2024 if (ret) {
2025 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
2026 << cpp_strerror(ret) << dendl;
2027 return ret;
2028 }
2029
2030 auto umount_store = make_scope_guard([&] {
2031 store->umount();
2032 });
2033
2034 ObjectStore::CollectionHandle ch =
2035 store->open_collection(coll_t::meta());
2036 if (ch) {
2037 ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
2038 if (ret < 0) {
2039 derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
2040 return ret;
2041 }
2042 /* if we already have a superblock, validate its contents */
2043 dout(0) << " have superblock" << dendl;
2044 auto p = sbbl.cbegin();
2045 decode(sb, p);
2046 if (whoami != sb.whoami) {
2047 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
2048 << dendl;
2049 return -EINVAL;
2050 }
2051 if (fsid != sb.cluster_fsid) {
2052 derr << "provided cluster fsid " << fsid
2053 << " != superblock's " << sb.cluster_fsid << dendl;
2054 return -EINVAL;
2055 }
2056 } else {
2057 // create superblock
2058 sb.cluster_fsid = fsid;
2059 sb.osd_fsid = store->get_fsid();
2060 sb.whoami = whoami;
2061 sb.compat_features = get_osd_initial_compat_set();
2062
2063 bufferlist bl;
2064 encode(sb, bl);
2065
2066 ObjectStore::CollectionHandle ch = store->create_new_collection(
2067 coll_t::meta());
2068 ObjectStore::Transaction t;
2069 t.create_collection(coll_t::meta(), 0);
2070 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
2071 ret = store->queue_transaction(ch, std::move(t));
2072 if (ret) {
2073 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
2074 << "queue_transaction returned " << cpp_strerror(ret) << dendl;
2075 return ret;
2076 }
2077 ch->flush();
2078 }
2079
2080 ret = write_meta(cct, store.get(), sb.cluster_fsid, sb.osd_fsid, whoami, osdspec_affinity);
2081 if (ret) {
2082 derr << "OSD::mkfs: failed to write fsid file: error "
2083 << cpp_strerror(ret) << dendl;
2084 }
2085 return ret;
2086 }
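// mkfs() in brief: apply any provided fsid, ObjectStore::mkfs(), mount,
// then either validate an existing superblock (osd id and cluster fsid
// must match what we were given) or create and persist a fresh one, and
// finally stamp the metadata keys via write_meta(). The scope guard
// ensures the store is unmounted on every exit path.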
2087
2088 int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami, string& osdspec_affinity)
2089 {
2090 char val[80];
2091 int r;
2092
2093 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
2094 r = store->write_meta("magic", val);
2095 if (r < 0)
2096 return r;
2097
2098 snprintf(val, sizeof(val), "%d", whoami);
2099 r = store->write_meta("whoami", val);
2100 if (r < 0)
2101 return r;
2102
2103 cluster_fsid.print(val);
2104 r = store->write_meta("ceph_fsid", val);
2105 if (r < 0)
2106 return r;
2107
2108 string key = cct->_conf.get_val<string>("key");
2109 if (key.size()) {
2110 r = store->write_meta("osd_key", key);
2111 if (r < 0)
2112 return r;
2113 } else {
2114 string keyfile = cct->_conf.get_val<string>("keyfile");
2115 if (!keyfile.empty()) {
2116 bufferlist keybl;
2117 string err;
2118 r = keybl.read_file(keyfile.c_str(), &err);
2119 if (r < 0) {
2120 derr << __func__ << " failed to read keyfile " << keyfile << ": "
2121 << err << ": " << cpp_strerror(r) << dendl;
2122 return r;
2123 }
2124 r = store->write_meta("osd_key", keybl.to_str());
2125 if (r < 0)
2126 return r;
2127 }
2128 }
2129 if (!osdspec_affinity.empty()) {
2130 r = store->write_meta("osdspec_affinity", osdspec_affinity.c_str());
2131 if (r < 0)
2132 return r;
2133 }
2134
2135 r = store->write_meta("ready", "ready");
2136 if (r < 0)
2137 return r;
2138
2139 return 0;
2140 }
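// write_meta() leaves the keys "magic", "whoami", "ceph_fsid",
// optionally "osd_key" / "osdspec_affinity", and finally "ready" in the
// store's metadata; peek_meta() below reads several of them back (plus
// "fsid" and "require_osd_release", which are written elsewhere during
// mkfs and upgrade), so the two functions must stay in sync.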
2141
2142 int OSD::peek_meta(ObjectStore *store,
2143 std::string *magic,
2144 uuid_d *cluster_fsid,
2145 uuid_d *osd_fsid,
2146 int *whoami,
2147 ceph_release_t *require_osd_release)
2148 {
2149 string val;
2150
2151 int r = store->read_meta("magic", &val);
2152 if (r < 0)
2153 return r;
2154 *magic = val;
2155
2156 r = store->read_meta("whoami", &val);
2157 if (r < 0)
2158 return r;
2159 *whoami = atoi(val.c_str());
2160
2161 r = store->read_meta("ceph_fsid", &val);
2162 if (r < 0)
2163 return r;
2164 r = cluster_fsid->parse(val.c_str());  // parse() returns true on success
2165 if (!r)
2166 return -EINVAL;
2167
2168 r = store->read_meta("fsid", &val);
2169 if (r < 0) {
2170 *osd_fsid = uuid_d();
2171 } else {
2172 r = osd_fsid->parse(val.c_str());
2173 if (!r)
2174 return -EINVAL;
2175 }
2176
2177 r = store->read_meta("require_osd_release", &val);
2178 if (r >= 0) {
2179 *require_osd_release = ceph_release_from_name(val);
2180 }
2181
2182 return 0;
2183 }
2184
2185
2186 #undef dout_prefix
2187 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2188
2189 // cons/des
2190
2191 OSD::OSD(CephContext *cct_,
2192 std::unique_ptr<ObjectStore> store_,
2193 int id,
2194 Messenger *internal_messenger,
2195 Messenger *external_messenger,
2196 Messenger *hb_client_front,
2197 Messenger *hb_client_back,
2198 Messenger *hb_front_serverm,
2199 Messenger *hb_back_serverm,
2200 Messenger *osdc_messenger,
2201 MonClient *mc,
2202 const std::string &dev, const std::string &jdev,
2203 ceph::async::io_context_pool& poolctx) :
2204 Dispatcher(cct_),
2205 tick_timer(cct, osd_lock),
2206 tick_timer_without_osd_lock(cct, tick_timer_lock),
2207 gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
2208 cluster_messenger(internal_messenger),
2209 client_messenger(external_messenger),
2210 objecter_messenger(osdc_messenger),
2211 monc(mc),
2212 mgrc(cct_, client_messenger, &mc->monmap),
2213 logger(create_logger()),
2214 recoverystate_perf(create_recoverystate_perf()),
2215 store(std::move(store_)),
2216 log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
2217 clog(log_client.create_channel()),
2218 whoami(id),
2219 dev_path(dev), journal_path(jdev),
2220 store_is_rotational(store->is_rotational()),
2221 trace_endpoint("0.0.0.0", 0, "osd"),
2222 asok_hook(NULL),
2223 m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
2224 "osd_pg_epoch_max_lag_factor")),
2225 osd_compat(get_osd_compat_set()),
2226 osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
2227 get_num_op_threads()),
2228 heartbeat_stop(false),
2229 heartbeat_need_update(true),
2230 hb_front_client_messenger(hb_client_front),
2231 hb_back_client_messenger(hb_client_back),
2232 hb_front_server_messenger(hb_front_serverm),
2233 hb_back_server_messenger(hb_back_serverm),
2234 daily_loadavg(0.0),
2235 heartbeat_thread(this),
2236 heartbeat_dispatcher(this),
2237 op_tracker(cct, cct->_conf->osd_enable_op_tracker,
2238 cct->_conf->osd_num_op_tracker_shard),
2239 test_ops_hook(NULL),
2240 op_shardedwq(
2241 this,
2242 ceph::make_timespan(cct->_conf->osd_op_thread_timeout),
2243 ceph::make_timespan(cct->_conf->osd_op_thread_suicide_timeout),
2244 &osd_op_tp),
2245 last_pg_create_epoch(0),
2246 boot_finisher(cct),
2247 up_thru_wanted(0),
2248 requested_full_first(0),
2249 requested_full_last(0),
2250 service(this, poolctx)
2251 {
2252
2253 if (!gss_ktfile_client.empty()) {
2254 // Assert we can export environment variable
2255 /*
2256 The default client keytab is used, if it is present and readable,
2257 to automatically obtain initial credentials for GSSAPI client
2258 applications. The principal name of the first entry in the client
2259 keytab is used by default when obtaining initial credentials. The keytab location is resolved in this order:
2260 1. The KRB5_CLIENT_KTNAME environment variable.
2261 2. The default_client_keytab_name profile variable in [libdefaults].
2262 3. The hardcoded default, DEFCKTNAME.
2263 */
2264 const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
2265 gss_ktfile_client.c_str(), 1));
2266 ceph_assert(set_result == 0);
2267 }
2268
2269 monc->set_messenger(client_messenger);
2270 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
2271 cct->_conf->osd_op_log_threshold);
2272 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
2273 cct->_conf->osd_op_history_duration);
2274 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
2275 cct->_conf->osd_op_history_slow_op_threshold);
2276 ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
2277 #ifdef WITH_BLKIN
2278 std::stringstream ss;
2279 ss << "osd." << whoami;
2280 trace_endpoint.copy_name(ss.str());
2281 #endif
2282
2283 // initialize shards
2284 num_shards = get_num_op_shards();
2285 for (uint32_t i = 0; i < num_shards; i++) {
2286 OSDShard *one_shard = new OSDShard(
2287 i,
2288 cct,
2289 this);
2290 shards.push_back(one_shard);
2291 }
2292 }
2293
2294 OSD::~OSD()
2295 {
2296 while (!shards.empty()) {
2297 delete shards.back();
2298 shards.pop_back();
2299 }
2300 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2301 cct->get_perfcounters_collection()->remove(logger);
2302 delete recoverystate_perf;
2303 delete logger;
2304 }
2305
2306 double OSD::get_tick_interval() const
2307 {
2308 // vary +/- 5% to avoid scrub scheduling livelocks
2309 constexpr auto delta = 0.05;
2310 return (OSD_TICK_INTERVAL *
2311 ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
2312 }
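// Worked example: with the usual OSD_TICK_INTERVAL of 1 second, the
// returned interval is uniformly distributed in [0.95, 1.05] s, so ticks
// on different OSDs (and successive ticks on one OSD) drift apart rather
// than locking into step with scrub scheduling.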
2313
2314 void OSD::handle_signal(int signum)
2315 {
2316 ceph_assert(signum == SIGINT || signum == SIGTERM);
2317 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
2318 shutdown();
2319 }
2320
2321 int OSD::pre_init()
2322 {
2323 std::lock_guard lock(osd_lock);
2324 if (is_stopping())
2325 return 0;
2326
2327 if (store->test_mount_in_use()) {
2328 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2329 << "currently in use. (Is ceph-osd already running?)" << dendl;
2330 return -EBUSY;
2331 }
2332
2333 cct->_conf.add_observer(this);
2334 return 0;
2335 }
2336
2337 int OSD::set_numa_affinity()
2338 {
2339 // storage numa node
2340 int store_node = -1;
2341 store->get_numa_node(&store_node, nullptr, nullptr);
2342 if (store_node >= 0) {
2343 dout(1) << __func__ << " storage numa node " << store_node << dendl;
2344 }
2345
2346 // check network numa node(s)
2347 int front_node = -1, back_node = -1;
2348 string front_iface = pick_iface(
2349 cct,
2350 client_messenger->get_myaddrs().front().get_sockaddr_storage());
2351 string back_iface = pick_iface(
2352 cct,
2353 cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
2354 int r = get_iface_numa_node(front_iface, &front_node);
2355 if (r >= 0 && front_node >= 0) {
2356 dout(1) << __func__ << " public network " << front_iface << " numa node "
2357 << front_node << dendl;
2358 r = get_iface_numa_node(back_iface, &back_node);
2359 if (r >= 0 && back_node >= 0) {
2360 dout(1) << __func__ << " cluster network " << back_iface << " numa node "
2361 << back_node << dendl;
2362 if (front_node == back_node &&
2363 front_node == store_node) {
2364 dout(1) << " objectstore and network numa nodes all match" << dendl;
2365 if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
2366 numa_node = front_node;
2367 }
2368 } else if (front_node != back_node) {
2369 dout(1) << __func__ << " public and cluster network numa nodes do not match"
2370 << dendl;
2371 } else {
2372 dout(1) << __func__ << " objectstore and network numa nodes do not match"
2373 << dendl;
2374 }
2375 } else if (back_node == -2) {
2376 dout(1) << __func__ << " cluster network " << back_iface
2377 << " ports numa nodes do not match" << dendl;
2378 } else {
2379 derr << __func__ << " unable to identify cluster interface '" << back_iface
2380 << "' numa node: " << cpp_strerror(r) << dendl;
2381 }
2382 } else if (front_node == -2) {
2383 dout(1) << __func__ << " public network " << front_iface
2384 << " ports numa nodes do not match" << dendl;
2385 } else {
2386 derr << __func__ << " unable to identify public interface '" << front_iface
2387 << "' numa node: " << cpp_strerror(r) << dendl;
2388 }
2389 if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
2390 // this takes precedence over the automagic logic above
2391 numa_node = node;
2392 }
2393 if (numa_node >= 0) {
2394 int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
2395 if (r < 0) {
2396 dout(1) << __func__ << " unable to determine numa node " << numa_node
2397 << " CPUs" << dendl;
2398 numa_node = -1;
2399 } else {
2400 dout(1) << __func__ << " setting numa affinity to node " << numa_node
2401 << " cpus "
2402 << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
2403 << dendl;
2404 r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set);
2405 if (r < 0) {
2406 r = -errno;
2407 derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
2408 << dendl;
2409 numa_node = -1;
2410 }
2411 }
2412 } else {
2413 dout(1) << __func__ << " not setting numa affinity" << dendl;
2414 }
2415 return 0;
2416 }
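// Outcome summary: numa_node is only chosen automatically when the
// objectstore, public, and cluster interfaces all resolve to the same
// node *and* osd_numa_auto_affinity is true; an explicit osd_numa_node
// always takes precedence. A failure to pin threads demotes numa_node
// back to -1 but is deliberately non-fatal: the function returns 0
// regardless.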
2417
2418 // asok
2419
2420 class OSDSocketHook : public AdminSocketHook {
2421 OSD *osd;
2422 public:
2423 explicit OSDSocketHook(OSD *o) : osd(o) {}
2424 int call(std::string_view prefix, const cmdmap_t& cmdmap,
2425 Formatter *f,
2426 std::ostream& ss,
2427 bufferlist& out) override {
2428 ceph_abort("should use async hook");
2429 }
2430 void call_async(
2431 std::string_view prefix,
2432 const cmdmap_t& cmdmap,
2433 Formatter *f,
2434 const bufferlist& inbl,
2435 std::function<void(int,const std::string&,bufferlist&)> on_finish) override {
2436 try {
2437 osd->asok_command(prefix, cmdmap, f, inbl, on_finish);
2438 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2439 bufferlist empty;
2440 on_finish(-EINVAL, e.what(), empty);
2441 }
2442 }
2443 };
2444
2445 std::set<int64_t> OSD::get_mapped_pools()
2446 {
2447 std::set<int64_t> pools;
2448 std::vector<spg_t> pgids;
2449 _get_pgids(&pgids);
2450 for (const auto &pgid : pgids) {
2451 pools.insert(pgid.pool());
2452 }
2453 return pools;
2454 }
2455
2456 OSD::PGRefOrError OSD::locate_asok_target(const cmdmap_t& cmdmap,
2457 stringstream& ss,
2458 bool only_primary)
2459 {
2460 string pgidstr;
2461 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
2462 ss << "no pgid specified";
2463 return OSD::PGRefOrError{std::nullopt, -EINVAL};
2464 }
2465
2466 pg_t pgid;
2467 if (!pgid.parse(pgidstr.c_str())) {
2468 ss << "couldn't parse pgid '" << pgidstr << "'";
2469 return OSD::PGRefOrError{std::nullopt, -EINVAL};
2470 }
2471
2472 spg_t pcand;
2473 PGRef pg;
2474 if (get_osdmap()->get_primary_shard(pgid, &pcand) && (pg = _lookup_lock_pg(pcand))) {
2475 if (pg->is_primary() || !only_primary) {
2476 return OSD::PGRefOrError{pg, 0};
2477 }
2478
2479 ss << "not primary for pgid " << pgid;
2480 pg->unlock();
2481 return OSD::PGRefOrError{std::nullopt, -EAGAIN};
2482 } else {
2483 ss << "i don't have pgid " << pgid;
2484 return OSD::PGRefOrError{std::nullopt, -ENOENT};
2485 }
2486 }
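// Locking contract: on success, locate_asok_target() returns the PG
// *locked*; the caller (asok_route_to_pg() below) must unlock it on
// every path, including when do_command() throws.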
2487
2488 // note that the cmdmap is explicitly copied into asok_route_to_pg()
2489 int OSD::asok_route_to_pg(
2490 bool only_primary,
2491 std::string_view prefix,
2492 cmdmap_t cmdmap,
2493 Formatter* f,
2494 stringstream& ss,
2495 const bufferlist& inbl,
2496 bufferlist& outbl,
2497 std::function<void(int, const std::string&, bufferlist&)> on_finish)
2498 {
2499 auto [target_pg, ret] = locate_asok_target(cmdmap, ss, only_primary);
2500
2501 if (!target_pg.has_value()) {
2502 // 'ss' and 'ret' already contain the error information
2503 on_finish(ret, ss.str(), outbl);
2504 return ret;
2505 }
2506
2507 // the PG was locked by locate_asok_target()
2508 try {
2509 (*target_pg)->do_command(prefix, cmdmap, inbl, on_finish);
2510 (*target_pg)->unlock();
2511 return 0; // the pg handler calls on_finish directly
2512 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2513 (*target_pg)->unlock();
2514 ss << e.what();
2515 on_finish(-EINVAL, ss.str(), outbl);  // report the same error we return
2516 return -EINVAL;
2517 }
2518 }
2519
2520 void OSD::asok_command(
2521 std::string_view prefix, const cmdmap_t& cmdmap,
2522 Formatter *f,
2523 const bufferlist& inbl,
2524 std::function<void(int,const std::string&,bufferlist&)> on_finish)
2525 {
2526 int ret = 0;
2527 stringstream ss; // stderr error message stream
2528 bufferlist outbl; // if empty at end, we'll dump formatter as output
2529
2530 // --- PG commands are routed here to PG::do_command ---
2531 if (prefix == "pg" ||
2532 prefix == "query" ||
2533 prefix == "mark_unfound_lost" ||
2534 prefix == "list_unfound" ||
2535 prefix == "scrub" ||
2536 prefix == "deep_scrub"
2537 ) {
2538 string pgidstr;
2539 pg_t pgid;
2540 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
2541 ss << "no pgid specified";
2542 ret = -EINVAL;
2543 goto out;
2544 }
2545 if (!pgid.parse(pgidstr.c_str())) {
2546 ss << "couldn't parse pgid '" << pgidstr << "'";
2547 ret = -EINVAL;
2548 goto out;
2549 }
2550 spg_t pcand;
2551 PGRef pg;
2552 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
2553 (pg = _lookup_lock_pg(pcand))) {
2554 if (pg->is_primary()) {
2555 cmdmap_t new_cmdmap = cmdmap;
2556 try {
2557 pg->do_command(prefix, new_cmdmap, inbl, on_finish);
2558 pg->unlock();
2559 return; // the pg handler calls on_finish directly
2560 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2561 pg->unlock();
2562 ss << e.what();
2563 ret = -EINVAL;
2564 goto out;
2565 }
2566 } else {
2567 ss << "not primary for pgid " << pgid;
2568 // do not reply; they will get newer maps and realize they
2569 // need to resend.
2570 pg->unlock();
2571 ret = -EAGAIN;
2572 goto out;
2573 }
2574 } else {
2575 ss << "i don't have pgid " << pgid;
2576 ret = -ENOENT;
2577 }
2578 }
2579
2580 // --- PG commands that will be answered even if !primary ---
2581
2582 else if (prefix == "scrubdebug") {
2583 asok_route_to_pg(false, prefix, cmdmap, f, ss, inbl, outbl, on_finish);
2584 return;
2585 }
2586
2587 // --- OSD commands follow ---
2588
2589 else if (prefix == "status") {
2590 lock_guard l(osd_lock);
2591 f->open_object_section("status");
2592 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2593 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2594 f->dump_unsigned("whoami", superblock.whoami);
2595 f->dump_string("state", get_state_name(get_state()));
2596 f->dump_unsigned("oldest_map", superblock.oldest_map);
2597 f->dump_unsigned("newest_map", superblock.newest_map);
2598 f->dump_unsigned("num_pgs", num_pgs);
2599 f->close_section();
2600 } else if (prefix == "flush_journal") {
2601 store->flush_journal();
2602 } else if (prefix == "dump_ops_in_flight" ||
2603 prefix == "ops" ||
2604 prefix == "dump_blocked_ops" ||
2605 prefix == "dump_historic_ops" ||
2606 prefix == "dump_historic_ops_by_duration" ||
2607 prefix == "dump_historic_slow_ops") {
2608
2609 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2610 not even those that get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2611 will start to track new ops received afterwards.";
2612
2613 set<string> filters;
2614 vector<string> filter_str;
2615 if (cmd_getval(cmdmap, "filterstr", filter_str)) {
2616 copy(filter_str.begin(), filter_str.end(),
2617 inserter(filters, filters.end()));
2618 }
2619
2620 if (prefix == "dump_ops_in_flight" ||
2621 prefix == "ops") {
2622 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2623 ss << error_str;
2624 ret = -EINVAL;
2625 goto out;
2626 }
2627 }
2628 if (prefix == "dump_blocked_ops") {
2629 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2630 ss << error_str;
2631 ret = -EINVAL;
2632 goto out;
2633 }
2634 }
2635 if (prefix == "dump_historic_ops") {
2636 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2637 ss << error_str;
2638 ret = -EINVAL;
2639 goto out;
2640 }
2641 }
2642 if (prefix == "dump_historic_ops_by_duration") {
2643 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2644 ss << error_str;
2645 ret = -EINVAL;
2646 goto out;
2647 }
2648 }
2649 if (prefix == "dump_historic_slow_ops") {
2650 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2651 ss << error_str;
2652 ret = -EINVAL;
2653 goto out;
2654 }
2655 }
2656 } else if (prefix == "dump_op_pq_state") {
2657 f->open_object_section("pq");
2658 op_shardedwq.dump(f);
2659 f->close_section();
2660 } else if (prefix == "dump_blocklist") {
2661 list<pair<entity_addr_t,utime_t> > bl;
2662 list<pair<entity_addr_t,utime_t> > rbl;
2663 OSDMapRef curmap = service.get_osdmap();
2664 curmap->get_blocklist(&bl, &rbl);
2665
2666 f->open_array_section("blocklist");
2667 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2668 it != bl.end(); ++it) {
2669 f->open_object_section("entry");
2670 f->open_object_section("entity_addr_t");
2671 it->first.dump(f);
2672 f->close_section(); //entity_addr_t
2673 it->second.localtime(f->dump_stream("expire_time"));
2674 f->close_section(); //entry
2675 }
2676 f->close_section(); //blocklist
2677 f->open_array_section("range_blocklist");
2678 for (list<pair<entity_addr_t,utime_t> >::iterator it = rbl.begin();
2679 it != rbl.end(); ++it) {
2680 f->open_object_section("entry");
2681 f->open_object_section("entity_addr_t");
2682 it->first.dump(f);
2683 f->close_section(); //entity_addr_t
2684 it->second.localtime(f->dump_stream("expire_time"));
2685 f->close_section(); //entry
2686 }
2687 f->close_section(); //range_blocklist
2688 } else if (prefix == "dump_watchers") {
2689 list<obj_watch_item_t> watchers;
2690 // scan pg's
2691 vector<PGRef> pgs;
2692 _get_pgs(&pgs);
2693 for (auto& pg : pgs) {
2694 list<obj_watch_item_t> pg_watchers;
2695 pg->get_watchers(&pg_watchers);
2696 watchers.splice(watchers.end(), pg_watchers);
2697 }
2698
2699 f->open_array_section("watchers");
2700 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2701 it != watchers.end(); ++it) {
2702
2703 f->open_object_section("watch");
2704
2705 f->dump_string("namespace", it->obj.nspace);
2706 f->dump_string("object", it->obj.oid.name);
2707
2708 f->open_object_section("entity_name");
2709 it->wi.name.dump(f);
2710 f->close_section(); //entity_name_t
2711
2712 f->dump_unsigned("cookie", it->wi.cookie);
2713 f->dump_unsigned("timeout", it->wi.timeout_seconds);
2714
2715 f->open_object_section("entity_addr_t");
2716 it->wi.addr.dump(f);
2717 f->close_section(); //entity_addr_t
2718
2719 f->close_section(); //watch
2720 }
2721
2722 f->close_section(); //watchers
2723 } else if (prefix == "dump_recovery_reservations") {
2724 f->open_object_section("reservations");
2725 f->open_object_section("local_reservations");
2726 service.local_reserver.dump(f);
2727 f->close_section();
2728 f->open_object_section("remote_reservations");
2729 service.remote_reserver.dump(f);
2730 f->close_section();
2731 f->close_section();
2732 } else if (prefix == "dump_scrub_reservations") {
2733 f->open_object_section("scrub_reservations");
2734 service.get_scrub_services().dump_scrub_reservations(f);
2735 f->close_section();
2736 } else if (prefix == "get_latest_osdmap") {
2737 get_latest_osdmap();
2738 } else if (prefix == "set_heap_property") {
2739 string property;
2740 int64_t value = 0;
2741 string error;
2742 bool success = false;
2743 if (!cmd_getval(cmdmap, "property", property)) {
2744 error = "unable to get property";
2745 success = false;
2746 } else if (!cmd_getval(cmdmap, "value", value)) {
2747 error = "unable to get value";
2748 success = false;
2749 } else if (value < 0) {
2750 error = "negative value not allowed";
2751 success = false;
2752 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2753 error = "invalid property";
2754 success = false;
2755 } else {
2756 success = true;
2757 }
2758 f->open_object_section("result");
2759 f->dump_string("error", error);
2760 f->dump_bool("success", success);
2761 f->close_section();
2762 } else if (prefix == "get_heap_property") {
2763 string property;
2764 size_t value = 0;
2765 string error;
2766 bool success = false;
2767 if (!cmd_getval(cmdmap, "property", property)) {
2768 error = "unable to get property";
2769 success = false;
2770 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2771 error = "invalid property";
2772 success = false;
2773 } else {
2774 success = true;
2775 }
2776 f->open_object_section("result");
2777 f->dump_string("error", error);
2778 f->dump_bool("success", success);
2779 f->dump_int("value", value);
2780 f->close_section();
2781 } else if (prefix == "dump_objectstore_kv_stats") {
2782 store->get_db_statistics(f);
2783 } else if (prefix == "dump_scrubs") {
2784 service.get_scrub_services().dump_scrubs(f);
2785 } else if (prefix == "calc_objectstore_db_histogram") {
2786 store->generate_db_histogram(f);
2787 } else if (prefix == "flush_store_cache") {
2788 store->flush_cache(&ss);
2789 } else if (prefix == "dump_pgstate_history") {
2790 f->open_object_section("pgstate_history");
2791 f->open_array_section("pgs");
2792 vector<PGRef> pgs;
2793 _get_pgs(&pgs);
2794 for (auto& pg : pgs) {
2795 f->open_object_section("pg");
2796 f->dump_stream("pg") << pg->pg_id;
2797 f->dump_string("currently", pg->get_current_state());
2798 pg->dump_pgstate_history(f);
2799 f->close_section();
2800 }
2801 f->close_section();
2802 f->close_section();
2803 } else if (prefix == "compact") {
2804 dout(1) << "triggering manual compaction" << dendl;
2805 auto start = ceph::coarse_mono_clock::now();
2806 store->compact();
2807 auto end = ceph::coarse_mono_clock::now();
2808 double duration = std::chrono::duration<double>(end-start).count();
2809 dout(1) << "finished manual compaction in "
2810 << duration
2811 << " seconds" << dendl;
2812 f->open_object_section("compact_result");
2813 f->dump_float("elapsed_time", duration);
2814 f->close_section();
2815 } else if (prefix == "get_mapped_pools") {
2816 f->open_array_section("mapped_pools");
2817 set<int64_t> poollist = get_mapped_pools();
2818 for (auto pool : poollist) {
2819 f->dump_int("pool_id", pool);
2820 }
2821 f->close_section();
2822 } else if (prefix == "smart") {
2823 string devid;
2824 cmd_getval(cmdmap, "devid", devid);
2825 ostringstream out;
2826 probe_smart(devid, out);
2827 outbl.append(out.str());
2828 } else if (prefix == "list_devices") {
2829 set<string> devnames;
2830 store->get_devices(&devnames);
2831 f->open_array_section("list_devices");
2832 for (auto dev : devnames) {
2833 if (dev.find("dm-") == 0) {
2834 continue;
2835 }
2836 string err;
2837 f->open_object_section("device");
2838 f->dump_string("device", "/dev/" + dev);
2839 f->dump_string("device_id", get_device_id(dev, &err));
2840 f->close_section();
2841 }
2842 f->close_section();
2843 } else if (prefix == "send_beacon") {
2844 lock_guard l(osd_lock);
2845 if (is_active()) {
2846 send_beacon(ceph::coarse_mono_clock::now());
2847 }
2848 }
2849
2850 else if (prefix == "cluster_log") {
2851 vector<string> msg;
2852 cmd_getval(cmdmap, "message", msg);
2853 if (msg.empty()) {
2854 ret = -EINVAL;
2855 ss << "ignoring empty log message";
2856 goto out;
2857 }
2858 string message = msg.front();
2859 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
2860 message += " " + *a;
2861 string lvl;
2862 cmd_getval(cmdmap, "level", lvl);
2863 clog_type level = string_to_clog_type(lvl);
2864 if (level < 0) {
2865 ret = -EINVAL;
2866 ss << "unknown level '" << lvl << "'";
2867 goto out;
2868 }
2869 clog->do_log(level, message);
2870 }
2871
2872 else if (prefix == "bench") {
2873 // default count 1G, size 4MB
2874 int64_t count = cmd_getval_or<int64_t>(cmdmap, "count", 1LL << 30);
2875 int64_t bsize = cmd_getval_or<int64_t>(cmdmap, "size", 4LL << 20);
2876 int64_t osize = cmd_getval_or<int64_t>(cmdmap, "object_size", 0);
2877 int64_t onum = cmd_getval_or<int64_t>(cmdmap, "object_num", 0);
2878 double elapsed = 0.0;
2879
2880 ret = run_osd_bench_test(count, bsize, osize, onum, &elapsed, ss);
2881 if (ret != 0) {
2882 goto out;
2883 }
2884
2885 double rate = count / elapsed;
2886 double iops = rate / bsize;
2887 f->open_object_section("osd_bench_results");
2888 f->dump_int("bytes_written", count);
2889 f->dump_int("blocksize", bsize);
2890 f->dump_float("elapsed_sec", elapsed);
2891 f->dump_float("bytes_per_sec", rate);
2892 f->dump_float("iops", iops);
2893 f->close_section();
2894 }
2895
2896 else if (prefix == "flush_pg_stats") {
2897 mgrc.send_pgstats();
2898 f->dump_unsigned("stat_seq", service.get_osd_stat_seq());
2899 }
2900
2901 else if (prefix == "heap") {
2902 std::stringstream outss;
2903 ret = ceph::osd_cmds::heap(*cct, cmdmap, outss, ss);
2904 outbl.append(outss);
2905 }
2906
2907 else if (prefix == "debug dump_missing") {
2908 f->open_array_section("pgs");
2909 vector<PGRef> pgs;
2910 _get_pgs(&pgs);
2911 for (auto& pg : pgs) {
2912 string s = stringify(pg->pg_id);
2913 f->open_array_section(s.c_str());
2914 pg->lock();
2915 pg->dump_missing(f);
2916 pg->unlock();
2917 f->close_section();
2918 }
2919 f->close_section();
2920 }
2921
2922 else if (prefix == "debug kick_recovery_wq") {
2923 int64_t delay;
2924 cmd_getval(cmdmap, "delay", delay);
2925 ostringstream oss;
2926 oss << delay;
2927 ret = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
2928 if (ret != 0) {
2929 ss << "kick_recovery_wq: error setting "
2930 << "osd_recovery_delay_start to '" << delay << "': error "
2931 << ret;
2932 goto out;
2933 }
2934 cct->_conf.apply_changes(nullptr);
2935 ss << "kicking recovery queue. set osd_recovery_delay_start "
2936 << "to " << cct->_conf->osd_recovery_delay_start;
2937 }
2938
2939 else if (prefix == "cpu_profiler") {
2940 ostringstream ds;
2941 string arg;
2942 cmd_getval(cmdmap, "arg", arg);
2943 vector<string> argvec;
2944 get_str_vec(arg, argvec);
2945 cpu_profiler_handle_command(argvec, ds);
2946 outbl.append(ds.str());
2947 }
2948
2949 else if (prefix == "dump_pg_recovery_stats") {
2950 lock_guard l(osd_lock);
2951 pg_recovery_stats.dump_formatted(f);
2952 }
2953
2954 else if (prefix == "reset_pg_recovery_stats") {
2955 lock_guard l(osd_lock);
2956 pg_recovery_stats.reset();
2957 }
2958
2959 else if (prefix == "perf histogram dump") {
2960 std::string logger;
2961 std::string counter;
2962 cmd_getval(cmdmap, "logger", logger);
2963 cmd_getval(cmdmap, "counter", counter);
2964 cct->get_perfcounters_collection()->dump_formatted_histograms(
2965 f, false, logger, counter);
2966 }
2967
2968 else if (prefix == "cache drop") {
2969 lock_guard l(osd_lock);
2970 dout(20) << "clearing all caches" << dendl;
2971 // Clear the objectstore's cache - onode and buffer for Bluestore,
2972 // system's pagecache for Filestore
2973 ret = store->flush_cache(&ss);
2974 if (ret < 0) {
2975 ss << "Error flushing objectstore cache: " << cpp_strerror(ret);
2976 goto out;
2977 }
2978 // Clear the objectcontext cache (per PG)
2979 vector<PGRef> pgs;
2980 _get_pgs(&pgs);
2981 for (auto& pg: pgs) {
2982 pg->clear_cache();
2983 }
2984 }
2985
2986 else if (prefix == "cache status") {
2987 lock_guard l(osd_lock);
2988 int obj_ctx_count = 0;
2989 vector<PGRef> pgs;
2990 _get_pgs(&pgs);
2991 for (auto& pg: pgs) {
2992 obj_ctx_count += pg->get_cache_obj_count();
2993 }
2994 f->open_object_section("cache_status");
2995 f->dump_int("object_ctx", obj_ctx_count);
2996 store->dump_cache_stats(f);
2997 f->close_section();
2998 }
2999
3000 else if (prefix == "scrub_purged_snaps") {
3001 lock_guard l(osd_lock);
3002 scrub_purged_snaps();
3003 }
3004
3005 else if (prefix == "dump_osd_network") {
3006 lock_guard l(osd_lock);
3007 int64_t value = 0;
3008 if (!(cmd_getval(cmdmap, "value", value))) {
3009 // Convert milliseconds to microseconds
3010 value = static_cast<double>(g_conf().get_val<double>(
3011 "mon_warn_on_slow_ping_time")) * 1000;
3012 if (value == 0) {
3013 double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
3014 value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
3015 value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
3016 }
3017 } else {
3018 // Convert user input to microseconds
3019 value *= 1000;
3020 }
3021 if (value < 0) value = 0;
3022
3023 struct osd_ping_time_t {
3024 uint32_t pingtime;
3025 int to;
3026 bool back;
3027 std::array<uint32_t,3> times;
3028 std::array<uint32_t,3> min;
3029 std::array<uint32_t,3> max;
3030 uint32_t last;
3031 uint32_t last_update;
3032
3033 bool operator<(const osd_ping_time_t& rhs) const {
3034 if (pingtime < rhs.pingtime)
3035 return true;
3036 if (pingtime > rhs.pingtime)
3037 return false;
3038 if (to < rhs.to)
3039 return true;
3040 if (to > rhs.to)
3041 return false;
3042 return back;
3043 }
3044 };
3045
3046 set<osd_ping_time_t> sorted;
3047 // Get pingtimes under lock and not on the stack
3048 map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
3049 service.get_hb_pingtime(pingtimes);
3050 for (auto j : *pingtimes) {
3051 if (j.second.last_update == 0)
3052 continue;
3053 osd_ping_time_t item;
3054 item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
3055 item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
3056 if (item.pingtime >= value) {
3057 item.to = j.first;
3058 item.times[0] = j.second.back_pingtime[0];
3059 item.times[1] = j.second.back_pingtime[1];
3060 item.times[2] = j.second.back_pingtime[2];
3061 item.min[0] = j.second.back_min[0];
3062 item.min[1] = j.second.back_min[1];
3063 item.min[2] = j.second.back_min[2];
3064 item.max[0] = j.second.back_max[0];
3065 item.max[1] = j.second.back_max[1];
3066 item.max[2] = j.second.back_max[2];
3067 item.last = j.second.back_last;
3068 item.back = true;
3069 item.last_update = j.second.last_update;
3070 sorted.emplace(item);
3071 }
3072 if (j.second.front_last == 0)
3073 continue;
3074 item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
3075 item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
3076 if (item.pingtime >= value) {
3077 item.to = j.first;
3078 item.times[0] = j.second.front_pingtime[0];
3079 item.times[1] = j.second.front_pingtime[1];
3080 item.times[2] = j.second.front_pingtime[2];
3081 item.min[0] = j.second.front_min[0];
3082 item.min[1] = j.second.front_min[1];
3083 item.min[2] = j.second.front_min[2];
3084 item.max[0] = j.second.front_max[0];
3085 item.max[1] = j.second.front_max[1];
3086 item.max[2] = j.second.front_max[2];
3087 item.last = j.second.front_last;
3088 item.last_update = j.second.last_update;
3089 item.back = false;
3090 sorted.emplace(item);
3091 }
3092 }
3093 delete pingtimes;
3095 // Network ping times (1min 5min 15min)
3096 f->open_object_section("network_ping_times");
3097 f->dump_int("threshold", value / 1000);
3098 f->open_array_section("entries");
3099 for (auto &sitem : boost::adaptors::reverse(sorted)) {
3100 ceph_assert(sitem.pingtime >= value);
3101 f->open_object_section("entry");
3102
3103 const time_t lu(sitem.last_update);
3104 char buffer[26];
3105 string lustr(ctime_r(&lu, buffer));
3106 lustr.pop_back(); // Remove trailing \n
3107 auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3108 f->dump_string("last update", lustr);
3109 f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale);
3110 f->dump_int("from osd", whoami);
3111 f->dump_int("to osd", sitem.to);
3112 f->dump_string("interface", (sitem.back ? "back" : "front"));
3113 f->open_object_section("average");
3114 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
3115 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
3116 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
3117 f->close_section(); // average
3118 f->open_object_section("min");
3119 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.min[0],3).c_str());
3120 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.min[1],3).c_str());
3121 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.min[2],3).c_str());
3122 f->close_section(); // min
3123 f->open_object_section("max");
3124 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3125 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3126 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3127 f->close_section(); // max
3128 f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
3129 f->close_section(); // entry
3130 }
3131 f->close_section(); // entries
3132 f->close_section(); // network_ping_times
3133 } else if (prefix == "dump_pool_statfs") {
3134 lock_guard l(osd_lock);
3135
3136 int64_t p = 0;
3137 if (!(cmd_getval(cmdmap, "poolid", p))) {
3138 ss << "Error dumping pool statfs: no poolid provided";
3139 ret = -EINVAL;
3140 goto out;
3141 }
3142
3143 store_statfs_t st;
3144 bool per_pool_omap_stats = false;
3145
3146 ret = store->pool_statfs(p, &st, &per_pool_omap_stats);
3147 if (ret < 0) {
3148 ss << "Error dumping pool statfs: " << cpp_strerror(ret);
3149 goto out;
3150 } else {
3151 ss << "dumping pool statfs...";
3152 f->open_object_section("pool_statfs");
3153 f->dump_int("poolid", p);
3154 st.dump(f);
3155 f->close_section();
3156 }
3157 } else {
3158 ceph_abort_msg("broken asok registration");
3159 }
3160
3161 out:
3162 on_finish(ret, ss.str(), outbl);
3163 }
3164
3165 int OSD::run_osd_bench_test(
3166 int64_t count,
3167 int64_t bsize,
3168 int64_t osize,
3169 int64_t onum,
3170 double *elapsed,
3171 ostream &ss)
3172 {
3173 int ret = 0;
3174 uint32_t duration = cct->_conf->osd_bench_duration;
3175
3176 if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
3177 // let us limit the block size because the next checks rely on it
3178 // having a sane value. If we allow any block size to be set things
3179 // can still go sideways.
3180 ss << "block 'size' values are capped at "
3181 << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
3182 << " a higher value, please adjust 'osd_bench_max_block_size'";
3183 ret = -EINVAL;
3184 return ret;
3185 } else if (bsize < (int64_t) (1 << 20)) {
3186 // entering the realm of small block sizes.
3187 // limit the count to a sane value, assuming a configurable amount of
3188 // IOPS and duration, so that the OSD doesn't get hung up on this,
3189 // preventing timeouts from going off
3190 int64_t max_count =
3191 bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
3192 if (count > max_count) {
3193 ss << "'count' values greater than " << max_count
3194 << " for a block size of " << byte_u_t(bsize) << ", assuming "
3195 << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
3196 << " for " << duration << " seconds,"
3197 << " can cause ill effects on osd. "
3198 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
3199 << " value if you wish to use a higher 'count'.";
3200 ret = -EINVAL;
3201 return ret;
3202 }
3203 } else {
3204 // 1MB block sizes are big enough so that we get more stuff done.
3205 // However, to avoid the osd from getting hung on this and having
3206 // timers being triggered, we are going to limit the count assuming
3207 // a configurable throughput and duration.
3208 // NOTE: max_count is the total amount of bytes that we believe we
3209 // will be able to write during 'duration' for the given
3210 // throughput. The block size hardly impacts this unless it's
3211 // way too big. Given we already check how big the block size
3212 // is, it's safe to assume everything will check out.
3213 int64_t max_count =
3214 cct->_conf->osd_bench_large_size_max_throughput * duration;
3215 if (count > max_count) {
3216 ss << "'count' values greater than " << max_count
3217 << " for a block size of " << byte_u_t(bsize) << ", assuming "
3218 << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
3219 << " for " << duration << " seconds,"
3220 << " can cause ill effects on osd. "
3221 << " Please adjust 'osd_bench_large_size_max_throughput'"
3222 << " with a higher value if you wish to use a higher 'count'.";
3223 ret = -EINVAL;
3224 return ret;
3225 }
3226 }
3227
3228 if (osize && bsize > osize) {
3229 bsize = osize;
3230 }
3231
3232 dout(1) << " bench count " << count
3233 << " bsize " << byte_u_t(bsize) << dendl;
3234
3235 ObjectStore::Transaction cleanupt;
3236
3237 if (osize && onum) {
3238 bufferlist bl;
3239 bufferptr bp(osize);
3240 memset(bp.c_str(), 'a', bp.length());
3241 bl.push_back(std::move(bp));
3242 bl.rebuild_page_aligned();
3243 for (int i=0; i<onum; ++i) {
3244 char nm[30];
3245 snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
3246 object_t oid(nm);
3247 hobject_t soid(sobject_t(oid, 0));
3248 ObjectStore::Transaction t;
3249 t.write(coll_t::meta(), ghobject_t(soid), 0, osize, bl);
3250 store->queue_transaction(service.meta_ch, std::move(t), nullptr);
3251 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
3252 }
3253 }
3254
3255 bufferlist bl;
3256 bufferptr bp(bsize);
3257 memset(bp.c_str(), 'a', bp.length());
3258 bl.push_back(std::move(bp));
3259 bl.rebuild_page_aligned();
3260
3261 {
3262 C_SaferCond waiter;
3263 if (!service.meta_ch->flush_commit(&waiter)) {
3264 waiter.wait();
3265 }
3266 }
3267
3268 utime_t start = ceph_clock_now();
3269 for (int64_t pos = 0; pos < count; pos += bsize) {
3270 char nm[30];
3271 unsigned offset = 0;
3272 if (onum && osize) {
3273 snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
3274 offset = rand() % (osize / bsize) * bsize;
3275 } else {
3276 snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
3277 }
3278 object_t oid(nm);
3279 hobject_t soid(sobject_t(oid, 0));
3280 ObjectStore::Transaction t;
3281 t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
3282 store->queue_transaction(service.meta_ch, std::move(t), nullptr);
3283 if (!onum || !osize) {
3284 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
3285 }
3286 }
3287
3288 {
3289 C_SaferCond waiter;
3290 if (!service.meta_ch->flush_commit(&waiter)) {
3291 waiter.wait();
3292 }
3293 }
3294 utime_t end = ceph_clock_now();
3295 *elapsed = end - start;
3296
3297 // clean up
3298 store->queue_transaction(service.meta_ch, std::move(cleanupt), nullptr);
3299 {
3300 C_SaferCond waiter;
3301 if (!service.meta_ch->flush_commit(&waiter)) {
3302 waiter.wait();
3303 }
3304 }
3305
3306 return ret;
3307 }
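// From the elapsed time returned here, the asok "bench" handler above
// reports:
//   bytes_per_sec = count / elapsed;   iops = bytes_per_sec / bsize.
// E.g. writing count = 1 GiB in bsize = 4 MiB blocks over 8 s yields
// ~128 MiB/s and ~32 IOPS. The test is typically driven via
//   ceph tell osd.<id> bench [count] [size] [object_size] [object_num]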
3308
3309 class TestOpsSocketHook : public AdminSocketHook {
3310 OSDService *service;
3311 ObjectStore *store;
3312 public:
3313 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
3314 int call(std::string_view command, const cmdmap_t& cmdmap,
3315 Formatter *f,
3316 std::ostream& errss,
3317 bufferlist& out) override {
3318 int r = 0;
3319 stringstream outss;
3320 try {
3321 test_ops(service, store, command, cmdmap, outss);
3322 out.append(outss);
3323 } catch (const TOPNSPC::common::bad_cmd_get& e) {
3324 errss << e.what();
3325 r = -EINVAL;
3326 }
3327 return r;
3328 }
3329 void test_ops(OSDService *service, ObjectStore *store,
3330 std::string_view command, const cmdmap_t& cmdmap, ostream &ss);
3331
3332 };
3333
3334 class OSD::C_Tick : public Context {
3335 OSD *osd;
3336 public:
3337 explicit C_Tick(OSD *o) : osd(o) {}
3338 void finish(int r) override {
3339 osd->tick();
3340 }
3341 };
3342
3343 class OSD::C_Tick_WithoutOSDLock : public Context {
3344 OSD *osd;
3345 public:
3346 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
3347 void finish(int r) override {
3348 osd->tick_without_osd_lock();
3349 }
3350 };
3351
3352 int OSD::enable_disable_fuse(bool stop)
3353 {
3354 #ifdef HAVE_LIBFUSE
3355 int r;
3356 string mntpath = cct->_conf->osd_data + "/fuse";
3357 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
3358 dout(1) << __func__ << " disabling" << dendl;
3359 fuse_store->stop();
3360 delete fuse_store;
3361 fuse_store = NULL;
3362 r = ::rmdir(mntpath.c_str());
3363 if (r < 0) {
3364 r = -errno;
3365 derr << __func__ << " failed to rmdir " << mntpath << ": "
3366 << cpp_strerror(r) << dendl;
3367 return r;
3368 }
3369 return 0;
3370 }
3371 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
3372 dout(1) << __func__ << " enabling" << dendl;
3373 r = ::mkdir(mntpath.c_str(), 0700);
3374 if (r < 0)
3375 r = -errno;
3376 if (r < 0 && r != -EEXIST) {
3377 derr << __func__ << " unable to create " << mntpath << ": "
3378 << cpp_strerror(r) << dendl;
3379 return r;
3380 }
3381 fuse_store = new FuseStore(store.get(), mntpath);
3382 r = fuse_store->start();
3383 if (r < 0) {
3384 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
3385 delete fuse_store;
3386 fuse_store = NULL;
3387 return r;
3388 }
3389 }
3390 #endif // HAVE_LIBFUSE
3391 return 0;
3392 }
3393
3394 size_t OSD::get_num_cache_shards()
3395 {
3396 return cct->_conf.get_val<Option::size_t>("osd_num_cache_shards");
3397 }
3398
3399 int OSD::get_num_op_shards()
3400 {
3401 if (cct->_conf->osd_op_num_shards)
3402 return cct->_conf->osd_op_num_shards;
3403 if (store_is_rotational)
3404 return cct->_conf->osd_op_num_shards_hdd;
3405 else
3406 return cct->_conf->osd_op_num_shards_ssd;
3407 }
3408
3409 int OSD::get_num_op_threads()
3410 {
3411 if (cct->_conf->osd_op_num_threads_per_shard)
3412 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
3413 if (store_is_rotational)
3414 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
3415 else
3416 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
3417 }
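// Worked example (using the shipped defaults, which may differ between
// releases): a rotational store with osd_op_num_shards_hdd = 5 and
// osd_op_num_threads_per_shard_hdd = 1 yields a 5-thread tp_osd_tp pool,
// while an SSD store with 8 shards x 2 threads per shard yields 16.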
3418
3419 float OSD::get_osd_recovery_sleep()
3420 {
3421 if (cct->_conf->osd_recovery_sleep)
3422 return cct->_conf->osd_recovery_sleep;
3423 if (!store_is_rotational && !journal_is_rotational)
3424 return cct->_conf->osd_recovery_sleep_ssd;
3425 else if (store_is_rotational && !journal_is_rotational)
3426 return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
3427 else
3428 return cct->_conf->osd_recovery_sleep_hdd;
3429 }
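// This getter, get_osd_delete_sleep(), and get_osd_snap_trim_sleep()
// below share one selection rule: an explicitly set non-zero base option
// wins; otherwise the "_ssd" variant applies when both the data store
// and the journal/DB are non-rotational, "_hybrid" when a rotational
// store is fronted by a non-rotational journal, and "_hdd" otherwise.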
3430
3431 float OSD::get_osd_delete_sleep()
3432 {
3433 float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
3434 if (osd_delete_sleep > 0)
3435 return osd_delete_sleep;
3436 if (!store_is_rotational && !journal_is_rotational)
3437 return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
3438 if (store_is_rotational && !journal_is_rotational)
3439 return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
3440 return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
3441 }
3442
3443 int OSD::get_recovery_max_active()
3444 {
3445 if (cct->_conf->osd_recovery_max_active)
3446 return cct->_conf->osd_recovery_max_active;
3447 if (store_is_rotational)
3448 return cct->_conf->osd_recovery_max_active_hdd;
3449 else
3450 return cct->_conf->osd_recovery_max_active_ssd;
3451 }
3452
3453 float OSD::get_osd_snap_trim_sleep()
3454 {
3455 float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep");
3456 if (osd_snap_trim_sleep > 0)
3457 return osd_snap_trim_sleep;
3458 if (!store_is_rotational && !journal_is_rotational)
3459 return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd");
3460 if (store_is_rotational && !journal_is_rotational)
3461 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid");
3462 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd");
3463 }
3464
3465 int OSD::init()
3466 {
3467 OSDMapRef osdmap;
3468 CompatSet initial, diff;
3469 std::lock_guard lock(osd_lock);
3470 if (is_stopping())
3471 return 0;
3472 tracing::osd::tracer.init("osd");
3473 tick_timer.init();
3474 tick_timer_without_osd_lock.init();
3475 service.recovery_request_timer.init();
3476 service.sleep_timer.init();
3477
3478 boot_finisher.start();
3479
3480 {
3481 string val;
3482 store->read_meta("require_osd_release", &val);
3483 last_require_osd_release = ceph_release_from_name(val);
3484 }
3485
3486 // mount.
3487 dout(2) << "init " << dev_path
3488 << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
3489 << dendl;
3490 dout(2) << "journal " << journal_path << dendl;
3491 ceph_assert(store); // call pre_init() first!
3492
3493 store->set_cache_shards(get_num_cache_shards());
3494
3495 int rotating_auth_attempts = 0;
3496 auto rotating_auth_timeout =
3497 g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");
3498
3499 int r = store->mount();
3500 if (r < 0) {
3501 derr << "OSD:init: unable to mount object store" << dendl;
3502 return r;
3503 }
3504 journal_is_rotational = store->is_journal_rotational();
3505 dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
3506 << dendl;
3507
3508 enable_disable_fuse(false);
3509
3510 dout(2) << "boot" << dendl;
3511
3512 service.meta_ch = store->open_collection(coll_t::meta());
3513 if (!service.meta_ch) {
3514 derr << "OSD:init: unable to open meta collection"
3515 << dendl;
3516 r = -ENOENT;
3517 goto out;
3518 }
3519 // initialize the daily loadavg with current 15min loadavg
3520 double loadavgs[3];
3521 if (getloadavg(loadavgs, 3) == 3) {
3522 daily_loadavg = loadavgs[2];
3523 } else {
3524 derr << "OSD::init() : couldn't read loadavgs" << dendl;
3525 daily_loadavg = 1.0;
3526 }
3527
3528 // sanity check long object name handling
3529 {
3530 hobject_t l;
3531 l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
3532 l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
3533 l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
3534 r = store->validate_hobject_key(l);
3535 if (r < 0) {
3536 derr << "backend (" << store->get_type() << ") is unable to support max "
3537 << "object name[space] len" << dendl;
3538 derr << " osd max object name len = "
3539 << cct->_conf->osd_max_object_name_len << dendl;
3540 derr << " osd max object namespace len = "
3541 << cct->_conf->osd_max_object_namespace_len << dendl;
3542 derr << cpp_strerror(r) << dendl;
3543 if (cct->_conf->osd_check_max_object_name_len_on_startup) {
3544 goto out;
3545 }
3546 derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
3547 << dendl;
3548 } else {
3549 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
3550 }
3551 }
3552
3553 // read superblock
3554 r = read_superblock();
3555 if (r < 0) {
3556 derr << "OSD::init() : unable to read osd superblock" << dendl;
3557 r = -EINVAL;
3558 goto out;
3559 }
3560
3561 if (osd_compat.compare(superblock.compat_features) < 0) {
3562 derr << "The disk uses features unsupported by the executable." << dendl;
3563 derr << " ondisk features " << superblock.compat_features << dendl;
3564 derr << " daemon features " << osd_compat << dendl;
3565
3566 if (osd_compat.writeable(superblock.compat_features)) {
3567 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
3568 derr << "it is still writeable, though. Missing features: " << diff << dendl;
3569 r = -EOPNOTSUPP;
3570 goto out;
3571 }
3572 else {
3573 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
3574 derr << "Cannot write to disk! Missing features: " << diff << dendl;
3575 r = -EOPNOTSUPP;
3576 goto out;
3577 }
3578 }
3579
3580 assert_warn(whoami == superblock.whoami);
3581 if (whoami != superblock.whoami) {
3582 derr << "OSD::init: superblock says osd."
3583 << superblock.whoami << " but I am osd." << whoami << dendl;
3584 r = -EINVAL;
3585 goto out;
3586 }
3587
3588 startup_time = ceph::mono_clock::now();
3589
3590 // load up "current" osdmap
3591 assert_warn(!get_osdmap());
3592 if (get_osdmap()) {
3593 derr << "OSD::init: unable to read current osdmap" << dendl;
3594 r = -EINVAL;
3595 goto out;
3596 }
3597 osdmap = get_map(superblock.current_epoch);
3598 set_osdmap(osdmap);
3599
3600 // make sure we don't have legacy pgs whose deletion is still in progress
3601 {
3602 vector<coll_t> ls;
3603 int r = store->list_collections(ls);
3604 ceph_assert(r >= 0);
3605 for (auto c : ls) {
3606 spg_t pgid;
3607 if (c.is_pg(&pgid) &&
3608 !osdmap->have_pg_pool(pgid.pool())) {
3609 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
3610 if (!store->exists(service.meta_ch, oid)) {
3611 derr << __func__ << " missing pg_pool_t for deleted pool "
3612 << pgid.pool() << " for pg " << pgid
3613 << "; please downgrade to luminous and allow "
3614 << "pg deletion to complete before upgrading" << dendl;
3615 ceph_abort();
3616 }
3617 }
3618 }
3619 }
3620
3621 initial = get_osd_initial_compat_set();
3622 diff = superblock.compat_features.unsupported(initial);
3623 if (superblock.compat_features.merge(initial)) {
3624 // Are we adding SNAPMAPPER2?
3625 if (diff.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2)) {
3626 dout(1) << __func__ << " upgrade snap_mapper (first start as octopus)"
3627 << dendl;
3628 auto ch = service.meta_ch;
3629 auto hoid = make_snapmapper_oid();
3630 unsigned max = cct->_conf->osd_target_transaction_size;
3631 r = SnapMapper::convert_legacy(cct, store.get(), ch, hoid, max);
3632 if (r < 0)
3633 goto out;
3634 }
3635 // We need to persist the new compat_set before we
3636 // do anything else
3637 dout(5) << "Upgrading superblock adding: " << diff << dendl;
3638 ObjectStore::Transaction t;
3639 write_superblock(t);
3640 r = store->queue_transaction(service.meta_ch, std::move(t));
3641 if (r < 0)
3642 goto out;
3643 }
3644
3645 // make sure snap mapper object exists
3646 if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
3647 dout(10) << "init creating/touching snapmapper object" << dendl;
3648 ObjectStore::Transaction t;
3649 t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
3650 r = store->queue_transaction(service.meta_ch, std::move(t));
3651 if (r < 0)
3652 goto out;
3653 }
3654 if (!store->exists(service.meta_ch, OSD::make_purged_snaps_oid())) {
3655 dout(10) << "init creating/touching purged_snaps object" << dendl;
3656 ObjectStore::Transaction t;
3657 t.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
3658 r = store->queue_transaction(service.meta_ch, std::move(t));
3659 if (r < 0)
3660 goto out;
3661 }
3662
3663 if (cct->_conf->osd_open_classes_on_start) {
3664 int r = ClassHandler::get_instance().open_all_classes();
3665 if (r)
3666 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
3667 }
3668
3669 check_osdmap_features();
3670
3671 {
3672 epoch_t bind_epoch = osdmap->get_epoch();
3673 service.set_epochs(NULL, NULL, &bind_epoch);
3674 }
3675
3676 clear_temp_objects();
3677
3678 // initialize osdmap references in sharded wq
3679 for (auto& shard : shards) {
3680 std::lock_guard l(shard->osdmap_lock);
3681 shard->shard_osdmap = osdmap;
3682 }
3683
3684 // load up pgs (as they previously existed)
3685 load_pgs();
3686
3687 dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
3688
3689 if (cct->_conf.get_val<bool>("osd_compact_on_start")) {
3690 dout(2) << "compacting object store's omap" << dendl;
3691 store->compact();
3692 }
3693
3694 // prime osd stats
3695 {
3696 struct store_statfs_t stbuf;
3697 osd_alert_list_t alerts;
3698 int r = store->statfs(&stbuf, &alerts);
3699 ceph_assert(r == 0);
3700 service.set_statfs(stbuf, alerts);
3701 }
3702
3703 // client_messenger's auth_client will be set up by monc->init() later.
3704 for (auto m : { cluster_messenger,
3705 objecter_messenger,
3706 hb_front_client_messenger,
3707 hb_back_client_messenger,
3708 hb_front_server_messenger,
3709 hb_back_server_messenger } ) {
3710 m->set_auth_client(monc);
3711 }
3712 for (auto m : { client_messenger,
3713 cluster_messenger,
3714 hb_front_server_messenger,
3715 hb_back_server_messenger }) {
3716 m->set_auth_server(monc);
3717 }
3718 monc->set_handle_authentication_dispatcher(this);
3719
3720 monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
3721 | CEPH_ENTITY_TYPE_MGR);
3722 r = monc->init();
3723 if (r < 0)
3724 goto out;
3725
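// Wire up the mgr client: the mgr pulls pg stats through collect_pg_stats()
// and drives dynamic perf queries through the two callbacks below (one to
// install a query configuration, one to collect the resulting reports).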
3726 mgrc.set_pgstats_cb([this]() { return collect_pg_stats(); });
3727 mgrc.set_perf_metric_query_cb(
3728 [this](const ConfigPayload &config_payload) {
3729 set_perf_queries(config_payload);
3730 },
3731 [this] {
3732 return get_perf_reports();
3733 });
3734 mgrc.init();
3735
3736 // tell monc about log_client so it will know about mon session resets
3737 monc->set_log_client(&log_client);
3738 update_log_config();
3739
3740 // i'm ready!
3741 client_messenger->add_dispatcher_tail(&mgrc);
3742 client_messenger->add_dispatcher_tail(this);
3743 cluster_messenger->add_dispatcher_head(this);
3744
3745 hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3746 hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3747 hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3748 hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3749
3750 objecter_messenger->add_dispatcher_head(service.objecter.get());
3751
3752 service.init();
3753 service.publish_map(osdmap);
3754 service.publish_superblock(superblock);
3755 service.max_oldest_map = superblock.oldest_map;
3756
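// Each PG may have been persisted against an older osdmap; compare that map
// with the current one and prime the shards for any pending splits/merges so
// new work lands in the right pg_slot.  prime_splits()/prime_merges() consume
// the entries they accept, which is why the sets are asserted empty below.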
3757 for (auto& shard : shards) {
3758 // put PGs in a temporary set because we may modify pg_slots
3759 // unordered_map below.
3760 set<PGRef> pgs;
3761 for (auto& i : shard->pg_slots) {
3762 PGRef pg = i.second->pg;
3763 if (!pg) {
3764 continue;
3765 }
3766 pgs.insert(pg);
3767 }
3768 for (auto pg : pgs) {
3769 std::scoped_lock l{*pg};
3770 set<pair<spg_t,epoch_t>> new_children;
3771 set<pair<spg_t,epoch_t>> merge_pgs;
3772 service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
3773 &new_children, &merge_pgs);
3774 if (!new_children.empty()) {
3775 for (auto shard : shards) {
3776 shard->prime_splits(osdmap, &new_children);
3777 }
3778 ceph_assert(new_children.empty());
3779 }
3780 if (!merge_pgs.empty()) {
3781 for (auto shard : shards) {
3782 shard->prime_merges(osdmap, &merge_pgs);
3783 }
3784 ceph_assert(merge_pgs.empty());
3785 }
3786 }
3787 }
3788
3789 osd_op_tp.start();
3790
3791 // start the heartbeat
3792 heartbeat_thread.create("osd_srv_heartbt");
3793
3794 // tick
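// Two timers are armed: tick_timer runs C_Tick under osd_lock, while
// tick_timer_without_osd_lock (guarded by tick_timer_lock) runs
// C_Tick_WithoutOSDLock for periodic work that must not wait on osd_lock.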
3795 tick_timer.add_event_after(get_tick_interval(),
3796 new C_Tick(this));
3797 {
3798 std::lock_guard l(tick_timer_lock);
3799 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
3800 new C_Tick_WithoutOSDLock(this));
3801 }
3802
3803 osd_lock.unlock();
3804
3805 r = monc->authenticate();
3806 if (r < 0) {
3807 derr << __func__ << " authentication failed: " << cpp_strerror(r)
3808 << dendl;
3809 exit(1);
3810 }
3811
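// Block until we hold usable rotating service keys.  Each wait is bounded by
// rotating_keys_bootstrap_timeout (read above); after more than
// max_rotating_auth_attempts failed waits we give up and exit.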
3812 while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
3813 derr << "unable to obtain rotating service keys; retrying" << dendl;
3814 ++rotating_auth_attempts;
3815 if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
3816 derr << __func__ << " wait_auth_rotating timed out" << dendl;
3817 exit(1);
3818 }
3819 }
3820
3821 r = update_crush_device_class();
3822 if (r < 0) {
3823 derr << __func__ << " unable to update_crush_device_class: "
3824 << cpp_strerror(r) << dendl;
3825 exit(1);
3826 }
3827
3828 r = update_crush_location();
3829 if (r < 0) {
3830 derr << __func__ << " unable to update_crush_location: "
3831 << cpp_strerror(r) << dendl;
3832 exit(1);
3833 }
3834
3835 osd_lock.lock();
3836 if (is_stopping())
3837 return 0;
3838
3839 // start objecter *after* we have authenticated, so that we don't ignore
3840 // the OSDMaps it requests.
3841 service.final_init();
3842
3843 check_config();
3844
3845 dout(10) << "ensuring pgs have consumed prior maps" << dendl;
3846 consume_map();
3847
3848 dout(0) << "done with init, starting boot process" << dendl;
3849
3850 // subscribe to any pg creations
3851 monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
3852
3853 // MgrClient needs this (it doesn't have MonClient reference itself)
3854 monc->sub_want("mgrmap", 0, 0);
3855
3856 // we don't need to ask for an osdmap here; the objecter will subscribe for us
3857 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
3858
3859 monc->renew_subs();
3860
3861 start_boot();
3862
3863 // Override a few options if mclock scheduler is enabled.
3864 maybe_override_max_osd_capacity_for_qos();
3865 maybe_override_options_for_qos();
3866
3867 return 0;
3868
3869 out:
3870 enable_disable_fuse(true);
3871 store->umount();
3872 store.reset();
3873 return r;
3874 }
3875
3876 void OSD::final_init()
3877 {
3878 AdminSocket *admin_socket = cct->get_admin_socket();
3879 asok_hook = new OSDSocketHook(this);
3880 int r = admin_socket->register_command("status", asok_hook,
3881 "high-level status of OSD");
3882 ceph_assert(r == 0);
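// Every command registered here becomes available on the admin socket and can
// be invoked with, e.g., `ceph daemon osd.<id> status` (id illustrative).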
3883 r = admin_socket->register_command("flush_journal",
3884 asok_hook,
3885 "flush the journal to permanent store");
3886 ceph_assert(r == 0);
3887 r = admin_socket->register_command("dump_ops_in_flight " \
3888 "name=filterstr,type=CephString,n=N,req=false",
3889 asok_hook,
3890 "show the ops currently in flight");
3891 ceph_assert(r == 0);
3892 r = admin_socket->register_command("ops " \
3893 "name=filterstr,type=CephString,n=N,req=false",
3894 asok_hook,
3895 "show the ops currently in flight");
3896 ceph_assert(r == 0);
3897 r = admin_socket->register_command("dump_blocked_ops " \
3898 "name=filterstr,type=CephString,n=N,req=false",
3899 asok_hook,
3900 "show the blocked ops currently in flight");
3901 ceph_assert(r == 0);
3902 r = admin_socket->register_command("dump_historic_ops " \
3903 "name=filterstr,type=CephString,n=N,req=false",
3904 asok_hook,
3905 "show recent ops");
3906 ceph_assert(r == 0);
3907 r = admin_socket->register_command("dump_historic_slow_ops " \
3908 "name=filterstr,type=CephString,n=N,req=false",
3909 asok_hook,
3910 "show slowest recent ops");
3911 ceph_assert(r == 0);
3912 r = admin_socket->register_command("dump_historic_ops_by_duration " \
3913 "name=filterstr,type=CephString,n=N,req=false",
3914 asok_hook,
3915 "show slowest recent ops, sorted by duration");
3916 ceph_assert(r == 0);
3917 r = admin_socket->register_command("dump_op_pq_state",
3918 asok_hook,
3919 "dump op queue state");
3920 ceph_assert(r == 0);
3921 r = admin_socket->register_command("dump_blocklist",
3922 asok_hook,
3923 "dump blocklisted clients and times");
3924 ceph_assert(r == 0);
3925 r = admin_socket->register_command("dump_watchers",
3926 asok_hook,
3927 "show clients which have active watches,"
3928 " and on which objects");
3929 ceph_assert(r == 0);
3930 r = admin_socket->register_command("dump_recovery_reservations",
3931 asok_hook,
3932 "show recovery reservations");
3933 ceph_assert(r == 0);
3934 r = admin_socket->register_command("dump_scrub_reservations",
3935 asok_hook,
3936 "show scrub reservations");
3937 ceph_assert(r == 0);
3938 r = admin_socket->register_command("get_latest_osdmap",
3939 asok_hook,
3940 "force osd to update the latest map from "
3941 "the mon");
3942 ceph_assert(r == 0);
3943
3944 r = admin_socket->register_command("set_heap_property " \
3945 "name=property,type=CephString " \
3946 "name=value,type=CephInt",
3947 asok_hook,
3948 "update malloc extension heap property");
3949 ceph_assert(r == 0);
3950
3951 r = admin_socket->register_command("get_heap_property " \
3952 "name=property,type=CephString",
3953 asok_hook,
3954 "get malloc extension heap property");
3955 ceph_assert(r == 0);
3956
3957 r = admin_socket->register_command("dump_objectstore_kv_stats",
3958 asok_hook,
3959 "print statistics of kvdb which used by bluestore");
3960 ceph_assert(r == 0);
3961
3962 r = admin_socket->register_command("dump_scrubs",
3963 asok_hook,
3964 "print scheduled scrubs");
3965 ceph_assert(r == 0);
3966
3967 r = admin_socket->register_command("calc_objectstore_db_histogram",
3968 asok_hook,
3969 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
3970 ceph_assert(r == 0);
3971
3972 r = admin_socket->register_command("flush_store_cache",
3973 asok_hook,
3974 "Flush bluestore internal cache");
3975 ceph_assert(r == 0);
3976 r = admin_socket->register_command("dump_pgstate_history",
3977 asok_hook,
3978 "show recent state history");
3979 ceph_assert(r == 0);
3980
3981 r = admin_socket->register_command("compact",
3982 asok_hook,
3983 "Commpact object store's omap."
3984 " WARNING: Compaction probably slows your requests");
3985 ceph_assert(r == 0);
3986
3987 r = admin_socket->register_command("get_mapped_pools",
3988 asok_hook,
3989 "dump pools whose PG(s) are mapped to this OSD.");
3990
3991 ceph_assert(r == 0);
3992
3993 r = admin_socket->register_command("smart name=devid,type=CephString,req=false",
3994 asok_hook,
3995 "probe OSD devices for SMART data.");
3996
3997 ceph_assert(r == 0);
3998
3999 r = admin_socket->register_command("list_devices",
4000 asok_hook,
4001 "list OSD devices.");
ceph_assert(r == 0);
4002 r = admin_socket->register_command("send_beacon",
4003 asok_hook,
4004 "send OSD beacon to mon immediately");
ceph_assert(r == 0);
4005
4006 r = admin_socket->register_command(
4007 "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
4008 "Dump osd heartbeat network ping times");
4009 ceph_assert(r == 0);
4010
4011 r = admin_socket->register_command(
4012 "dump_pool_statfs name=poolid,type=CephInt,req=true", asok_hook,
4013 "Dump store's statistics for the given pool");
4014 ceph_assert(r == 0);
4015
4016 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store.get());
4017 // Note: pools are CephString instead of CephPoolname because
4018 // these commands traditionally support both pool names and numbers
4019 r = admin_socket->register_command(
4020 "setomapval " \
4021 "name=pool,type=CephString " \
4022 "name=objname,type=CephObjectname " \
4023 "name=key,type=CephString "\
4024 "name=val,type=CephString",
4025 test_ops_hook,
4026 "set omap key");
4027 ceph_assert(r == 0);
4028 r = admin_socket->register_command(
4029 "rmomapkey " \
4030 "name=pool,type=CephString " \
4031 "name=objname,type=CephObjectname " \
4032 "name=key,type=CephString",
4033 test_ops_hook,
4034 "remove omap key");
4035 ceph_assert(r == 0);
4036 r = admin_socket->register_command(
4037 "setomapheader " \
4038 "name=pool,type=CephString " \
4039 "name=objname,type=CephObjectname " \
4040 "name=header,type=CephString",
4041 test_ops_hook,
4042 "set omap header");
4043 ceph_assert(r == 0);
4044
4045 r = admin_socket->register_command(
4046 "getomap " \
4047 "name=pool,type=CephString " \
4048 "name=objname,type=CephObjectname",
4049 test_ops_hook,
4050 "output entire object map");
4051 ceph_assert(r == 0);
4052
4053 r = admin_socket->register_command(
4054 "truncobj " \
4055 "name=pool,type=CephString " \
4056 "name=objname,type=CephObjectname " \
4057 "name=len,type=CephInt",
4058 test_ops_hook,
4059 "truncate object to length");
4060 ceph_assert(r == 0);
4061
4062 r = admin_socket->register_command(
4063 "injectdataerr " \
4064 "name=pool,type=CephString " \
4065 "name=objname,type=CephObjectname " \
4066 "name=shardid,type=CephInt,req=false,range=0|255",
4067 test_ops_hook,
4068 "inject data error to an object");
4069 ceph_assert(r == 0);
4070
4071 r = admin_socket->register_command(
4072 "injectmdataerr " \
4073 "name=pool,type=CephString " \
4074 "name=objname,type=CephObjectname " \
4075 "name=shardid,type=CephInt,req=false,range=0|255",
4076 test_ops_hook,
4077 "inject metadata error to an object");
4078 ceph_assert(r == 0);
4079 r = admin_socket->register_command(
4080 "set_recovery_delay " \
4081 "name=utime,type=CephInt,req=false",
4082 test_ops_hook,
4083 "Delay osd recovery by specified seconds");
4084 ceph_assert(r == 0);
4085 r = admin_socket->register_command(
4086 "injectfull " \
4087 "name=type,type=CephString,req=false " \
4088 "name=count,type=CephInt,req=false ",
4089 test_ops_hook,
4090 "Inject a full disk (optional count times)");
4091 ceph_assert(r == 0);
4092 r = admin_socket->register_command(
4093 "bench " \
4094 "name=count,type=CephInt,req=false " \
4095 "name=size,type=CephInt,req=false " \
4096 "name=object_size,type=CephInt,req=false " \
4097 "name=object_num,type=CephInt,req=false ",
4098 asok_hook,
4099 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
4100 "(default count=1G default size=4MB). Results in log.");
4101 ceph_assert(r == 0);
4102 r = admin_socket->register_command(
4103 "cluster_log " \
4104 "name=level,type=CephChoices,strings=error,warning,info,debug " \
4105 "name=message,type=CephString,n=N",
4106 asok_hook,
4107 "log a message to the cluster log");
4108 ceph_assert(r == 0);
4109 r = admin_socket->register_command(
4110 "flush_pg_stats",
4111 asok_hook,
4112 "flush pg stats");
4113 ceph_assert(r == 0);
4114 r = admin_socket->register_command(
4115 "heap " \
4116 "name=heapcmd,type=CephChoices,strings=" \
4117 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
4118 "name=value,type=CephString,req=false",
4119 asok_hook,
4120 "show heap usage info (available only if compiled with tcmalloc)");
4121 ceph_assert(r == 0);
4122 r = admin_socket->register_command(
4123 "debug dump_missing " \
4124 "name=filename,type=CephFilepath",
4125 asok_hook,
4126 "dump missing objects to a named file");
4127 ceph_assert(r == 0);
4128 r = admin_socket->register_command(
4129 "debug kick_recovery_wq " \
4130 "name=delay,type=CephInt,range=0",
4131 asok_hook,
4132 "set osd_recovery_delay_start to <val>");
4133 ceph_assert(r == 0);
4134 r = admin_socket->register_command(
4135 "cpu_profiler " \
4136 "name=arg,type=CephChoices,strings=status|flush",
4137 asok_hook,
4138 "run cpu profiling on daemon");
4139 ceph_assert(r == 0);
4140 r = admin_socket->register_command(
4141 "dump_pg_recovery_stats",
4142 asok_hook,
4143 "dump pg recovery statistics");
4144 ceph_assert(r == 0);
4145 r = admin_socket->register_command(
4146 "reset_pg_recovery_stats",
4147 asok_hook,
4148 "reset pg recovery statistics");
4149 ceph_assert(r == 0);
4150 r = admin_socket->register_command(
4151 "cache drop",
4152 asok_hook,
4153 "Drop all OSD caches");
4154 ceph_assert(r == 0);
4155 r = admin_socket->register_command(
4156 "cache status",
4157 asok_hook,
4158 "Get OSD caches statistics");
4159 ceph_assert(r == 0);
4160 r = admin_socket->register_command(
4161 "scrub_purged_snaps",
4162 asok_hook,
4163 "Scrub purged_snaps vs snapmapper index");
4164 ceph_assert(r == 0);
4165 r = admin_socket->register_command(
4166 "scrubdebug " \
4167 "name=pgid,type=CephPgid " \
4168 "name=cmd,type=CephChoices,strings=block|unblock|set|unset " \
4169 "name=value,type=CephString,req=false",
4170 asok_hook,
4171 "debug the scrubber");
4172 ceph_assert(r == 0);
4173
4174 // -- pg commands --
4175 // old form: ceph pg <pgid> command ...
4176 r = admin_socket->register_command(
4177 "pg " \
4178 "name=pgid,type=CephPgid " \
4179 "name=cmd,type=CephChoices,strings=query",
4180 asok_hook,
4181 "");
4182 ceph_assert(r == 0);
4183 r = admin_socket->register_command(
4184 "pg " \
4185 "name=pgid,type=CephPgid " \
4186 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
4187 "name=mulcmd,type=CephChoices,strings=revert|delete",
4188 asok_hook,
4189 "");
4190 ceph_assert(r == 0);
4191 r = admin_socket->register_command(
4192 "pg " \
4193 "name=pgid,type=CephPgid " \
4194 "name=cmd,type=CephChoices,strings=list_unfound " \
4195 "name=offset,type=CephString,req=false",
4196 asok_hook,
4197 "");
4198 ceph_assert(r == 0);
4199 r = admin_socket->register_command(
4200 "pg " \
4201 "name=pgid,type=CephPgid " \
4202 "name=cmd,type=CephChoices,strings=scrub " \
4203 "name=time,type=CephInt,req=false",
4204 asok_hook,
4205 "");
4206 ceph_assert(r == 0);
4207 r = admin_socket->register_command(
4208 "pg " \
4209 "name=pgid,type=CephPgid " \
4210 "name=cmd,type=CephChoices,strings=deep_scrub " \
4211 "name=time,type=CephInt,req=false",
4212 asok_hook,
4213 "");
4214 ceph_assert(r == 0);
4215 // new form: tell <pgid> <cmd> for both cli and rest
4216 r = admin_socket->register_command(
4217 "query",
4218 asok_hook,
4219 "show details of a specific pg");
4220 ceph_assert(r == 0);
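// e.g. `ceph tell <pgid> query` reaches the primary OSD for that pg
// (pgid illustrative, such as 2.1f).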
4221 r = admin_socket->register_command(
4222 "mark_unfound_lost " \
4223 "name=pgid,type=CephPgid,req=false " \
4224 "name=mulcmd,type=CephChoices,strings=revert|delete",
4225 asok_hook,
4226 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
4227 ceph_assert(r == 0);
4228 r = admin_socket->register_command(
4229 "list_unfound " \
4230 "name=pgid,type=CephPgid,req=false " \
4231 "name=offset,type=CephString,req=false",
4232 asok_hook,
4233 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4234 ceph_assert(r == 0);
4235 r = admin_socket->register_command(
4236 "scrub " \
4237 "name=pgid,type=CephPgid,req=false " \
4238 "name=time,type=CephInt,req=false",
4239 asok_hook,
4240 "Trigger a scheduled scrub ");
4241 ceph_assert(r == 0);
4242 r = admin_socket->register_command(
4243 "deep_scrub " \
4244 "name=pgid,type=CephPgid,req=false " \
4245 "name=time,type=CephInt,req=false",
4246 asok_hook,
4247 "Trigger a scheduled deep scrub ");
4248 ceph_assert(r == 0);
4249 }
4250
4251 PerfCounters* OSD::create_logger()
4252 {
4253 PerfCounters* logger = build_osd_logger(cct);
4254 cct->get_perfcounters_collection()->add(logger);
4255 return logger;
4256 }
4257
4258 PerfCounters* OSD::create_recoverystate_perf()
4259 {
4260 PerfCounters* recoverystate_perf = build_recoverystate_perf(cct);
4261 cct->get_perfcounters_collection()->add(recoverystate_perf);
4262 return recoverystate_perf;
4263 }
4264
4265 int OSD::shutdown()
4266 {
4267 // vstart overwrites osd_fast_shutdown value in the conf file -> force the value here!
4268 //cct->_conf->osd_fast_shutdown = true;
4269
4270 dout(0) << "Fast Shutdown: cct->_conf->osd_fast_shutdown = "
4271 << cct->_conf->osd_fast_shutdown
4272 << ", null-fm = " << store->has_null_manager() << dendl;
4273
4274 utime_t start_time_func = ceph_clock_now();
4275
4276 if (cct->_conf->osd_fast_shutdown) {
4277 derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
4278 if (cct->_conf->osd_fast_shutdown_notify_mon)
4279 service.prepare_to_stop();
4280
4281 // Unless we are running with the NULL freelist manager there is no state
// we need to keep, so we can exit immediately; with NULL-FM we fall through
// so the umount below can persist the allocator state.
4282 if (!store->has_null_manager()) {
4283 cct->_log->flush();
4284 _exit(0);
4285 }
4286 } else if (!service.prepare_to_stop()) {
4287 return 0; // already shutting down
4288 }
4289
4290 osd_lock.lock();
4291 if (is_stopping()) {
4292 osd_lock.unlock();
4293 return 0;
4294 }
4295
4296 if (!cct->_conf->osd_fast_shutdown) {
4297 dout(0) << "shutdown" << dendl;
4298 }
4299
4300 // don't accept new tasks for this OSD
4301 set_state(STATE_STOPPING);
4302
4303 // Debug logging is skipped during fast shutdown
4304 if (!cct->_conf->osd_fast_shutdown && cct->_conf.get_val<bool>("osd_debug_shutdown")) {
4305 cct->_conf.set_val("debug_osd", "100");
4306 cct->_conf.set_val("debug_journal", "100");
4307 cct->_conf.set_val("debug_filestore", "100");
4308 cct->_conf.set_val("debug_bluestore", "100");
4309 cct->_conf.set_val("debug_ms", "100");
4310 cct->_conf.apply_changes(nullptr);
4311 }
4312
4313 if (cct->_conf->osd_fast_shutdown) {
4314 // first, stop new tasks from being taken from op_shardedwq
4315 // and clear all pending tasks
4316 op_shardedwq.stop_for_fast_shutdown();
4317
4318 utime_t start_time_timer = ceph_clock_now();
4319 tick_timer.shutdown();
4320 {
4321 std::lock_guard l(tick_timer_lock);
4322 tick_timer_without_osd_lock.shutdown();
4323 }
4324
4325 osd_lock.unlock();
4326 utime_t start_time_osd_drain = ceph_clock_now();
4327
4328 // then, wait on osd_op_tp to drain (TBD: should probably add a timeout)
4329 osd_op_tp.drain();
4330 osd_op_tp.stop();
4331
4332 utime_t start_time_umount = ceph_clock_now();
4333 store->prepare_for_fast_shutdown();
4334 std::lock_guard lock(osd_lock);
4335 // TBD: assert in allocator that nothing is being added
4336 store->umount();
4337
4338 utime_t end_time = ceph_clock_now();
4339 if (cct->_conf->osd_fast_shutdown_timeout) {
4340 ceph_assert(end_time - start_time_func < cct->_conf->osd_fast_shutdown_timeout);
4341 }
4342 dout(0) <<"Fast Shutdown duration total :" << end_time - start_time_func << " seconds" << dendl;
4343 dout(0) <<"Fast Shutdown duration osd_drain :" << start_time_umount - start_time_osd_drain << " seconds" << dendl;
4344 dout(0) <<"Fast Shutdown duration umount :" << end_time - start_time_umount << " seconds" << dendl;
4345 dout(0) <<"Fast Shutdown duration timer :" << start_time_osd_drain - start_time_timer << " seconds" << dendl;
4346 cct->_log->flush();
4347
4348 // now it is safe to exit
4349 _exit(0);
4350 }
4351
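// From here on: the orderly (non-fast) shutdown path.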
4352 // stop MgrClient earlier as it's more like an internal consumer of OSD
4353 mgrc.shutdown();
4354
4355 service.start_shutdown();
4356
4357 // stop sending work to pgs. this just prevents any new work in _process
4358 // from racing with on_shutdown and potentially entering the pg after.
4359 op_shardedwq.drain();
4360
4361 // Shutdown PGs
4362 {
4363 vector<PGRef> pgs;
4364 _get_pgs(&pgs);
4365 for (auto pg : pgs) {
4366 pg->shutdown();
4367 }
4368 }
4369
4370 // drain op queue again (in case PGs requeued something)
4371 op_shardedwq.drain();
4372 {
4373 finished.clear(); // zap waiters (bleh, this is messy)
4374 waiting_for_osdmap.clear();
4375 }
4376
4377 // unregister commands
4378 cct->get_admin_socket()->unregister_commands(asok_hook);
4379 delete asok_hook;
4380 asok_hook = NULL;
4381
4382 cct->get_admin_socket()->unregister_commands(test_ops_hook);
4383 delete test_ops_hook;
4384 test_ops_hook = NULL;
4385
4386 osd_lock.unlock();
4387
4388 {
4389 std::lock_guard l{heartbeat_lock};
4390 heartbeat_stop = true;
4391 heartbeat_cond.notify_all();
4392 heartbeat_peers.clear();
4393 }
4394 heartbeat_thread.join();
4395
4396 hb_back_server_messenger->mark_down_all();
4397 hb_front_server_messenger->mark_down_all();
4398 hb_front_client_messenger->mark_down_all();
4399 hb_back_client_messenger->mark_down_all();
4400
4401 osd_op_tp.drain();
4402 osd_op_tp.stop();
4403 dout(10) << "op sharded tp stopped" << dendl;
4404
4405 dout(10) << "stopping agent" << dendl;
4406 service.agent_stop();
4407
4408 boot_finisher.wait_for_empty();
4409
4410 osd_lock.lock();
4411
4412 boot_finisher.stop();
4413 reset_heartbeat_peers(true);
4414
4415 tick_timer.shutdown();
4416
4417 {
4418 std::lock_guard l(tick_timer_lock);
4419 tick_timer_without_osd_lock.shutdown();
4420 }
4421
4422 // note unmount epoch
4423 dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl;
4424 superblock.mounted = service.get_boot_epoch();
4425 superblock.clean_thru = get_osdmap_epoch();
4426 ObjectStore::Transaction t;
4427 write_superblock(t);
4428 int r = store->queue_transaction(service.meta_ch, std::move(t));
4429 if (r) {
4430 derr << "OSD::shutdown: error writing superblock: "
4431 << cpp_strerror(r) << dendl;
4432 }
4433
4434
4435 service.shutdown_reserver();
4436
4437 // Remove PGs
4438 #ifdef PG_DEBUG_REFS
4439 service.dump_live_pgids();
4440 #endif
4441 while (true) {
4442 vector<PGRef> pgs;
4443 _get_pgs(&pgs, true);
4444 if (pgs.empty()) {
4445 break;
4446 }
4447 for (auto& pg : pgs) {
4448 if (pg->is_deleted()) {
4449 continue;
4450 }
4451 dout(20) << " kicking pg " << pg << dendl;
4452 pg->lock();
4453 if (pg->get_num_ref() != 1) {
4454 derr << "pgid " << pg->get_pgid() << " has ref count of "
4455 << pg->get_num_ref() << dendl;
4456 #ifdef PG_DEBUG_REFS
4457 pg->dump_live_ids();
4458 #endif
4459 if (cct->_conf->osd_shutdown_pgref_assert) {
4460 ceph_abort();
4461 }
4462 }
4463 pg->ch.reset();
4464 pg->unlock();
4465 }
4466 }
4467 #ifdef PG_DEBUG_REFS
4468 service.dump_live_pgids();
4469 #endif
4470
4471 osd_lock.unlock();
4472 cct->_conf.remove_observer(this);
4473 osd_lock.lock();
4474
4475 service.meta_ch.reset();
4476
4477 dout(10) << "syncing store" << dendl;
4478 enable_disable_fuse(true);
4479
4480 if (cct->_conf->osd_journal_flush_on_shutdown) {
4481 dout(10) << "flushing journal" << dendl;
4482 store->flush_journal();
4483 }
4484
4485 monc->shutdown();
4486 osd_lock.unlock();
4487 {
4488 std::unique_lock l{map_lock};
4489 set_osdmap(OSDMapRef());
4490 }
4491 for (auto s : shards) {
4492 std::lock_guard l(s->osdmap_lock);
4493 s->shard_osdmap = OSDMapRef();
4494 }
4495 service.shutdown();
4496
4497 std::lock_guard lock(osd_lock);
4498 store->umount();
4499 store.reset();
4500 dout(10) << "Store synced" << dendl;
4501
4502 op_tracker.on_shutdown();
4503
4504 ClassHandler::get_instance().shutdown();
4505 client_messenger->shutdown();
4506 cluster_messenger->shutdown();
4507 hb_front_client_messenger->shutdown();
4508 hb_back_client_messenger->shutdown();
4509 objecter_messenger->shutdown();
4510 hb_front_server_messenger->shutdown();
4511 hb_back_server_messenger->shutdown();
4512
4513 utime_t duration = ceph_clock_now() - start_time_func;
4514 dout(0) <<"Slow Shutdown duration:" << duration << " seconds" << dendl;
4515
4516 tracing::osd::tracer.shutdown();
4517
4518 return r;
4519 }
4520
4521 int OSD::mon_cmd_maybe_osd_create(string &cmd)
4522 {
4523 bool created = false;
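// Send the command as-is; if the mon answers -ENOENT the osd id is not in
// the osdmap yet, so issue a one-time "osd create" with our id and fsid and
// then retry the original command.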
4524 while (true) {
4525 dout(10) << __func__ << " cmd: " << cmd << dendl;
4526 vector<string> vcmd{cmd};
4527 bufferlist inbl;
4528 C_SaferCond w;
4529 string outs;
4530 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
4531 int r = w.wait();
4532 if (r < 0) {
4533 if (r == -ENOENT && !created) {
4534 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
4535 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
4536 vector<string> vnewcmd{newcmd};
4537 bufferlist inbl;
4538 C_SaferCond w;
4539 string outs;
4540 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
4541 int r = w.wait();
4542 if (r < 0) {
4543 derr << __func__ << " fail: osd does not exist and create failed: "
4544 << cpp_strerror(r) << dendl;
4545 return r;
4546 }
4547 created = true;
4548 continue;
4549 }
4550 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
4551 return r;
4552 }
4553 break;
4554 }
4555
4556 return 0;
4557 }
4558
4559 int OSD::update_crush_location()
4560 {
4561 if (!cct->_conf->osd_crush_update_on_start) {
4562 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
4563 return 0;
4564 }
4565
4566 char weight[32];
4567 if (cct->_conf->osd_crush_initial_weight >= 0) {
4568 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
4569 } else {
4570 struct store_statfs_t st;
4571 osd_alert_list_t alerts;
4572 int r = store->statfs(&st, &alerts);
4573 if (r < 0) {
4574 derr << "statfs: " << cpp_strerror(r) << dendl;
4575 return r;
4576 }
4577 snprintf(weight, sizeof(weight), "%.4lf",
4578 std::max(.00001,
4579 double(st.total) /
4580 double(1ull << 40 /* TB */)));
4581 }
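// Worked example: a 4 TB device reports st.total = 4e12 bytes, giving
// 4e12 / 2^40 ~= 3.6380 as the initial crush weight.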
4582
4583 dout(10) << __func__ << " crush location is " << cct->crush_location << dendl;
4584
4585 string cmd =
4586 string("{\"prefix\": \"osd crush create-or-move\", ") +
4587 string("\"id\": ") + stringify(whoami) + ", " +
4588 string("\"weight\":") + weight + ", " +
4589 string("\"args\": [") + stringify(cct->crush_location) + "]}";
4590 return mon_cmd_maybe_osd_create(cmd);
4591 }
4592
4593 int OSD::update_crush_device_class()
4594 {
4595 if (!cct->_conf->osd_class_update_on_start) {
4596 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
4597 return 0;
4598 }
4599
4600 string device_class;
4601 int r = store->read_meta("crush_device_class", &device_class);
4602 if (r < 0 || device_class.empty()) {
4603 device_class = store->get_default_device_class();
4604 }
4605
4606 if (device_class.empty()) {
4607 dout(20) << __func__ << " no device class stored locally" << dendl;
4608 return 0;
4609 }
4610
4611 string cmd =
4612 string("{\"prefix\": \"osd crush set-device-class\", ") +
4613 string("\"class\": \"") + device_class + string("\", ") +
4614 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
4615
4616 r = mon_cmd_maybe_osd_create(cmd);
4617 if (r == -EBUSY) {
4618 // good, already bound to a device-class
4619 return 0;
4620 } else {
4621 return r;
4622 }
4623 }
4624
4625 void OSD::write_superblock(ObjectStore::Transaction& t)
4626 {
4627 dout(10) << "write_superblock " << superblock << dendl;
4628
4629 // hack: at minimum it's using the baseline feature set
4630 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
4631 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4632
4633 bufferlist bl;
4634 encode(superblock, bl);
4635 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
4636 }
4637
4638 int OSD::read_superblock()
4639 {
4640 bufferlist bl;
4641 int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
4642 if (r < 0)
4643 return r;
4644
4645 auto p = bl.cbegin();
4646 decode(superblock, p);
4647
4648 dout(10) << "read_superblock " << superblock << dendl;
4649
4650 return 0;
4651 }
4652
4653 void OSD::clear_temp_objects()
4654 {
4655 dout(10) << __func__ << dendl;
4656 vector<coll_t> ls;
4657 store->list_collections(ls);
4658 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
4659 spg_t pgid;
4660 if (!p->is_pg(&pgid))
4661 continue;
4662
4663 // list temp objects
4664 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
4665
4666 vector<ghobject_t> temps;
4667 ghobject_t next;
4668 while (1) {
4669 vector<ghobject_t> objects;
4670 auto ch = store->open_collection(*p);
4671 ceph_assert(ch);
4672 store->collection_list(ch, next, ghobject_t::get_max(),
4673 store->get_ideal_list_max(),
4674 &objects, &next);
4675 if (objects.empty())
4676 break;
4677 vector<ghobject_t>::iterator q;
4678 for (q = objects.begin(); q != objects.end(); ++q) {
4679 // Hammer set pool for temps to -1, so check for clean-up
4680 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
4681 temps.push_back(*q);
4682 } else {
4683 break;
4684 }
4685 }
4686 // If we saw a non-temp object and hit the break above we can
4687 // break out of the while loop too.
4688 if (q != objects.end())
4689 break;
4690 }
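// Delete what we found, splitting the removals into transactions of at most
// osd_target_transaction_size operations each.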
4691 if (!temps.empty()) {
4692 ObjectStore::Transaction t;
4693 int removed = 0;
4694 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
4695 dout(20) << " removing " << *p << " object " << *q << dendl;
4696 t.remove(*p, *q);
4697 if (++removed > cct->_conf->osd_target_transaction_size) {
4698 store->queue_transaction(service.meta_ch, std::move(t));
4699 t = ObjectStore::Transaction();
4700 removed = 0;
4701 }
4702 }
4703 if (removed) {
4704 store->queue_transaction(service.meta_ch, std::move(t));
4705 }
4706 }
4707 }
4708 }
4709
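// Remove every object in the collection -- clearing each object's snap
// mappings via the SnapMapper -- in transactions bounded by
// osd_target_transaction_size, then remove the collection itself and wait
// for the removal to commit.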
4710 void OSD::recursive_remove_collection(CephContext* cct,
4711 ObjectStore *store, spg_t pgid,
4712 coll_t tmp)
4713 {
4714 OSDriver driver(
4715 store,
4716 coll_t(),
4717 make_snapmapper_oid());
4718
4719 ObjectStore::CollectionHandle ch = store->open_collection(tmp);
4720 ObjectStore::Transaction t;
4721 SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);
4722
4723 ghobject_t next;
4724 int max = cct->_conf->osd_target_transaction_size;
4725 vector<ghobject_t> objects;
4726 objects.reserve(max);
4727 while (true) {
4728 objects.clear();
4729 store->collection_list(ch, next, ghobject_t::get_max(),
4730 max, &objects, &next);
4731 generic_dout(10) << __func__ << " " << objects << dendl;
4732 if (objects.empty())
4733 break;
4734 for (auto& p: objects) {
4735 OSDriver::OSTransaction _t(driver.get_transaction(&t));
4736 int r = mapper.remove_oid(p.hobj, &_t);
4737 if (r != 0 && r != -ENOENT)
4738 ceph_abort();
4739 t.remove(tmp, p);
4740 }
4741 int r = store->queue_transaction(ch, std::move(t));
4742 ceph_assert(r == 0);
4743 t = ObjectStore::Transaction();
4744 }
4745 t.remove_collection(tmp);
4746 int r = store->queue_transaction(ch, std::move(t));
4747 ceph_assert(r == 0);
4748
4749 C_SaferCond waiter;
4750 if (!ch->flush_commit(&waiter)) {
4751 waiter.wait();
4752 }
4753 }
4754
4755
4756 // ======================================================
4757 // PG's
4758
4759 PG* OSD::_make_pg(
4760 OSDMapRef createmap,
4761 spg_t pgid)
4762 {
4763 dout(10) << __func__ << " " << pgid << dendl;
4764 pg_pool_t pi;
4765 map<string,string> ec_profile;
4766 string name;
4767 if (createmap->have_pg_pool(pgid.pool())) {
4768 pi = *createmap->get_pg_pool(pgid.pool());
4769 name = createmap->get_pool_name(pgid.pool());
4770 if (pi.is_erasure()) {
4771 ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
4772 }
4773 } else {
4774 // pool was deleted; grab final pg_pool_t off disk.
4775 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
4776 bufferlist bl;
4777 int r = store->read(service.meta_ch, oid, 0, 0, bl);
4778 if (r < 0) {
4779 derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
4780 << dendl;
4781 return nullptr;
4782 }
4783 ceph_assert(r >= 0);
4784 auto p = bl.cbegin();
4785 decode(pi, p);
4786 decode(name, p);
4787 if (p.end()) { // dev release v13.0.2 did not include ec_profile
4788 derr << __func__ << " missing ec_profile from pool " << pgid.pool()
4789 << " tombstone" << dendl;
4790 return nullptr;
4791 }
4792 decode(ec_profile, p);
4793 }
4794 PGPool pool(createmap, pgid.pool(), pi, name);
4795 PG *pg;
4796 if (pi.type == pg_pool_t::TYPE_REPLICATED ||
4797 pi.type == pg_pool_t::TYPE_ERASURE)
4798 pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
4799 else
4800 ceph_abort();
4801 return pg;
4802 }
4803
4804 void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
4805 {
4806 v->clear();
4807 v->reserve(get_num_pgs());
4808 for (auto& s : shards) {
4809 std::lock_guard l(s->shard_lock);
4810 for (auto& j : s->pg_slots) {
4811 if (j.second->pg &&
4812 !j.second->pg->is_deleted()) {
4813 v->push_back(j.second->pg);
4814 if (clear_too) {
4815 s->_detach_pg(j.second.get());
4816 }
4817 }
4818 }
4819 }
4820 }
4821
4822 void OSD::_get_pgids(vector<spg_t> *v)
4823 {
4824 v->clear();
4825 v->reserve(get_num_pgs());
4826 for (auto& s : shards) {
4827 std::lock_guard l(s->shard_lock);
4828 for (auto& j : s->pg_slots) {
4829 if (j.second->pg &&
4830 !j.second->pg->is_deleted()) {
4831 v->push_back(j.first);
4832 }
4833 }
4834 }
4835 }
4836
4837 void OSD::register_pg(PGRef pg)
4838 {
4839 spg_t pgid = pg->get_pgid();
4840 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4841 auto sdata = shards[shard_index];
4842 std::lock_guard l(sdata->shard_lock);
4843 auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
4844 ceph_assert(r.second);
4845 auto *slot = r.first->second.get();
4846 dout(20) << __func__ << " " << pgid << " " << pg << dendl;
4847 sdata->_attach_pg(slot, pg.get());
4848 }
4849
4850 bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
4851 {
4852 auto sdata = pg->osd_shard;
4853 ceph_assert(sdata);
4854 {
4855 std::lock_guard l(sdata->shard_lock);
4856 auto p = sdata->pg_slots.find(pg->pg_id);
4857 if (p == sdata->pg_slots.end() ||
4858 !p->second->pg) {
4859 dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
4860 return false;
4861 }
4862 if (p->second->waiting_for_merge_epoch) {
4863 dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
4864 return false;
4865 }
4866 dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
4867 sdata->_detach_pg(p->second.get());
4868 }
4869
4870 for (auto shard : shards) {
4871 shard->unprime_split_children(pg->pg_id, old_pg_num);
4872 }
4873
4874 // update pg count now since we might not get an osdmap any time soon.
4875 if (pg->is_primary())
4876 service.logger->dec(l_osd_pg_primary);
4877 else if (pg->is_nonprimary())
4878 service.logger->dec(l_osd_pg_replica); // misnomer
4879 else
4880 service.logger->dec(l_osd_pg_stray);
4881
4882 return true;
4883 }
4884
4885 PGRef OSD::_lookup_pg(spg_t pgid)
4886 {
4887 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4888 auto sdata = shards[shard_index];
4889 std::lock_guard l(sdata->shard_lock);
4890 auto p = sdata->pg_slots.find(pgid);
4891 if (p == sdata->pg_slots.end()) {
4892 return nullptr;
4893 }
4894 return p->second->pg;
4895 }
4896
4897 PGRef OSD::_lookup_lock_pg(spg_t pgid)
4898 {
4899 PGRef pg = _lookup_pg(pgid);
4900 if (!pg) {
4901 return nullptr;
4902 }
4903 pg->lock();
4904 if (!pg->is_deleted()) {
4905 return pg;
4906 }
4907 pg->unlock();
4908 return nullptr;
4909 }
4910
4911 PGRef OSD::lookup_lock_pg(spg_t pgid)
4912 {
4913 return _lookup_lock_pg(pgid);
4914 }
4915
4916 void OSD::load_pgs()
4917 {
4918 ceph_assert(ceph_mutex_is_locked(osd_lock));
4919 dout(0) << "load_pgs" << dendl;
4920
4921 {
4922 auto pghist = make_pg_num_history_oid();
4923 bufferlist bl;
4924 int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
4925 if (r >= 0 && bl.length() > 0) {
4926 auto p = bl.cbegin();
4927 decode(pg_num_history, p);
4928 }
4929 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
4930 }
4931
4932 vector<coll_t> ls;
4933 int r = store->list_collections(ls);
4934 if (r < 0) {
4935 derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
4936 }
4937
4938 int num = 0;
4939 for (vector<coll_t>::iterator it = ls.begin();
4940 it != ls.end();
4941 ++it) {
4942 spg_t pgid;
4943 if (it->is_temp(&pgid) ||
4944 (it->is_pg(&pgid) && PG::_has_removal_flag(store.get(), pgid))) {
4945 dout(10) << "load_pgs " << *it
4946 << " removing, legacy or flagged for removal pg" << dendl;
4947 recursive_remove_collection(cct, store.get(), pgid, *it);
4948 continue;
4949 }
4950
4951 if (!it->is_pg(&pgid)) {
4952 dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
4953 continue;
4954 }
4955
4956 dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
4957 epoch_t map_epoch = 0;
4958 int r = PG::peek_map_epoch(store.get(), pgid, &map_epoch);
4959 if (r < 0) {
4960 derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
4961 << dendl;
4962 continue;
4963 }
4964
4965 PGRef pg;
4966 if (map_epoch > 0) {
4967 OSDMapRef pgosdmap = service.try_get_map(map_epoch);
4968 if (!pgosdmap) {
4969 if (!get_osdmap()->have_pg_pool(pgid.pool())) {
4970 derr << __func__ << ": could not find map for epoch " << map_epoch
4971 << " on pg " << pgid << ", but the pool is not present in the "
4972 << "current map, so this is probably a result of bug 10617. "
4973 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4974 << "to clean it up later." << dendl;
4975 continue;
4976 } else {
4977 derr << __func__ << ": have pgid " << pgid << " at epoch "
4978 << map_epoch << ", but missing map. Crashing."
4979 << dendl;
4980 ceph_abort_msg("Missing map in load_pgs");
4981 }
4982 }
4983 pg = _make_pg(pgosdmap, pgid);
4984 } else {
4985 pg = _make_pg(get_osdmap(), pgid);
4986 }
4987 if (!pg) {
4988 recursive_remove_collection(cct, store.get(), pgid, *it);
4989 continue;
4990 }
4991
4992 // there can be no waiters here, so we don't call _wake_pg_slot
4993
4994 pg->lock();
4995 pg->ch = store->open_collection(pg->coll);
4996
4997 // read pg state, log
4998 pg->read_state(store.get());
4999
5000 if (pg->dne()) {
5001 dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
5002 pg->ch = nullptr;
5003 pg->unlock();
5004 recursive_remove_collection(cct, store.get(), pgid, *it);
5005 continue;
5006 }
5007 {
5008 uint32_t shard_index = pgid.hash_to_shard(shards.size());
5009 ceph_assert(NULL != shards[shard_index]);
5010 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
5011 }
5012
5013 dout(10) << __func__ << " loaded " << *pg << dendl;
5014 pg->unlock();
5015
5016 register_pg(pg);
5017 ++num;
5018 }
5019 dout(0) << __func__ << " opened " << num << " pgs" << dendl;
5020 }
5021
5022
5023 PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
5024 const PGCreateInfo *info)
5025 {
5026 spg_t pgid = info->pgid;
5027
5028 if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
5029 dout(10) << __func__ << " hit max pg, dropping" << dendl;
5030 return nullptr;
5031 }
5032
5033 OSDMapRef startmap = get_map(info->epoch);
5034
5035 if (info->by_mon) {
5036 int64_t pool_id = pgid.pgid.pool();
5037 const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
5038 if (!pool) {
5039 dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
5040 return nullptr;
5041 }
5042 if (osdmap->require_osd_release >= ceph_release_t::nautilus &&
5043 !pool->has_flag(pg_pool_t::FLAG_CREATING)) {
5044 // this ensures we do not process old creating messages after the
5045 // pool's initial pgs have been created (and pgs are subsequently
5046 // allowed to split or merge).
5047 dout(20) << __func__ << " dropping " << pgid
5048 << "create, pool does not have CREATING flag set" << dendl;
5049 return nullptr;
5050 }
5051 }
5052
5053 int up_primary, acting_primary;
5054 vector<int> up, acting;
5055 startmap->pg_to_up_acting_osds(
5056 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
5057
5058 const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
5059 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
5060 store->get_type() != "bluestore") {
5061 clog->warn() << "pg " << pgid
5062 << " is at risk of silent data corruption: "
5063 << "the pool allows ec overwrites but is not stored in "
5064 << "bluestore, so deep scrubbing will not detect bitrot";
5065 }
5066 PeeringCtx rctx;
5067 create_pg_collection(
5068 rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
5069 init_pg_ondisk(rctx.transaction, pgid, pp);
5070
5071 int role = startmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
5072
5073 PGRef pg = _make_pg(startmap, pgid);
5074 pg->ch = store->create_new_collection(pg->coll);
5075
5076 {
5077 uint32_t shard_index = pgid.hash_to_shard(shards.size());
5078 ceph_assert(NULL != shards[shard_index]);
5079 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
5080 }
5081
5082 pg->lock(true);
5083
5084 // we are holding the shard lock
5085 ceph_assert(!pg->is_deleted());
5086
5087 pg->init(
5088 role,
5089 up,
5090 up_primary,
5091 acting,
5092 acting_primary,
5093 info->history,
5094 info->past_intervals,
5095 rctx.transaction);
5096
5097 pg->init_collection_pool_opts();
5098
5099 if (pg->is_primary()) {
5100 std::lock_guard locker{m_perf_queries_lock};
5101 pg->set_dynamic_perf_stats_queries(m_perf_queries);
5102 }
5103
5104 pg->handle_initialize(rctx);
5105 pg->handle_activate_map(rctx);
5106
5107 dispatch_context(rctx, pg.get(), osdmap, nullptr);
5108
5109 dout(10) << __func__ << " new pg " << *pg << dendl;
5110 return pg;
5111 }
5112
5113 bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
5114 spg_t pgid,
5115 bool is_mon_create)
5116 {
5117 const auto max_pgs_per_osd =
5118 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
5119 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
5120
5121 if (num_pgs < max_pgs_per_osd) {
5122 return false;
5123 }
5124
5125 std::lock_guard l(pending_creates_lock);
5126 if (is_mon_create) {
5127 pending_creates_from_mon++;
5128 } else {
5129 bool is_primary = osdmap->get_pg_acting_role(pgid, whoami) == 0;
5130 pending_creates_from_osd.emplace(pgid, is_primary);
5131 }
5132 dout(1) << __func__ << " withhold creation of pg " << pgid
5133 << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
5134 return true;
5135 }
5136
5137 // to re-trigger peering, we have to twiddle the pg mapping a little bit;
5138 // see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() falls back
5139 // to the up set if pg_temp is empty, so an empty pg_temp won't work.
5140 static vector<int32_t> twiddle(const vector<int>& acting) {
5141 if (acting.size() > 1) {
5142 return {acting[0]};
5143 } else {
5144 vector<int32_t> twiddled(acting.begin(), acting.end());
5145 twiddled.push_back(-1);
5146 return twiddled;
5147 }
5148 }
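// e.g. twiddle({3, 7}) -> {3} and twiddle({5}) -> {5, -1}: either change is
// enough for should_restart_peering() to observe a new mapping.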
5149
5150 void OSD::resume_creating_pg()
5151 {
5152 bool do_sub_pg_creates = false;
5153 bool have_pending_creates = false;
5154 {
5155 const auto max_pgs_per_osd =
5156 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
5157 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
5158 if (max_pgs_per_osd <= num_pgs) {
5159 // this could happen if the admin decreases this setting before a PG is removed
5160 return;
5161 }
5162 unsigned spare_pgs = max_pgs_per_osd - num_pgs;
5163 std::lock_guard l(pending_creates_lock);
5164 if (pending_creates_from_mon > 0) {
5165 dout(20) << __func__ << " pending_creates_from_mon "
5166 << pending_creates_from_mon << dendl;
5167 do_sub_pg_creates = true;
5168 if (pending_creates_from_mon >= spare_pgs) {
5169 spare_pgs = pending_creates_from_mon = 0;
5170 } else {
5171 spare_pgs -= pending_creates_from_mon;
5172 pending_creates_from_mon = 0;
5173 }
5174 }
5175 auto pg = pending_creates_from_osd.cbegin();
5176 while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
5177 dout(20) << __func__ << " pg " << pg->first << dendl;
5178 vector<int> acting;
5179 get_osdmap()->pg_to_up_acting_osds(pg->first.pgid, nullptr, nullptr, &acting, nullptr);
5180 service.queue_want_pg_temp(pg->first.pgid, twiddle(acting), true);
5181 pg = pending_creates_from_osd.erase(pg);
5182 do_sub_pg_creates = true;
5183 spare_pgs--;
5184 }
5185 have_pending_creates = (pending_creates_from_mon > 0 ||
5186 !pending_creates_from_osd.empty());
5187 }
5188
5189 bool do_renew_subs = false;
5190 if (do_sub_pg_creates) {
5191 if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
5192 dout(4) << __func__ << ": resolicit pg creates from mon since "
5193 << last_pg_create_epoch << dendl;
5194 do_renew_subs = true;
5195 }
5196 }
5197 version_t start = get_osdmap_epoch() + 1;
5198 if (have_pending_creates) {
5199 // don't miss any new osdmap deleting PGs
5200 if (monc->sub_want("osdmap", start, 0)) {
5201 dout(4) << __func__ << ": resolicit osdmap from mon since "
5202 << start << dendl;
5203 do_renew_subs = true;
5204 }
5205 } else if (do_sub_pg_creates) {
5206 // no need to subscribe the osdmap continuously anymore
5207 // once the pgtemp and/or mon_subscribe(pg_creates) is sent
5208 if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
5209 dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
5210 << start << dendl;
5211 do_renew_subs = true;
5212 }
5213 }
5214
5215 if (do_renew_subs) {
5216 monc->renew_subs();
5217 }
5218
5219 service.send_pg_temp();
5220 }
5221
5222 void OSD::build_initial_pg_history(
5223 spg_t pgid,
5224 epoch_t created,
5225 utime_t created_stamp,
5226 pg_history_t *h,
5227 PastIntervals *pi)
5228 {
5229 dout(10) << __func__ << " " << pgid << " created " << created << dendl;
5230 *h = pg_history_t(created, created_stamp);
5231
5232 OSDMapRef lastmap = service.get_map(created);
5233 int up_primary, acting_primary;
5234 vector<int> up, acting;
5235 lastmap->pg_to_up_acting_osds(
5236 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
5237
5238 ostringstream debug;
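// Walk forward through every map from creation to the present; whenever
// check_new_interval() detects a mapping change, record a new past interval
// and update the relevant same_*_since epochs.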
5239 for (epoch_t e = created + 1; e <= get_osdmap_epoch(); ++e) {
5240 OSDMapRef osdmap = service.get_map(e);
5241 int new_up_primary, new_acting_primary;
5242 vector<int> new_up, new_acting;
5243 osdmap->pg_to_up_acting_osds(
5244 pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);
5245
5246 // this is a bit imprecise, but sufficient?
5247 struct min_size_predicate_t : public IsPGRecoverablePredicate {
5248 const pg_pool_t *pi;
5249 bool operator()(const set<pg_shard_t> &have) const {
5250 return have.size() >= pi->min_size;
5251 }
5252 explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
5253 } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));
5254
5255 bool new_interval = PastIntervals::check_new_interval(
5256 acting_primary,
5257 new_acting_primary,
5258 acting, new_acting,
5259 up_primary,
5260 new_up_primary,
5261 up, new_up,
5262 h->same_interval_since,
5263 h->last_epoch_clean,
5264 osdmap.get(),
5265 lastmap.get(),
5266 pgid.pgid,
5267 min_size_predicate,
5268 pi,
5269 &debug);
5270 if (new_interval) {
5271 h->same_interval_since = e;
5272 if (up != new_up) {
5273 h->same_up_since = e;
5274 }
5275 if (acting_primary != new_acting_primary) {
5276 h->same_primary_since = e;
5277 }
5278 if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
5279 osdmap->get_pg_num(pgid.pgid.pool()),
5280 nullptr)) {
5281 h->last_epoch_split = e;
5282 }
5283 up = new_up;
5284 acting = new_acting;
5285 up_primary = new_up_primary;
5286 acting_primary = new_acting_primary;
5287 }
5288 lastmap = osdmap;
5289 }
5290 dout(20) << __func__ << " " << debug.str() << dendl;
5291 dout(10) << __func__ << " " << *h << " " << *pi
5292 << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
5293 pi->get_bounds()) << ")"
5294 << dendl;
5295 }
5296
5297 void OSD::_add_heartbeat_peer(int p)
5298 {
5299 if (p == whoami)
5300 return;
5301 HeartbeatInfo *hi;
5302
5303 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
5304 if (i == heartbeat_peers.end()) {
5305 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, get_osdmap_epoch());
5306 if (!cons.first)
5307 return;
5308 ceph_assert(cons.second);
5309
5310 hi = &heartbeat_peers[p];
5311 hi->peer = p;
5312
5313 auto stamps = service.get_hb_stamps(p);
5314
5315 auto sb = ceph::make_ref<Session>(cct, cons.first.get());
5316 sb->peer = p;
5317 sb->stamps = stamps;
5318 hi->hb_interval_start = ceph_clock_now();
5319 hi->con_back = cons.first.get();
5320 hi->con_back->set_priv(sb);
5321
5322 auto sf = ceph::make_ref<Session>(cct, cons.second.get());
5323 sf->peer = p;
5324 sf->stamps = stamps;
5325 hi->con_front = cons.second.get();
5326 hi->con_front->set_priv(sf);
5327
5328 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5329 << " " << hi->con_back->get_peer_addr()
5330 << " " << hi->con_front->get_peer_addr()
5331 << dendl;
5332 } else {
5333 hi = &i->second;
5334 }
5335 hi->epoch = get_osdmap_epoch();
5336 }
5337
5338 void OSD::_remove_heartbeat_peer(int n)
5339 {
5340 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
5341 ceph_assert(q != heartbeat_peers.end());
5342 dout(20) << " removing heartbeat peer osd." << n
5343 << " " << q->second.con_back->get_peer_addr()
5344 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
5345 << dendl;
5346 q->second.clear_mark_down();
5347 heartbeat_peers.erase(q);
5348 }
5349
5350 void OSD::need_heartbeat_peer_update()
5351 {
5352 if (is_stopping())
5353 return;
5354 dout(20) << "need_heartbeat_peer_update" << dendl;
5355 heartbeat_set_peers_need_update();
5356 }
5357
5358 void OSD::maybe_update_heartbeat_peers()
5359 {
5360 ceph_assert(ceph_mutex_is_locked(osd_lock));
5361
5362 if (is_waiting_for_healthy() || is_active()) {
5363 utime_t now = ceph_clock_now();
5364 if (last_heartbeat_resample == utime_t()) {
5365 last_heartbeat_resample = now;
5366 heartbeat_set_peers_need_update();
5367 } else if (!heartbeat_peers_need_update()) {
5368 utime_t dur = now - last_heartbeat_resample;
5369 if (dur > cct->_conf->osd_heartbeat_grace) {
5370 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
5371 heartbeat_set_peers_need_update();
5372 last_heartbeat_resample = now;
5373 // automatically clean up any stale heartbeat peers
5374 // if we are unhealthy, then clean all
5375 reset_heartbeat_peers(is_waiting_for_healthy());
5376 }
5377 }
5378 }
5379
5380 if (!heartbeat_peers_need_update())
5381 return;
5382 heartbeat_clear_peers_need_update();
5383
5384 std::lock_guard l(heartbeat_lock);
5385
5386 dout(10) << "maybe_update_heartbeat_peers updating" << dendl;
5387
5388
5389 // build heartbeat from set
5390 if (is_active()) {
5391 vector<PGRef> pgs;
5392 _get_pgs(&pgs);
5393 for (auto& pg : pgs) {
5394 pg->with_heartbeat_peers([&](int peer) {
5395 if (get_osdmap()->is_up(peer)) {
5396 _add_heartbeat_peer(peer);
5397 }
5398 });
5399 }
5400 }
5401
5402 // include next and previous up osds to ensure we have a fully-connected set
5403 set<int> want, extras;
5404 const int next = get_osdmap()->get_next_up_osd_after(whoami);
5405 if (next >= 0)
5406 want.insert(next);
5407 int prev = get_osdmap()->get_previous_up_osd_before(whoami);
5408 if (prev >= 0 && prev != next)
5409 want.insert(prev);
5410
5411 // make sure we have at least **min_down** osds coming from different
5412 // subtree level (e.g., hosts) for fast failure detection.
5413 auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
5414 auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
5415 auto limit = std::max(min_down, (uint64_t)cct->_conf->osd_heartbeat_min_peers);
5416 get_osdmap()->get_random_up_osds_by_subtree(
5417 whoami, subtree, limit, want, &want);
5418
5419 for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
5420 dout(10) << " adding neighbor peer osd." << *p << dendl;
5421 extras.insert(*p);
5422 _add_heartbeat_peer(*p);
5423 }
5424
5425 // remove down peers; enumerate extras
5426 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5427 while (p != heartbeat_peers.end()) {
5428 if (!get_osdmap()->is_up(p->first)) {
5429 int o = p->first;
5430 ++p;
5431 _remove_heartbeat_peer(o);
5432 continue;
5433 }
5434 if (p->second.epoch < get_osdmap_epoch()) {
5435 extras.insert(p->first);
5436 }
5437 ++p;
5438 }
5439
5440 // too few?
5441 for (int n = next; n >= 0; ) {
5442 if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
5443 break;
5444 if (!extras.count(n) && !want.count(n) && n != whoami) {
5445 dout(10) << " adding random peer osd." << n << dendl;
5446 extras.insert(n);
5447 _add_heartbeat_peer(n);
5448 }
5449 n = get_osdmap()->get_next_up_osd_after(n);
5450 if (n == next)
5451 break; // came full circle; stop
5452 }
5453
5454 // too many?
5455 for (set<int>::iterator p = extras.begin();
5456 (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
5457 ++p) {
5458 if (want.count(*p))
5459 continue;
5460 _remove_heartbeat_peer(*p);
5461 }
5462
5463 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
5464
5465 // clean up stale failure pending
5466 for (auto it = failure_pending.begin(); it != failure_pending.end();) {
5467 if (heartbeat_peers.count(it->first) == 0) {
5468 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
5469 failure_pending.erase(it++);
5470 } else {
5471 it++;
5472 }
5473 }
5474 }
5475
5476 void OSD::reset_heartbeat_peers(bool all)
5477 {
5478 ceph_assert(ceph_mutex_is_locked(osd_lock));
5479 dout(10) << "reset_heartbeat_peers" << dendl;
5480 utime_t stale = ceph_clock_now();
5481 stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
5482 std::lock_guard l(heartbeat_lock);
5483 for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) {
5484 auto& [peer, hi] = *it;
5485 if (all || hi.is_stale(stale)) {
5486 hi.clear_mark_down();
5487 // stop sending failure_report to mon too
5488 failure_queue.erase(peer);
5489 failure_pending.erase(peer);
5490 it = heartbeat_peers.erase(it);
5491 } else {
5492 ++it;
5493 }
5494 }
5495 }
5496
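// handle_osd_ping: heartbeat protocol handler, run under heartbeat_lock.
//  - PING: update the shared clock-delta stamps and answer with PING_REPLY
//    (or with YOU_DIED if our map shows the sender marked down in a newer
//    epoch than it has seen).
//  - PING_REPLY: record last_rx_back/last_rx_front, retire acked entries
//    from ping_history, maintain ping-time statistics, and cancel any
//    queued or in-flight failure report if the peer now looks healthy.
//  - YOU_DIED: a peer thinks we are dead; subscribe to newer osdmaps.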
5497 void OSD::handle_osd_ping(MOSDPing *m)
5498 {
5499 if (superblock.cluster_fsid != m->fsid) {
5500 dout(20) << "handle_osd_ping from " << m->get_source_inst()
5501 << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid
5502 << dendl;
5503 m->put();
5504 return;
5505 }
5506
5507 int from = m->get_source().num();
5508
5509 heartbeat_lock.lock();
5510 if (is_stopping()) {
5511 heartbeat_lock.unlock();
5512 m->put();
5513 return;
5514 }
5515
5516 utime_t now = ceph_clock_now();
5517 auto mnow = service.get_mnow();
5518 ConnectionRef con(m->get_connection());
5519 OSDMapRef curmap = service.get_osdmap();
5520 if (!curmap) {
5521 heartbeat_lock.unlock();
5522 m->put();
5523 return;
5524 }
5525
5526 auto sref = con->get_priv();
5527 Session *s = static_cast<Session*>(sref.get());
5528 if (!s) {
5529 heartbeat_lock.unlock();
5530 m->put();
5531 return;
5532 }
5533 if (!s->stamps) {
5534 s->peer = from;
5535 s->stamps = service.get_hb_stamps(from);
5536 }
5537
5538 switch (m->op) {
5539
5540 case MOSDPing::PING:
5541 {
5542 if (cct->_conf->osd_debug_drop_ping_probability > 0) {
5543 auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
5544 if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
5545 if (heartbeat_drop->second == 0) {
5546 debug_heartbeat_drops_remaining.erase(heartbeat_drop);
5547 } else {
5548 --heartbeat_drop->second;
5549 dout(5) << "Dropping heartbeat from " << from
5550 << ", " << heartbeat_drop->second
5551 << " remaining to drop" << dendl;
5552 break;
5553 }
5554 } else if (cct->_conf->osd_debug_drop_ping_probability >
5555 ((((double)(rand()%100))/100.0))) {
5556 heartbeat_drop =
5557 debug_heartbeat_drops_remaining.insert(std::make_pair(from,
5558 cct->_conf->osd_debug_drop_ping_duration)).first;
5559 dout(5) << "Dropping heartbeat from " << from
5560 << ", " << heartbeat_drop->second
5561 << " remaining to drop" << dendl;
5562 break;
5563 }
5564 }
5565
5566 ceph::signedspan sender_delta_ub{};
5567 s->stamps->got_ping(
5568 m->up_from,
5569 mnow,
5570 m->mono_send_stamp,
5571 m->delta_ub,
5572 &sender_delta_ub);
5573 dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
5574
5575 if (!cct->get_heartbeat_map()->is_healthy()) {
5576 dout(10) << "internal heartbeat not healthy, dropping ping request"
5577 << dendl;
5578 break;
5579 }
5580
5581 Message *r = new MOSDPing(monc->get_fsid(),
5582 curmap->get_epoch(),
5583 MOSDPing::PING_REPLY,
5584 m->ping_stamp,
5585 m->mono_ping_stamp,
5586 mnow,
5587 service.get_up_epoch(),
5588 cct->_conf->osd_heartbeat_min_size,
5589 sender_delta_ub);
5590 con->send_message(r);
5591
5592 if (curmap->is_up(from)) {
5593 if (is_active()) {
5594 ConnectionRef cluster_con = service.get_con_osd_cluster(
5595 from, curmap->get_epoch());
5596 if (cluster_con) {
5597 service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
5598 }
5599 }
5600 } else if (!curmap->exists(from) ||
5601 curmap->get_down_at(from) > m->map_epoch) {
5602 // tell them they have died
5603 Message *r = new MOSDPing(monc->get_fsid(),
5604 curmap->get_epoch(),
5605 MOSDPing::YOU_DIED,
5606 m->ping_stamp,
5607 m->mono_ping_stamp,
5608 mnow,
5609 service.get_up_epoch(),
5610 cct->_conf->osd_heartbeat_min_size);
5611 con->send_message(r);
5612 }
5613 }
5614 break;
5615
5616 case MOSDPing::PING_REPLY:
5617 {
5618 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
5619 if (i != heartbeat_peers.end()) {
5620 auto acked = i->second.ping_history.find(m->ping_stamp);
5621 if (acked != i->second.ping_history.end()) {
5622 int &unacknowledged = acked->second.second;
5623 if (con == i->second.con_back) {
5624 dout(25) << "handle_osd_ping got reply from osd." << from
5625 << " first_tx " << i->second.first_tx
5626 << " last_tx " << i->second.last_tx
5627 << " last_rx_back " << i->second.last_rx_back
5628 << " -> " << now
5629 << " last_rx_front " << i->second.last_rx_front
5630 << dendl;
5631 i->second.last_rx_back = now;
5632 ceph_assert(unacknowledged > 0);
5633 --unacknowledged;
5634 // if there is no front con, set both stamps.
5635 if (i->second.con_front == NULL) {
5636 i->second.last_rx_front = now;
5637 ceph_assert(unacknowledged > 0);
5638 --unacknowledged;
5639 }
5640 } else if (con == i->second.con_front) {
5641 dout(25) << "handle_osd_ping got reply from osd." << from
5642 << " first_tx " << i->second.first_tx
5643 << " last_tx " << i->second.last_tx
5644 << " last_rx_back " << i->second.last_rx_back
5645 << " last_rx_front " << i->second.last_rx_front
5646 << " -> " << now
5647 << dendl;
5648 i->second.last_rx_front = now;
5649 ceph_assert(unacknowledged > 0);
5650 --unacknowledged;
5651 }
5652
5653 if (unacknowledged == 0) {
5654 // succeeded in getting all replies
5655 dout(25) << "handle_osd_ping got all replies from osd." << from
5656 << " , erase pending ping(sent at " << m->ping_stamp << ")"
5657 << " and older pending ping(s)"
5658 << dendl;
5659
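// convert a utime_t difference (seconds, as a double) to rounded
// microseconds; e.g. ROUND_S_TO_USEC(0.0123) == 12300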
5660 #define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
5661 ++i->second.hb_average_count;
5662 uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->ping_stamp);
5663 i->second.hb_total_back += back_pingtime;
5664 if (back_pingtime < i->second.hb_min_back)
5665 i->second.hb_min_back = back_pingtime;
5666 if (back_pingtime > i->second.hb_max_back)
5667 i->second.hb_max_back = back_pingtime;
5668 uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->ping_stamp);
5669 i->second.hb_total_front += front_pingtime;
5670 if (front_pingtime < i->second.hb_min_front)
5671 i->second.hb_min_front = front_pingtime;
5672 if (front_pingtime > i->second.hb_max_front)
5673 i->second.hb_max_front = front_pingtime;
5674
5675 ceph_assert(i->second.hb_interval_start != utime_t());
5676 if (i->second.hb_interval_start == utime_t())
5677 i->second.hb_interval_start = now;
5678 int64_t hb_avg_time_period = 60;
5679 if (cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span")) {
5680 hb_avg_time_period = cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span");
5681 }
5682 if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) {
5683 uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count;
5684 uint32_t back_min = i->second.hb_min_back;
5685 uint32_t back_max = i->second.hb_max_back;
5686 uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count;
5687 uint32_t front_min = i->second.hb_min_front;
5688 uint32_t front_max = i->second.hb_max_front;
5689
5690 // Reset for new interval
5691 i->second.hb_average_count = 0;
5692 i->second.hb_interval_start = now;
5693 i->second.hb_total_back = i->second.hb_max_back = 0;
5694 i->second.hb_min_back = UINT_MAX;
5695 i->second.hb_total_front = i->second.hb_max_front = 0;
5696 i->second.hb_min_front = UINT_MAX;
5697
5698 // Record per-OSD interface ping times,
5699 // based on osd_heartbeat_interval (ignoring that the actual interval is randomly shorter)
5700 if (i->second.hb_back_pingtime.size() == 0) {
5701 ceph_assert(i->second.hb_front_pingtime.size() == 0);
5702 for (unsigned k = 0 ; k < hb_vector_size; ++k) {
5703 i->second.hb_back_pingtime.push_back(back_avg);
5704 i->second.hb_back_min.push_back(back_min);
5705 i->second.hb_back_max.push_back(back_max);
5706 i->second.hb_front_pingtime.push_back(front_avg);
5707 i->second.hb_front_min.push_back(front_min);
5708 i->second.hb_front_max.push_back(front_max);
5709 ++i->second.hb_index;
5710 }
5711 } else {
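// hb_index wraps via a bitmask, which assumes hb_vector_size is a
// power of two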
5712 int index = i->second.hb_index & (hb_vector_size - 1);
5713 i->second.hb_back_pingtime[index] = back_avg;
5714 i->second.hb_back_min[index] = back_min;
5715 i->second.hb_back_max[index] = back_max;
5716 i->second.hb_front_pingtime[index] = front_avg;
5717 i->second.hb_front_min[index] = front_min;
5718 i->second.hb_front_max[index] = front_max;
5719 ++i->second.hb_index;
5720 }
5721
5722 {
5723 std::lock_guard l(service.stat_lock);
5724 service.osd_stat.hb_pingtime[from].last_update = now.sec();
5725 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5726
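// walk the ring newest-to-oldest, publishing rolling averages over the
// last 1, 5, and 15 intervals (load-average style)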
5727 uint32_t total = 0;
5728 uint32_t min = UINT_MAX;
5729 uint32_t max = 0;
5730 uint32_t count = 0;
5731 uint32_t which = 0;
5732 uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
5733 for (int32_t k = size - 1 ; k >= 0; --k) {
5734 ++count;
5735 int index = (i->second.hb_index + k) % size;
5736 total += i->second.hb_back_pingtime[index];
5737 if (i->second.hb_back_min[index] < min)
5738 min = i->second.hb_back_min[index];
5739 if (i->second.hb_back_max[index] > max)
5740 max = i->second.hb_back_max[index];
5741 if (count == 1 || count == 5 || count == 15) {
5742 service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count;
5743 service.osd_stat.hb_pingtime[from].back_min[which] = min;
5744 service.osd_stat.hb_pingtime[from].back_max[which] = max;
5745 which++;
5746 if (count == 15)
5747 break;
5748 }
5749 }
5750
5751 if (i->second.con_front != NULL) {
5752 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5753
5754 total = 0;
5755 min = UINT_MAX;
5756 max = 0;
5757 count = 0;
5758 which = 0;
5759 for (int32_t k = size - 1 ; k >= 0; --k) {
5760 ++count;
5761 int index = (i->second.hb_index + k) % size;
5762 total += i->second.hb_front_pingtime[index];
5763 if (i->second.hb_front_min[index] < min)
5764 min = i->second.hb_front_min[index];
5765 if (i->second.hb_front_max[index] > max)
5766 max = i->second.hb_front_max[index];
5767 if (count == 1 || count == 5 || count == 15) {
5768 service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count;
5769 service.osd_stat.hb_pingtime[from].front_min[which] = min;
5770 service.osd_stat.hb_pingtime[from].front_max[which] = max;
5771 which++;
5772 if (count == 15)
5773 break;
5774 }
5775 }
5776 }
5777 }
5778 } else {
5779 std::lock_guard l(service.stat_lock);
5780 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5781 if (i->second.con_front != NULL)
5782 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5783 }
5784 i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
5785 }
5786
5787 if (i->second.is_healthy(now)) {
5788 // Cancel false reports
5789 auto failure_queue_entry = failure_queue.find(from);
5790 if (failure_queue_entry != failure_queue.end()) {
5791 dout(10) << "handle_osd_ping canceling queued "
5792 << "failure report for osd." << from << dendl;
5793 failure_queue.erase(failure_queue_entry);
5794 }
5795
5796 auto failure_pending_entry = failure_pending.find(from);
5797 if (failure_pending_entry != failure_pending.end()) {
5798 dout(10) << "handle_osd_ping canceling in-flight "
5799 << "failure report for osd." << from << dendl;
5800 send_still_alive(curmap->get_epoch(),
5801 from,
5802 failure_pending_entry->second.second);
5803 failure_pending.erase(failure_pending_entry);
5804 }
5805 }
5806 } else {
5807 // old replies, deprecated by newly sent pings.
5808 dout(10) << "handle_osd_ping no pending ping (sent at " << m->ping_stamp
5809 << ") found; treating it as covered by newly sent pings "
5810 << "and ignoring it"
5811 << dendl;
5812 }
5813 }
5814
5815 if (m->map_epoch &&
5816 curmap->is_up(from)) {
5817 if (is_active()) {
5818 ConnectionRef cluster_con = service.get_con_osd_cluster(
5819 from, curmap->get_epoch());
5820 if (cluster_con) {
5821 service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
5822 }
5823 }
5824 }
5825
5826 s->stamps->got_ping_reply(
5827 mnow,
5828 m->mono_send_stamp,
5829 m->delta_ub);
5830 dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
5831 }
5832 break;
5833
5834 case MOSDPing::YOU_DIED:
5835 dout(10) << "handle_osd_ping " << m->get_source_inst()
5836 << " says i am down in " << m->map_epoch << dendl;
5837 osdmap_subscribe(curmap->get_epoch()+1, false);
5838 break;
5839 }
5840
5841 heartbeat_lock.unlock();
5842 m->put();
5843 }
5844
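// heartbeat_entry: dedicated thread that sends a round of pings, then
// sleeps for a randomized interval.  With
//   wait = 0.5 + ((rand() % 10) / 10.0) * osd_heartbeat_interval
// the sleep falls in [0.5, 0.5 + 0.9 * interval] seconds (e.g. [0.5s, 5.9s]
// for an interval of 6), which keeps different OSDs' pings from
// synchronizing.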
5845 void OSD::heartbeat_entry()
5846 {
5847 std::unique_lock l(heartbeat_lock);
5848 if (is_stopping())
5849 return;
5850 while (!heartbeat_stop) {
5851 heartbeat();
5852
5853 double wait;
5854 if (cct->_conf.get_val<bool>("debug_disable_randomized_ping")) {
5855 wait = (float)cct->_conf->osd_heartbeat_interval;
5856 } else {
5857 wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
5858 }
5859 auto w = ceph::make_timespan(wait);
5860 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
5861 heartbeat_cond.wait_for(l, w);
5862 if (is_stopping())
5863 return;
5864 dout(30) << "heartbeat_entry woke up" << dendl;
5865 }
5866 }
5867
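// heartbeat_check: scan every peer we have pinged and queue a failure
// report for any unhealthy one, i.e. one whose oldest outstanding ping has
// passed its deadline (send time + osd_heartbeat_grace) without replies on
// both the front and back channels.  The failure time recorded is first_tx
// if the peer never replied at all, and otherwise the older of the two
// last-reply stamps.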
5868 void OSD::heartbeat_check()
5869 {
5870 ceph_assert(ceph_mutex_is_locked(heartbeat_lock));
5871 utime_t now = ceph_clock_now();
5872
5873 // check for incoming heartbeats (move me elsewhere?)
5874 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5875 p != heartbeat_peers.end();
5876 ++p) {
5877
5878 if (p->second.first_tx == utime_t()) {
5879 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
5880 << " yet, skipping" << dendl;
5881 continue;
5882 }
5883
5884 dout(25) << "heartbeat_check osd." << p->first
5885 << " first_tx " << p->second.first_tx
5886 << " last_tx " << p->second.last_tx
5887 << " last_rx_back " << p->second.last_rx_back
5888 << " last_rx_front " << p->second.last_rx_front
5889 << dendl;
5890 if (p->second.is_unhealthy(now)) {
5891 utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
5892 if (p->second.last_rx_back == utime_t() ||
5893 p->second.last_rx_front == utime_t()) {
5894 derr << "heartbeat_check: no reply from "
5895 << (p->second.con_front ? p->second.con_front->get_peer_addr() : entity_addr_t())
5896 << " osd." << p->first
5897 << " ever on either front or back, first ping sent "
5898 << p->second.first_tx
5899 << " (oldest deadline " << oldest_deadline << ")"
5900 << dendl;
5901 // fail
5902 failure_queue[p->first] = p->second.first_tx;
5903 } else {
5904 derr << "heartbeat_check: no reply from "
5905 << (p->second.con_front ? p->second.con_front->get_peer_addr() : entity_addr_t())
5906 << " osd." << p->first << " since back " << p->second.last_rx_back
5907 << " front " << p->second.last_rx_front
5908 << " (oldest deadline " << oldest_deadline << ")"
5909 << dendl;
5910 // fail
5911 failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
5912 }
5913 }
5914 }
5915 }
5916
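// heartbeat: one heartbeat cycle, called with heartbeat_lock held.  It
// refreshes the scrub load average and osd_stat (including fullness), then
// sends a MOSDPing::PING on the back (and, when present, the front)
// connection of every peer, recording a deadline of now +
// osd_heartbeat_grace in ping_history for heartbeat_check() to test
// against.  With no peers at all, it periodically asks the mon for a newer
// map instead.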
5917 void OSD::heartbeat()
5918 {
5919 ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock));
5920 dout(30) << "heartbeat" << dendl;
5921
5922 auto load_for_logger = service.get_scrub_services().update_load_average();
5923 if (load_for_logger) {
5924 logger->set(l_osd_loadavg, load_for_logger.value());
5925 }
5926 dout(30) << "heartbeat checking stats" << dendl;
5927
5928 // refresh peer list and osd stats
5929 vector<int> hb_peers;
5930 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5931 p != heartbeat_peers.end();
5932 ++p)
5933 hb_peers.push_back(p->first);
5934
5935 auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
5936 dout(5) << __func__ << " " << new_stat << dendl;
5937 ceph_assert(new_stat.statfs.total);
5938
5939 float pratio;
5940 float ratio = service.compute_adjusted_ratio(new_stat, &pratio);
5941
5942 service.check_full_status(ratio, pratio);
5943
5944 utime_t now = ceph_clock_now();
5945 auto mnow = service.get_mnow();
5946 utime_t deadline = now;
5947 deadline += cct->_conf->osd_heartbeat_grace;
5948
5949 // send heartbeats
5950 for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
5951 i != heartbeat_peers.end();
5952 ++i) {
5953 int peer = i->first;
5954 Session *s = static_cast<Session*>(i->second.con_back->get_priv().get());
5955 if (!s) {
5956 dout(30) << "heartbeat osd." << peer << " has no open con" << dendl;
5957 continue;
5958 }
5959 dout(30) << "heartbeat sending ping to osd." << peer << dendl;
5960
5961 i->second.last_tx = now;
5962 if (i->second.first_tx == utime_t())
5963 i->second.first_tx = now;
5964 i->second.ping_history[now] = make_pair(deadline,
5965 HeartbeatInfo::HEARTBEAT_MAX_CONN);
5966 if (i->second.hb_interval_start == utime_t())
5967 i->second.hb_interval_start = now;
5968
5969 std::optional<ceph::signedspan> delta_ub;
5970 s->stamps->sent_ping(&delta_ub);
5971
5972 i->second.con_back->send_message(
5973 new MOSDPing(monc->get_fsid(),
5974 service.get_osdmap_epoch(),
5975 MOSDPing::PING,
5976 now,
5977 mnow,
5978 mnow,
5979 service.get_up_epoch(),
5980 cct->_conf->osd_heartbeat_min_size,
5981 delta_ub));
5982
5983 if (i->second.con_front)
5984 i->second.con_front->send_message(
5985 new MOSDPing(monc->get_fsid(),
5986 service.get_osdmap_epoch(),
5987 MOSDPing::PING,
5988 now,
5989 mnow,
5990 mnow,
5991 service.get_up_epoch(),
5992 cct->_conf->osd_heartbeat_min_size,
5993 delta_ub));
5994 }
5995
5996 logger->set(l_osd_hb_to, heartbeat_peers.size());
5997
5998 // hmm.. am i all alone?
5999 dout(30) << "heartbeat lonely?" << dendl;
6000 if (heartbeat_peers.empty()) {
6001 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
6002 last_mon_heartbeat = now;
6003 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
6004 osdmap_subscribe(get_osdmap_epoch() + 1, false);
6005 }
6006 }
6007
6008 dout(30) << "heartbeat done" << dendl;
6009 }
6010
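// heartbeat_reset: a heartbeat connection has been reset.  If it belongs to
// a live peer, reopen both channels at the epoch recorded for that peer and
// transplant the existing Session onto the new connections; if the reset
// raced with an osdmap update and no new connection can be obtained, drop
// the peer entirely.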
6011 bool OSD::heartbeat_reset(Connection *con)
6012 {
6013 std::lock_guard l(heartbeat_lock);
6014 auto s = con->get_priv();
6015 dout(20) << __func__ << " con " << con << " s " << s.get() << dendl;
6016 con->set_priv(nullptr);
6017 if (s) {
6018 if (is_stopping()) {
6019 return true;
6020 }
6021 auto session = static_cast<Session*>(s.get());
6022 auto p = heartbeat_peers.find(session->peer);
6023 if (p != heartbeat_peers.end() &&
6024 (p->second.con_back == con ||
6025 p->second.con_front == con)) {
6026 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
6027 << ", reopening" << dendl;
6028 p->second.clear_mark_down(con);
6029 pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
6030 if (newcon.first) {
6031 p->second.con_back = newcon.first.get();
6032 p->second.con_back->set_priv(s);
6033 if (newcon.second) {
6034 p->second.con_front = newcon.second.get();
6035 p->second.con_front->set_priv(s);
6036 }
6037 p->second.ping_history.clear();
6038 } else {
6039 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
6040 << ", raced with osdmap update, closing out peer" << dendl;
6041 heartbeat_peers.erase(p);
6042 }
6043 } else {
6044 dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
6045 }
6046 }
6047 return true;
6048 }
6049
6050
6051
6052 // =========================================
6053
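// tick: periodic housekeeping, run under osd_lock and rescheduled at the
// end of every call.  It prunes the markdown log, refreshes heartbeat
// peers, retries boot while waiting_for_healthy, nudges the mon for a new
// map while booting, drains waiters, and schedules the purged_snaps scrub.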
6054 void OSD::tick()
6055 {
6056 ceph_assert(ceph_mutex_is_locked(osd_lock));
6057 dout(10) << "tick" << dendl;
6058
6059 utime_t now = ceph_clock_now();
6060 // throw out any obsolete markdown log entries
6061 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
6062 while (!osd_markdown_log.empty() &&
6063 osd_markdown_log.front() + grace < now)
6064 osd_markdown_log.pop_front();
6065
6066 if (is_active() || is_waiting_for_healthy()) {
6067 maybe_update_heartbeat_peers();
6068 }
6069
6070 if (is_waiting_for_healthy()) {
6071 start_boot();
6072 }
6073
6074 if (is_waiting_for_healthy() || is_booting()) {
6075 std::lock_guard l(heartbeat_lock);
6076 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
6077 last_mon_heartbeat = now;
6078 dout(1) << __func__ << " checking mon for new map" << dendl;
6079 osdmap_subscribe(get_osdmap_epoch() + 1, false);
6080 }
6081 }
6082
6083 do_waiters();
6084
6085 // scrub purged_snaps periodically (once per osd_scrub_min_interval, randomized below)
6086 {
6087 const utime_t last = superblock.last_purged_snaps_scrub;
6088 utime_t next = last;
6089 next += cct->_conf->osd_scrub_min_interval;
6090 std::mt19937 rng;
6091 // use a seed that is stable for each scrub interval, but varies
6092 // by OSD, to avoid a thundering herd of scrubs.
6093 rng.seed(whoami + superblock.last_purged_snaps_scrub.sec());
6094 double r = (rng() % 1024) / 1024.0;
6095 next +=
6096 cct->_conf->osd_scrub_min_interval *
6097 cct->_conf->osd_scrub_interval_randomize_ratio * r;
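// e.g., with osd_scrub_min_interval = 86400 (one day) and
// osd_scrub_interval_randomize_ratio = 0.5, the next scrub lands
// roughly in [last + 1.0 day, last + 1.5 days)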
6098 if (next < ceph_clock_now()) {
6099 dout(20) << __func__ << " last_purged_snaps_scrub " << last
6100 << " next " << next << " ... now" << dendl;
6101 scrub_purged_snaps();
6102 } else {
6103 dout(20) << __func__ << " last_purged_snaps_scrub " << last
6104 << " next " << next << dendl;
6105 }
6106 }
6107
6108 tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
6109 }
6110
6111 void OSD::tick_without_osd_lock()
6112 {
6113 ceph_assert(ceph_mutex_is_locked(tick_timer_lock));
6114 dout(10) << "tick_without_osd_lock" << dendl;
6115
6116 logger->set(l_osd_cached_crc, ceph::buffer::get_cached_crc());
6117 logger->set(l_osd_cached_crc_adjusted, ceph::buffer::get_cached_crc_adjusted());
6118 logger->set(l_osd_missed_crc, ceph::buffer::get_missed_crc());
6119
6120 // refresh osd stats
6121 struct store_statfs_t stbuf;
6122 osd_alert_list_t alerts;
6123 int r = store->statfs(&stbuf, &alerts);
6124 ceph_assert(r == 0);
6125 service.set_statfs(stbuf, alerts);
6126
6127 // osd_lock is not being held, which means the OSD state
6128 // might change when doing the monitor report
6129 if (is_active() || is_waiting_for_healthy()) {
6130 {
6131 std::lock_guard l{heartbeat_lock};
6132 heartbeat_check();
6133 }
6134 map_lock.lock_shared();
6135 std::lock_guard l(mon_report_lock);
6136
6137 // mon report?
6138 utime_t now = ceph_clock_now();
6139 if (service.need_fullness_update() ||
6140 now - last_mon_report > cct->_conf->osd_mon_report_interval) {
6141 last_mon_report = now;
6142 send_full_update();
6143 send_failures();
6144 }
6145 map_lock.unlock_shared();
6146
6147 epoch_t max_waiting_epoch = 0;
6148 for (auto s : shards) {
6149 max_waiting_epoch = std::max(max_waiting_epoch,
6150 s->get_max_waiting_epoch());
6151 }
6152 if (max_waiting_epoch > get_osdmap()->get_epoch()) {
6153 dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
6154 << ", requesting new map" << dendl;
6155 osdmap_subscribe(superblock.newest_map + 1, false);
6156 }
6157 }
6158
6159 if (is_active()) {
6160 if (!scrub_random_backoff()) {
6161 sched_scrub();
6162 }
6163 service.promote_throttle_recalibrate();
6164 resume_creating_pg();
6165 bool need_send_beacon = false;
6166 const auto now = ceph::coarse_mono_clock::now();
6167 {
6168 // borrow the lec lock to protect last_sent_beacon from changing
6169 std::lock_guard l{min_last_epoch_clean_lock};
6170 const auto elapsed = now - last_sent_beacon;
6171 if (std::chrono::duration_cast<std::chrono::seconds>(elapsed).count() >
6172 cct->_conf->osd_beacon_report_interval) {
6173 need_send_beacon = true;
6174 }
6175 }
6176 if (need_send_beacon) {
6177 send_beacon(now);
6178 }
6179 }
6180
6181 mgrc.update_daemon_health(get_health_metrics());
6182 service.kick_recovery_queue();
6183 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
6184 new C_Tick_WithoutOSDLock(this));
6185 }
6186
6187 // Usage:
6188 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
6189 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
6190 // setomapheader <pool-id> [namespace/]<obj-name> <header>
6191 // getomap <pool> [namespace/]<obj-name>
6192 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
6193 // injectmdataerr [namespace/]<obj-name> [shardid]
6194 // injectdataerr [namespace/]<obj-name> [shardid]
6195 //
6196 // set_recovery_delay [utime]
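//
// These are registered as admin socket commands; an invocation might look
// like (illustrative only, exact syntax depends on how the hook is
// registered):
// ceph daemon osd.0 setomapval 1 myobject mykey myval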
6197 void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
6198 std::string_view command,
6199 const cmdmap_t& cmdmap, ostream &ss)
6200 {
6201 // Test support:
6202 // support changing the omap on a single osd by using the admin socket to
6203 // directly request that the osd make a change.
6204 if (command == "setomapval" || command == "rmomapkey" ||
6205 command == "setomapheader" || command == "getomap" ||
6206 command == "truncobj" || command == "injectmdataerr" ||
6207 command == "injectdataerr"
6208 ) {
6209 pg_t rawpg;
6210 int64_t pool;
6211 OSDMapRef curmap = service->get_osdmap();
6212 int r = -1;
6213
6214 string poolstr;
6215
6216 cmd_getval(cmdmap, "pool", poolstr);
6217 pool = curmap->lookup_pg_pool_name(poolstr);
6218 // if we can't find it by name, then maybe an id was specified
6219 if (pool < 0 && isdigit(poolstr[0]))
6220 pool = atoll(poolstr.c_str());
6221 if (pool < 0) {
6222 ss << "Invalid pool '" << poolstr << "''";
6223 return;
6224 }
6225
6226 string objname, nspace;
6227 cmd_getval(cmdmap, "objname", objname);
6228 std::size_t found = objname.find_first_of('/');
6229 if (found != string::npos) {
6230 nspace = objname.substr(0, found);
6231 objname = objname.substr(found+1);
6232 }
6233 object_locator_t oloc(pool, nspace);
6234 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
6235
6236 if (r < 0) {
6237 ss << "Invalid namespace/objname";
6238 return;
6239 }
6240
6241 int64_t shardid = cmd_getval_or<int64_t>(cmdmap, "shardid", shard_id_t::NO_SHARD);
6242 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
6243 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
6244 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
6245 if (curmap->pg_is_ec(rawpg)) {
6246 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
6247 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
6248 return;
6249 }
6250 }
6251
6252 ObjectStore::Transaction t;
6253
6254 if (command == "setomapval") {
6255 map<string, bufferlist> newattrs;
6256 bufferlist val;
6257 string key, valstr;
6258 cmd_getval(cmdmap, "key", key);
6259 cmd_getval(cmdmap, "val", valstr);
6260
6261 val.append(valstr);
6262 newattrs[key] = val;
6263 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
6264 r = store->queue_transaction(service->meta_ch, std::move(t));
6265 if (r < 0)
6266 ss << "error=" << r;
6267 else
6268 ss << "ok";
6269 } else if (command == "rmomapkey") {
6270 string key;
6271 cmd_getval(cmdmap, "key", key);
6272
6273 t.omap_rmkey(coll_t(pgid), ghobject_t(obj), key);
6274 r = store->queue_transaction(service->meta_ch, std::move(t));
6275 if (r < 0)
6276 ss << "error=" << r;
6277 else
6278 ss << "ok";
6279 } else if (command == "setomapheader") {
6280 bufferlist newheader;
6281 string headerstr;
6282
6283 cmd_getval(cmdmap, "header", headerstr);
6284 newheader.append(headerstr);
6285 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
6286 r = store->queue_transaction(service->meta_ch, std::move(t));
6287 if (r < 0)
6288 ss << "error=" << r;
6289 else
6290 ss << "ok";
6291 } else if (command == "getomap") {
6292 // Debug: output the entire omap
6293 bufferlist hdrbl;
6294 map<string, bufferlist> keyvals;
6295 auto ch = store->open_collection(coll_t(pgid));
6296 if (!ch) {
6297 ss << "unable to open collection for " << pgid;
6298 r = -ENOENT;
6299 } else {
6300 r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
6301 if (r >= 0) {
6302 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
6303 for (map<string, bufferlist>::iterator it = keyvals.begin();
6304 it != keyvals.end(); ++it)
6305 ss << " key=" << (*it).first << " val="
6306 << string((*it).second.c_str(), (*it).second.length());
6307 } else {
6308 ss << "error=" << r;
6309 }
6310 }
6311 } else if (command == "truncobj") {
6312 int64_t trunclen;
6313 cmd_getval(cmdmap, "len", trunclen);
6314 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
6315 r = store->queue_transaction(service->meta_ch, std::move(t));
6316 if (r < 0)
6317 ss << "error=" << r;
6318 else
6319 ss << "ok";
6320 } else if (command == "injectdataerr") {
6321 store->inject_data_error(gobj);
6322 ss << "ok";
6323 } else if (command == "injectmdataerr") {
6324 store->inject_mdata_error(gobj);
6325 ss << "ok";
6326 }
6327 return;
6328 }
6329 if (command == "set_recovery_delay") {
6330 int64_t delay = cmd_getval_or<int64_t>(cmdmap, "utime", 0);
6331 ostringstream oss;
6332 oss << delay;
6333 int r = service->cct->_conf.set_val("osd_recovery_delay_start",
6334 oss.str().c_str());
6335 if (r != 0) {
6336 ss << "set_recovery_delay: error setting "
6337 << "osd_recovery_delay_start to '" << delay << "': error "
6338 << r;
6339 return;
6340 }
6341 service->cct->_conf.apply_changes(nullptr);
6342 ss << "set_recovery_delay: set osd_recovery_delay_start "
6343 << "to " << service->cct->_conf->osd_recovery_delay_start;
6344 return;
6345 }
6346 if (command == "injectfull") {
6347 int64_t count = cmd_getval_or<int64_t>(cmdmap, "count", -1);
6348 string type = cmd_getval_or<string>(cmdmap, "type", "full");
6349 OSDService::s_names state;
6350
6351 if (type == "none" || count == 0) {
6352 type = "none";
6353 count = 0;
6354 }
6355 state = service->get_full_state(type);
6356 if (state == OSDService::s_names::INVALID) {
6357 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6358 return;
6359 }
6360 service->set_injectfull(state, count);
6361 return;
6362 }
6363 ss << "Internal error - command=" << command;
6364 }
6365
6366 // =========================================
6367
6368 void OSD::ms_handle_connect(Connection *con)
6369 {
6370 dout(10) << __func__ << " con " << con << dendl;
6371 if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
6372 std::lock_guard l(osd_lock);
6373 if (is_stopping())
6374 return;
6375 dout(10) << __func__ << " on mon" << dendl;
6376
6377 if (is_preboot()) {
6378 start_boot();
6379 } else if (is_booting()) {
6380 _send_boot(); // resend boot message
6381 } else {
6382 map_lock.lock_shared();
6383 std::lock_guard l2(mon_report_lock);
6384
6385 utime_t now = ceph_clock_now();
6386 last_mon_report = now;
6387
6388 // resend everything, it's a new session
6389 send_full_update();
6390 send_alive();
6391 service.requeue_pg_temp();
6392 service.clear_sent_ready_to_merge();
6393 service.send_pg_temp();
6394 service.send_ready_to_merge();
6395 service.send_pg_created();
6396 requeue_failures();
6397 send_failures();
6398
6399 map_lock.unlock_shared();
6400 if (is_active()) {
6401 send_beacon(ceph::coarse_mono_clock::now());
6402 }
6403 }
6404
6405 // full map requests may happen while active or pre-boot
6406 if (requested_full_first) {
6407 rerequest_full_maps();
6408 }
6409 }
6410 }
6411
6412 void OSD::ms_handle_fast_connect(Connection *con)
6413 {
6414 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6415 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6416 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6417 s = ceph::make_ref<Session>(cct, con);
6418 con->set_priv(s);
6419 dout(10) << " new session (outgoing) " << s << " con=" << s->con
6420 << " addr=" << s->con->get_peer_addr() << dendl;
6421 // we don't connect to clients
6422 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6423 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6424 }
6425 }
6426 }
6427
6428 void OSD::ms_handle_fast_accept(Connection *con)
6429 {
6430 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6431 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6432 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6433 s = ceph::make_ref<Session>(cct, con);
6434 con->set_priv(s);
6435 dout(10) << "new session (incoming) " << s << " con=" << con
6436 << " addr=" << con->get_peer_addr()
6437 << " must have raced with connect" << dendl;
6438 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6439 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6440 }
6441 }
6442 }
6443
6444 bool OSD::ms_handle_reset(Connection *con)
6445 {
6446 auto session = ceph::ref_cast<Session>(con->get_priv());
6447 dout(2) << "ms_handle_reset con " << con << " session " << session.get() << dendl;
6448 if (!session)
6449 return false;
6450 session->wstate.reset(con);
6451 session->con->set_priv(nullptr);
6452 session->con.reset(); // break con <-> session ref cycle
6453 // note that we break session->con *before* the session_handle_reset
6454 // cleanup below. this avoids a race between us and
6455 // PG::add_backoff, Session::check_backoff, etc.
6456 session_handle_reset(session);
6457 return true;
6458 }
6459
6460 bool OSD::ms_handle_refused(Connection *con)
6461 {
6462 if (!cct->_conf->osd_fast_fail_on_connection_refused)
6463 return false;
6464
6465 auto session = ceph::ref_cast<Session>(con->get_priv());
6466 dout(2) << "ms_handle_refused con " << con << " session " << session.get() << dendl;
6467 if (!session)
6468 return false;
6469 int type = con->get_peer_type();
6470 // handle only OSD failures here
6471 if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
6472 OSDMapRef osdmap = get_osdmap();
6473 if (osdmap) {
6474 int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
6475 if (id >= 0 && osdmap->is_up(id)) {
6476 // We bypass the mon's heartbeat grace logic here: a refused connection
6477 // means the peer is not coming back on its own. +1 so we won't hit any boundary case.
6478 monc->send_mon_message(
6479 new MOSDFailure(
6480 monc->get_fsid(),
6481 id,
6482 osdmap->get_addrs(id),
6483 cct->_conf->osd_heartbeat_grace + 1,
6484 osdmap->get_epoch(),
6485 MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
6486 ));
6487 }
6488 }
6489 }
6490 return true;
6491 }
6492
6493 struct CB_OSD_GetVersion {
6494 OSD *osd;
6495 explicit CB_OSD_GetVersion(OSD *o) : osd(o) {}
6496 void operator ()(boost::system::error_code ec, version_t newest,
6497 version_t oldest) {
6498 if (!ec)
6499 osd->_got_mon_epochs(oldest, newest);
6500 }
6501 };
6502
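// Boot sequence: start_boot() asks the mon for the range of osdmap epochs
// it holds (via CB_OSD_GetVersion); _got_mon_epochs() hands them to
// _preboot(), which performs sanity checks and catches up on maps; once the
// preconditions hold, _send_boot() publishes our addresses in an MOSDBoot
// and moves us to STATE_BOOTING.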
6503 void OSD::start_boot()
6504 {
6505 if (!_is_healthy()) {
6506 // if we are not healthy, do not mark ourselves up (yet)
6507 dout(1) << "not healthy; waiting to boot" << dendl;
6508 if (!is_waiting_for_healthy())
6509 start_waiting_for_healthy();
6510 // send pings sooner rather than later
6511 heartbeat_kick();
6512 return;
6513 }
6514 dout(1) << __func__ << dendl;
6515 set_state(STATE_PREBOOT);
6516 dout(10) << "start_boot - have maps " << superblock.oldest_map
6517 << ".." << superblock.newest_map << dendl;
6518 monc->get_version("osdmap", CB_OSD_GetVersion(this));
6519 }
6520
6521 void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
6522 {
6523 std::lock_guard l(osd_lock);
6524 if (is_preboot()) {
6525 _preboot(oldest, newest);
6526 }
6527 }
6528
6529 void OSD::_preboot(epoch_t oldest, epoch_t newest)
6530 {
6531 ceph_assert(is_preboot());
6532 dout(10) << __func__ << " _preboot mon has osdmaps "
6533 << oldest << ".." << newest << dendl;
6534
6535 // ensure our local fullness awareness is accurate
6536 {
6537 std::lock_guard l(heartbeat_lock);
6538 heartbeat();
6539 }
6540
6541 const auto& monmap = monc->monmap;
6542 const auto osdmap = get_osdmap();
6543 // if our map is within recent history, try to add ourselves to the osdmap.
6544 if (osdmap->get_epoch() == 0) {
6545 derr << "waiting for initial osdmap" << dendl;
6546 } else if (osdmap->is_destroyed(whoami)) {
6547 derr << "osdmap says I am destroyed" << dendl;
6548 // provide a small margin so we don't livelock seeing if we
6549 // un-destroyed ourselves.
6550 if (osdmap->get_epoch() > newest - 1) {
6551 exit(0);
6552 }
6553 } else if (osdmap->is_noup(whoami)) {
6554 derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
6555 } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
6556 derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
6557 << dendl;
6558 } else if (service.need_fullness_update()) {
6559 derr << "osdmap fullness state needs update" << dendl;
6560 send_full_update();
6561 } else if (monmap.min_mon_release >= ceph_release_t::octopus &&
6562 superblock.purged_snaps_last < superblock.current_epoch) {
6563 dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
6564 << " < newest_map " << superblock.current_epoch << dendl;
6565 _get_purged_snaps();
6566 } else if (osdmap->get_epoch() >= oldest - 1 &&
6567 osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {
6568
6569 // wait for pgs to fully catch up in a different thread, since
6570 // this thread might be required for splitting and merging PGs to
6571 // make progress.
6572 boot_finisher.queue(
6573 new LambdaContext(
6574 [this](int r) {
6575 std::unique_lock l(osd_lock);
6576 if (is_preboot()) {
6577 dout(10) << __func__ << " waiting for peering work to drain"
6578 << dendl;
6579 l.unlock();
6580 for (auto shard : shards) {
6581 shard->wait_min_pg_epoch(get_osdmap_epoch());
6582 }
6583 l.lock();
6584 }
6585 if (is_preboot()) {
6586 _send_boot();
6587 }
6588 }));
6589 return;
6590 }
6591
6592 // get all the latest maps
6593 if (osdmap->get_epoch() + 1 >= oldest)
6594 osdmap_subscribe(osdmap->get_epoch() + 1, false);
6595 else
6596 osdmap_subscribe(oldest - 1, true);
6597 }
6598
6599 void OSD::_get_purged_snaps()
6600 {
6601 // NOTE: this is a naive, stateless implementation. It may send multiple
6602 // overlapping requests to the mon, which will be somewhat inefficient, but
6603 // it should be reliable.
6604 dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
6605 << ", newest_map " << superblock.current_epoch << dendl;
6606 MMonGetPurgedSnaps *m = new MMonGetPurgedSnaps(
6607 superblock.purged_snaps_last + 1,
6608 superblock.current_epoch + 1);
6609 monc->send_mon_message(m);
6610 }
6611
6612 void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m)
6613 {
6614 dout(10) << __func__ << " " << *m << dendl;
6615 ObjectStore::Transaction t;
6616 if (!is_preboot() ||
6617 m->last < superblock.purged_snaps_last) {
6618 goto out;
6619 }
6620 SnapMapper::record_purged_snaps(cct, store.get(), service.meta_ch,
6621 make_purged_snaps_oid(), &t,
6622 m->purged_snaps);
6623 superblock.purged_snaps_last = m->last;
6624 write_superblock(t);
6625 store->queue_transaction(
6626 service.meta_ch,
6627 std::move(t));
6628 service.publish_superblock(superblock);
6629 if (m->last < superblock.current_epoch) {
6630 _get_purged_snaps();
6631 } else {
6632 start_boot();
6633 }
6634 out:
6635 m->put();
6636 }
6637
6638 void OSD::send_full_update()
6639 {
6640 if (!service.need_fullness_update())
6641 return;
6642 unsigned state = 0;
6643 if (service.is_full()) {
6644 state = CEPH_OSD_FULL;
6645 } else if (service.is_backfillfull()) {
6646 state = CEPH_OSD_BACKFILLFULL;
6647 } else if (service.is_nearfull()) {
6648 state = CEPH_OSD_NEARFULL;
6649 }
6650 set<string> s;
6651 OSDMap::calc_state_set(state, s);
6652 dout(10) << __func__ << " want state " << s << dendl;
6653 monc->send_mon_message(new MOSDFull(get_osdmap_epoch(), state));
6654 }
6655
6656 void OSD::start_waiting_for_healthy()
6657 {
6658 dout(1) << "start_waiting_for_healthy" << dendl;
6659 set_state(STATE_WAITING_FOR_HEALTHY);
6660 last_heartbeat_resample = utime_t();
6661
6662 // subscribe to osdmap updates, in case our peers really are known to be dead
6663 osdmap_subscribe(get_osdmap_epoch() + 1, false);
6664 }
6665
6666 bool OSD::_is_healthy()
6667 {
6668 if (!cct->get_heartbeat_map()->is_healthy()) {
6669 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
6670 return false;
6671 }
6672
6673 if (is_waiting_for_healthy()) {
6674 utime_t now = ceph_clock_now();
6675 if (osd_markdown_log.empty()) {
6676 dout(5) << __func__ << " force returning true since last markdown"
6677 << " was more than " << cct->_conf->osd_max_markdown_period
6678 << "s ago" << dendl;
6679 return true;
6680 }
6681 std::lock_guard l(heartbeat_lock);
6682 int num = 0, up = 0;
6683 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
6684 p != heartbeat_peers.end();
6685 ++p) {
6686 if (p->second.is_healthy(now))
6687 ++up;
6688 ++num;
6689 }
6690 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
6691 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
6692 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
6693 return false;
6694 }
6695 }
6696
6697 return true;
6698 }
6699
6700 void OSD::_send_boot()
6701 {
6702 dout(10) << "_send_boot" << dendl;
6703 Connection *local_connection =
6704 cluster_messenger->get_loopback_connection().get();
6705 entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
6706 entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
6707 entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
6708 entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();
6709
6710 dout(20) << " initial client_addrs " << client_addrs
6711 << ", cluster_addrs " << cluster_addrs
6712 << ", hb_back_addrs " << hb_back_addrs
6713 << ", hb_front_addrs " << hb_front_addrs
6714 << dendl;
6715 if (cluster_messenger->set_addr_unknowns(client_addrs)) {
6716 dout(10) << " assuming cluster_addrs match client_addrs "
6717 << client_addrs << dendl;
6718 cluster_addrs = cluster_messenger->get_myaddrs();
6719 }
6720 if (auto session = local_connection->get_priv(); !session) {
6721 cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
6722 }
6723
6724 local_connection = hb_back_server_messenger->get_loopback_connection().get();
6725 if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
6726 dout(10) << " assuming hb_back_addrs match cluster_addrs "
6727 << cluster_addrs << dendl;
6728 hb_back_addrs = hb_back_server_messenger->get_myaddrs();
6729 }
6730 if (auto session = local_connection->get_priv(); !session) {
6731 hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
6732 }
6733
6734 local_connection = hb_front_server_messenger->get_loopback_connection().get();
6735 if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
6736 dout(10) << " assuming hb_front_addrs match client_addrs "
6737 << client_addrs << dendl;
6738 hb_front_addrs = hb_front_server_messenger->get_myaddrs();
6739 }
6740 if (auto session = local_connection->get_priv(); !session) {
6741 hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
6742 }
6743
6744 // we now know what our front and back addrs will be, and we are
6745 // about to tell the mon what our metadata (including numa bindings)
6746 // is, so now is a good time!
6747 set_numa_affinity();
6748
6749 MOSDBoot *mboot = new MOSDBoot(
6750 superblock, get_osdmap_epoch(), service.get_boot_epoch(),
6751 hb_back_addrs, hb_front_addrs, cluster_addrs,
6752 CEPH_FEATURES_ALL);
6753 dout(10) << " final client_addrs " << client_addrs
6754 << ", cluster_addrs " << cluster_addrs
6755 << ", hb_back_addrs " << hb_back_addrs
6756 << ", hb_front_addrs " << hb_front_addrs
6757 << dendl;
6758 _collect_metadata(&mboot->metadata);
6759 monc->send_mon_message(mboot);
6760 set_state(STATE_BOOTING);
6761 }
6762
6763 void OSD::_collect_metadata(map<string,string> *pm)
6764 {
6765 // config info
6766 (*pm)["osd_data"] = dev_path;
6767 if (store->get_type() == "filestore") {
6768 // not applicable for bluestore
6769 (*pm)["osd_journal"] = journal_path;
6770 }
6771 (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
6772 (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
6773 (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
6774 (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());
6775
6776 // backend
6777 (*pm)["osd_objectstore"] = store->get_type();
6778 (*pm)["rotational"] = store_is_rotational ? "1" : "0";
6779 (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
6780 (*pm)["default_device_class"] = store->get_default_device_class();
6781 string osdspec_affinity;
6782 int r = store->read_meta("osdspec_affinity", &osdspec_affinity);
6783 if (r < 0 || osdspec_affinity.empty()) {
6784 osdspec_affinity = "";
6785 }
6786 (*pm)["osdspec_affinity"] = osdspec_affinity;
6787 store->collect_metadata(pm);
6788
6789 collect_sys_info(pm, cct);
6790
6791 (*pm)["front_iface"] = pick_iface(
6792 cct,
6793 client_messenger->get_myaddrs().front().get_sockaddr_storage());
6794 (*pm)["back_iface"] = pick_iface(
6795 cct,
6796 cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
6797
6798 // network numa
6799 {
6800 int node = -1;
6801 set<int> nodes;
6802 set<string> unknown;
6803 for (auto nm : { "front_iface", "back_iface" }) {
6804 if (!(*pm)[nm].size()) {
6805 unknown.insert(nm);
6806 continue;
6807 }
6808 int n = -1;
6809 int r = get_iface_numa_node((*pm)[nm], &n);
6810 if (r < 0) {
6811 unknown.insert((*pm)[nm]);
6812 continue;
6813 }
6814 nodes.insert(n);
6815 if (node < 0) {
6816 node = n;
6817 }
6818 }
6819 if (unknown.size()) {
6820 (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
6821 }
6822 if (!nodes.empty()) {
6823 (*pm)["network_numa_nodes"] = stringify(nodes);
6824 }
6825 if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
6826 (*pm)["network_numa_node"] = stringify(node);
6827 }
6828 }
6829
6830 if (numa_node >= 0) {
6831 (*pm)["numa_node"] = stringify(numa_node);
6832 (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
6833 &numa_cpu_set);
6834 }
6835
6836 set<string> devnames;
6837 store->get_devices(&devnames);
6838 map<string,string> errs;
6839 get_device_metadata(devnames, pm, &errs);
6840 for (auto& i : errs) {
6841 dout(1) << __func__ << " " << i.first << ": " << i.second << dendl;
6842 }
6843 dout(10) << __func__ << " " << *pm << dendl;
6844 }
6845
6846 void OSD::queue_want_up_thru(epoch_t want)
6847 {
6848 std::shared_lock map_locker{map_lock};
6849 epoch_t cur = get_osdmap()->get_up_thru(whoami);
6850 std::lock_guard report_locker(mon_report_lock);
6851 if (want > up_thru_wanted) {
6852 dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
6853 << ", currently " << cur
6854 << dendl;
6855 up_thru_wanted = want;
6856 send_alive();
6857 } else {
6858 dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
6859 << ", currently " << cur
6860 << dendl;
6861 }
6862 }
6863
6864 void OSD::send_alive()
6865 {
6866 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
6867 const auto osdmap = get_osdmap();
6868 if (!osdmap->exists(whoami))
6869 return;
6870 epoch_t up_thru = osdmap->get_up_thru(whoami);
6871 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6872 if (up_thru_wanted > up_thru) {
6873 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6874 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6875 }
6876 }
6877
6878 void OSD::request_full_map(epoch_t first, epoch_t last)
6879 {
6880 dout(10) << __func__ << " " << first << ".." << last
6881 << ", previously requested "
6882 << requested_full_first << ".." << requested_full_last << dendl;
6883 ceph_assert(ceph_mutex_is_locked(osd_lock));
6884 ceph_assert(first > 0 && last > 0);
6885 ceph_assert(first <= last);
6886 ceph_assert(first >= requested_full_first); // we shouldn't ever ask for older maps
6887 if (requested_full_first == 0) {
6888 // first request
6889 requested_full_first = first;
6890 requested_full_last = last;
6891 } else if (last <= requested_full_last) {
6892 // dup
6893 return;
6894 } else {
6895 // additional request
6896 first = requested_full_last + 1;
6897 requested_full_last = last;
6898 }
6899 MMonGetOSDMap *req = new MMonGetOSDMap;
6900 req->request_full(first, last);
6901 monc->send_mon_message(req);
6902 }
6903
6904 void OSD::got_full_map(epoch_t e)
6905 {
6906 ceph_assert(requested_full_first <= requested_full_last);
6907 ceph_assert(ceph_mutex_is_locked(osd_lock));
6908 if (requested_full_first == 0) {
6909 dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
6910 return;
6911 }
6912 if (e < requested_full_first) {
6913 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6914 << ".." << requested_full_last
6915 << ", ignoring" << dendl;
6916 return;
6917 }
6918 if (e >= requested_full_last) {
6919 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6920 << ".." << requested_full_last << ", resetting" << dendl;
6921 requested_full_first = requested_full_last = 0;
6922 return;
6923 }
6924
6925 requested_full_first = e + 1;
6926
6927 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6928 << ".." << requested_full_last
6929 << ", still need more" << dendl;
6930 }
6931
6932 void OSD::requeue_failures()
6933 {
6934 std::lock_guard l(heartbeat_lock);
6935 unsigned old_queue = failure_queue.size();
6936 unsigned old_pending = failure_pending.size();
6937 for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
6938 failure_queue[p->first] = p->second.first;
6939 failure_pending.erase(p++);
6940 }
6941 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6942 << failure_queue.size() << dendl;
6943 }
6944
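// Failure reports move through two stages: failure_queue holds peers whose
// report has not been sent yet, and failure_pending holds reports already
// sent to the mon but not yet reflected in an osdmap.  send_failures()
// promotes entries from the first to the second; send_still_alive() is used
// to retract a pending report when a peer turns out to be healthy.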
6945 void OSD::send_failures()
6946 {
6947 ceph_assert(ceph_mutex_is_locked(map_lock));
6948 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
6949 std::lock_guard l(heartbeat_lock);
6950 utime_t now = ceph_clock_now();
6951 const auto osdmap = get_osdmap();
6952 while (!failure_queue.empty()) {
6953 int osd = failure_queue.begin()->first;
6954 if (!failure_pending.count(osd)) {
6955 int failed_for = (int)(double)(now - failure_queue.begin()->second);
6956 monc->send_mon_message(
6957 new MOSDFailure(
6958 monc->get_fsid(),
6959 osd,
6960 osdmap->get_addrs(osd),
6961 failed_for,
6962 osdmap->get_epoch()));
6963 failure_pending[osd] = make_pair(failure_queue.begin()->second,
6964 osdmap->get_addrs(osd));
6965 }
6966 failure_queue.erase(osd);
6967 }
6968 }
6969
6970 void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
6971 {
6972 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
6973 MOSDFailure::FLAG_ALIVE);
6974 monc->send_mon_message(m);
6975 }
6976
6977 void OSD::cancel_pending_failures()
6978 {
6979 std::lock_guard l(heartbeat_lock);
6980 auto it = failure_pending.begin();
6981 while (it != failure_pending.end()) {
6982 dout(10) << __func__ << " canceling in-flight failure report for osd."
6983 << it->first << dendl;
6984 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
6985 failure_pending.erase(it++);
6986 }
6987 }
6988
6989 void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
6990 {
6991 const auto& monmap = monc->monmap;
6992 // send_beacon() may be called right after we connect, before the monmap
6993 // has been initialized; in that case, don't send anything yet.
6994 if (monmap.epoch > 0 &&
6995 monmap.get_required_features().contains_all(
6996 ceph::features::mon::FEATURE_LUMINOUS)) {
6997 dout(20) << __func__ << " sending" << dendl;
6998 MOSDBeacon* beacon = nullptr;
6999 {
7000 std::lock_guard l{min_last_epoch_clean_lock};
7001 beacon = new MOSDBeacon(get_osdmap_epoch(),
7002 min_last_epoch_clean,
7003 superblock.last_purged_snaps_scrub,
7004 cct->_conf->osd_beacon_report_interval);
7005 beacon->pgs = min_last_epoch_clean_pgs;
7006 last_sent_beacon = now;
7007 }
7008 monc->send_mon_message(beacon);
7009 } else {
7010 dout(20) << __func__ << " not sending" << dendl;
7011 }
7012 }
7013
7014 void OSD::handle_command(MCommand *m)
7015 {
7016 ConnectionRef con = m->get_connection();
7017 auto session = ceph::ref_cast<Session>(con->get_priv());
7018 if (!session) {
7019 con->send_message(new MCommandReply(m, -EACCES));
7020 m->put();
7021 return;
7022 }
7023 if (!session->caps.allow_all()) {
7024 con->send_message(new MCommandReply(m, -EACCES));
7025 m->put();
7026 return;
7027 }
7028 cct->get_admin_socket()->queue_tell_command(m);
7029 m->put();
7030 }
7031
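// unlock_guard is the inverse of std::lock_guard: it unlocks a mutex for
// the duration of a scope and re-acquires it on exit.  A usage sketch:
//
//   std::lock_guard l(some_lock);
//   ...
//   {
//     unlock_guard u(some_lock);   // some_lock released here
//     do_something_blocking();
//   }                              // some_lock re-acquired here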
7032 namespace {
7033 class unlock_guard {
7034 ceph::mutex& m;
7035 public:
7036 explicit unlock_guard(ceph::mutex& mutex)
7037 : m(mutex)
7038 {
7039 m.unlock();
7040 }
7041 unlock_guard(unlock_guard&) = delete;
7042 ~unlock_guard() {
7043 m.lock();
7044 }
7045 };
7046 }
7047
7048 void OSD::scrub_purged_snaps()
7049 {
7050 dout(10) << __func__ << dendl;
7051 ceph_assert(ceph_mutex_is_locked(osd_lock));
7052 SnapMapper::Scrubber s(cct, store.get(), service.meta_ch,
7053 make_snapmapper_oid(),
7054 make_purged_snaps_oid());
7055 clog->debug() << "purged_snaps scrub starts";
7056 osd_lock.unlock();
7057 s.run();
7058 if (s.stray.size()) {
7059 clog->debug() << "purged_snaps scrub found " << s.stray.size() << " strays";
7060 } else {
7061 clog->debug() << "purged_snaps scrub ok";
7062 }
7063 set<pair<spg_t,snapid_t>> queued;
7064 for (auto& [pool, snap, hash, shard] : s.stray) {
7065 const pg_pool_t *pi = get_osdmap()->get_pg_pool(pool);
7066 if (!pi) {
7067 dout(20) << __func__ << " pool " << pool << " dne" << dendl;
7068 continue;
7069 }
7070 pg_t pgid(pi->raw_hash_to_pg(hash), pool);
7071 spg_t spgid(pgid, shard);
7072 pair<spg_t,snapid_t> p(spgid, snap);
7073 if (queued.count(p)) {
7074 dout(20) << __func__ << " pg " << spgid << " snap " << snap
7075 << " already queued" << dendl;
7076 continue;
7077 }
7078 PGRef pg = lookup_lock_pg(spgid);
7079 if (!pg) {
7080 dout(20) << __func__ << " pg " << spgid << " not found" << dendl;
7081 continue;
7082 }
7083 queued.insert(p);
7084 dout(10) << __func__ << " requeue pg " << spgid << " " << pg << " snap "
7085 << snap << dendl;
7086 pg->queue_snap_retrim(snap);
7087 pg->unlock();
7088 }
7089 osd_lock.lock();
7090 if (is_stopping()) {
7091 return;
7092 }
7093 dout(10) << __func__ << " done queueing pgs, updating superblock" << dendl;
7094 ObjectStore::Transaction t;
7095 superblock.last_purged_snaps_scrub = ceph_clock_now();
7096 write_superblock(t);
7097 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7098 ceph_assert(tr == 0);
7099 if (is_active()) {
7100 send_beacon(ceph::coarse_mono_clock::now());
7101 }
7102 dout(10) << __func__ << " done" << dendl;
7103 }
7104
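// probe_smart: gather SMART health metrics for every physical device
// backing this OSD (logical dm-* devices are skipped), keyed by each
// device's unique id, and emit them as pretty-printed JSON.  Each probe is
// bounded by osd_smart_report_timeout.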
7105 void OSD::probe_smart(const string& only_devid, ostream& ss)
7106 {
7107 set<string> devnames;
7108 store->get_devices(&devnames);
7109 uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
7110 "osd_smart_report_timeout");
7111
7112 // == typedef std::map<std::string, mValue> mObject;
7113 json_spirit::mObject json_map;
7114
7115 for (auto dev : devnames) {
7116 // smartctl works only on physical devices; filter out any logical device
7117 if (dev.find("dm-") == 0) {
7118 continue;
7119 }
7120
7121 string err;
7122 string devid = get_device_id(dev, &err);
7123 if (devid.size() == 0) {
7124 dout(10) << __func__ << " no unique id for dev " << dev << " ("
7125 << err << "), skipping" << dendl;
7126 continue;
7127 }
7128 if (only_devid.size() && devid != only_devid) {
7129 continue;
7130 }
7131
7132 json_spirit::mValue smart_json;
7133 if (block_device_get_metrics(dev, smart_timeout,
7134 &smart_json)) {
7135 dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
7136 continue;
7137 }
7138 json_map[devid] = smart_json;
7139 }
7140 json_spirit::write(json_map, ss, json_spirit::pretty_print);
7141 }
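// [editor's note] the JSON written by probe_smart() maps each device id to
// the raw metrics blob returned by block_device_get_metrics(). A
// hypothetical, heavily abbreviated example of the output shape (device id
// and values are illustrative only):
//
//   {
//     "ATA_INTEL_SSDSC2BB48_PHDV12345678": {
//       "ata_smart_attributes": { ... },
//       "temperature": { "current": 30 }
//     }
//   }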
7142
7143 bool OSD::heartbeat_dispatch(Message *m)
7144 {
7145 dout(30) << "heartbeat_dispatch " << m << dendl;
7146 switch (m->get_type()) {
7147
7148 case CEPH_MSG_PING:
7149 dout(10) << "ping from " << m->get_source_inst() << dendl;
7150 m->put();
7151 break;
7152
7153 case MSG_OSD_PING:
7154 handle_osd_ping(static_cast<MOSDPing*>(m));
7155 break;
7156
7157 default:
7158 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
7159 m->put();
7160 }
7161
7162 return true;
7163 }
7164
7165 bool OSD::ms_dispatch(Message *m)
7166 {
7167 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
7168 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
7169 service.got_stop_ack();
7170 m->put();
7171 return true;
7172 }
7173
7174 // lock!
7175
7176 osd_lock.lock();
7177 if (is_stopping()) {
7178 osd_lock.unlock();
7179 m->put();
7180 return true;
7181 }
7182
7183 do_waiters();
7184 _dispatch(m);
7185
7186 osd_lock.unlock();
7187
7188 return true;
7189 }
7190
7191 void OSDService::maybe_share_map(
7192 Connection *con,
7193 const OSDMapRef& osdmap,
7194 epoch_t peer_epoch_lb)
7195 {
7196 // NOTE: we assume the caller holds something that keeps the Connection
7197 // itself pinned (e.g., an OpRequest's MessageRef).
7198 auto session = ceph::ref_cast<Session>(con->get_priv());
7199 if (!session) {
7200 return;
7201 }
7202
7203 // assume the peer has the newer of the op's sent_epoch and what
7204 // we think we sent them.
7205 session->sent_epoch_lock.lock();
7206 if (peer_epoch_lb > session->last_sent_epoch) {
7207 dout(10) << __func__ << " con " << con
7208 << " " << con->get_peer_addr()
7209 << " map epoch " << session->last_sent_epoch
7210 << " -> " << peer_epoch_lb << " (as per caller)" << dendl;
7211 session->last_sent_epoch = peer_epoch_lb;
7212 }
7213 epoch_t last_sent_epoch = session->last_sent_epoch;
7214 session->sent_epoch_lock.unlock();
7215
7216 if (osdmap->get_epoch() <= last_sent_epoch) {
7217 return;
7218 }
7219
7220 send_incremental_map(last_sent_epoch, con, osdmap);
7221 last_sent_epoch = osdmap->get_epoch();
7222
7223 session->sent_epoch_lock.lock();
7224 if (session->last_sent_epoch < last_sent_epoch) {
7225 dout(10) << __func__ << " con " << con
7226 << " " << con->get_peer_addr()
7227 << " map epoch " << session->last_sent_epoch
7228 << " -> " << last_sent_epoch << " (shared)" << dendl;
7229 session->last_sent_epoch = last_sent_epoch;
7230 }
7231 session->sent_epoch_lock.unlock();
7232 }
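// [editor's note] a worked example of the epoch bookkeeping in
// maybe_share_map(), with illustrative numbers: if the peer claims epoch
// 100 (peer_epoch_lb == 100), we last recorded sending them epoch 95, and
// our current map is epoch 103, then last_sent_epoch is first raised
// 95 -> 100, send_incremental_map() ships the maps in (100, 103], and
// last_sent_epoch is finally raised to 103, unless a racing thread already
// recorded something newer; that is why the second compare-and-update
// under sent_epoch_lock is needed.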
7233
7234 void OSD::dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRef osdmap)
7235 {
7236 ceph_assert(ceph_mutex_is_locked(session->session_dispatch_lock));
7237
7238 auto i = session->waiting_on_map.begin();
7239 while (i != session->waiting_on_map.end()) {
7240 OpRequestRef op = &(*i);
7241 ceph_assert(ms_can_fast_dispatch(op->get_req()));
7242 auto m = op->get_req<MOSDFastDispatchOp>();
7243 if (m->get_min_epoch() > osdmap->get_epoch()) {
7244 break;
7245 }
7246 session->waiting_on_map.erase(i++);
7247 op->put();
7248
7249 spg_t pgid;
7250 if (m->get_type() == CEPH_MSG_OSD_OP) {
7251 pg_t actual_pgid = osdmap->raw_pg_to_pg(
7252 static_cast<const MOSDOp*>(m)->get_pg());
7253 if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
7254 continue;
7255 }
7256 } else {
7257 pgid = m->get_spg();
7258 }
7259 enqueue_op(pgid, std::move(op), m->get_map_epoch());
7260 }
7261
7262 if (session->waiting_on_map.empty()) {
7263 clear_session_waiting_on_map(session);
7264 } else {
7265 register_session_waiting_on_map(session);
7266 }
7267 }
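// [editor's note] ops whose min_epoch is still ahead of the map we hold
// stay on waiting_on_map (the loop above breaks at the first such op,
// preserving delivery order), and the session is re-registered so dispatch
// is retried once a newer map is consumed.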
7268
7269 void OSD::ms_fast_dispatch(Message *m)
7270 {
7271 auto dispatch_span = tracing::osd::tracer.start_trace(__func__);
7272 FUNCTRACE(cct);
7273 if (service.is_stopping()) {
7274 m->put();
7275 return;
7276 }
7277 // peering event?
7278 switch (m->get_type()) {
7279 case CEPH_MSG_PING:
7280 dout(10) << "ping from " << m->get_source() << dendl;
7281 m->put();
7282 return;
7283 case MSG_OSD_FORCE_RECOVERY:
7284 handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
7285 return;
7286 case MSG_OSD_SCRUB2:
7287 handle_fast_scrub(static_cast<MOSDScrub2*>(m));
7288 return;
7289 case MSG_OSD_PG_CREATE2:
7290 return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
7291 case MSG_OSD_PG_NOTIFY:
7292 return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
7293 case MSG_OSD_PG_INFO:
7294 return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
7295 case MSG_OSD_PG_REMOVE:
7296 return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));
7297 // these are single-pg messages that handle themselves
7298 case MSG_OSD_PG_LOG:
7299 case MSG_OSD_PG_TRIM:
7300 case MSG_OSD_PG_NOTIFY2:
7301 case MSG_OSD_PG_QUERY2:
7302 case MSG_OSD_PG_INFO2:
7303 case MSG_OSD_BACKFILL_RESERVE:
7304 case MSG_OSD_RECOVERY_RESERVE:
7305 case MSG_OSD_PG_LEASE:
7306 case MSG_OSD_PG_LEASE_ACK:
7307 {
7308 MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
7309 if (require_osd_peer(pm)) {
7310 enqueue_peering_evt(
7311 pm->get_spg(),
7312 PGPeeringEventRef(pm->get_event()));
7313 }
7314 pm->put();
7315 return;
7316 }
7317 }
7318
7319 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7320 {
7321 #ifdef WITH_LTTNG
7322 osd_reqid_t reqid = op->get_reqid();
7323 #endif
7324 tracepoint(osd, ms_fast_dispatch, reqid.name._type,
7325 reqid.name._num, reqid.tid, reqid.inc);
7326 }
7327 op->osd_parent_span = tracing::osd::tracer.add_span("op-request-created", dispatch_span);
7328
7329 if (m->trace)
7330 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7331
7332 // note sender epoch, min req's epoch
7333 op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
7334 op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
7335 ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!
7336
7337 service.maybe_inject_dispatch_delay();
7338
7339 if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
7340 m->get_type() != CEPH_MSG_OSD_OP) {
7341 // queue it directly
7342 enqueue_op(
7343 static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
7344 std::move(op),
7345 static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
7346 } else {
7347 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7348 // message that doesn't carry an explicit spg_t); we need to map it
7349 // to an spg_t while preserving delivery order.
7350 auto priv = m->get_connection()->get_priv();
7351 if (auto session = static_cast<Session*>(priv.get()); session) {
7352 std::lock_guard l{session->session_dispatch_lock};
7353 op->get();
7354 session->waiting_on_map.push_back(*op);
7355 OSDMapRef nextmap = service.get_nextmap_reserved();
7356 dispatch_session_waiting(session, nextmap);
7357 service.release_map(nextmap);
7358 }
7359 }
7360 OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
7361 }
7362
7363 int OSD::ms_handle_authentication(Connection *con)
7364 {
7365 int ret = 0;
7366 auto s = ceph::ref_cast<Session>(con->get_priv());
7367 if (!s) {
7368 s = ceph::make_ref<Session>(cct, con);
7369 con->set_priv(s);
7370 s->entity_name = con->get_peer_entity_name();
7371 dout(10) << __func__ << " new session " << s << " con " << s->con
7372 << " entity " << s->entity_name
7373 << " addr " << con->get_peer_addrs() << dendl;
7374 } else {
7375 dout(10) << __func__ << " existing session " << s << " con " << s->con
7376 << " entity " << s->entity_name
7377 << " addr " << con->get_peer_addrs() << dendl;
7378 }
7379
7380 AuthCapsInfo &caps_info = con->get_peer_caps_info();
7381 if (caps_info.allow_all) {
7382 s->caps.set_allow_all();
7383 } else if (caps_info.caps.length() > 0) {
7384 bufferlist::const_iterator p = caps_info.caps.cbegin();
7385 string str;
7386 try {
7387 decode(str, p);
7388 }
7389 catch (ceph::buffer::error& e) {
7390 dout(10) << __func__ << " session " << s << " " << s->entity_name
7391 << " failed to decode caps string" << dendl;
7392 ret = -EACCES;
7393 }
7394 if (!ret) {
7395 bool success = s->caps.parse(str);
7396 if (success) {
7397 dout(10) << __func__ << " session " << s
7398 << " " << s->entity_name
7399 << " has caps " << s->caps << " '" << str << "'" << dendl;
7400 ret = 1;
7401 } else {
7402 dout(10) << __func__ << " session " << s << " " << s->entity_name
7403 << " failed to parse caps '" << str << "'" << dendl;
7404 ret = -EACCES;
7405 }
7406 }
7407 }
7408 return ret;
7409 }
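// [editor's note] the caps string decoded above is the entity's "osd"
// capability as recorded by the monitors; illustrative examples of the two
// paths:
//
//   caps_info.allow_all set (e.g. client.admin)   -> caps.set_allow_all()
//   caps blob "allow rwx pool=rbd" (hypothetical) -> caps.parse() -> ret 1
//
// an undecodable blob or a failed parse returns -EACCES instead.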
7410
7411 void OSD::do_waiters()
7412 {
7413 ceph_assert(ceph_mutex_is_locked(osd_lock));
7414
7415 dout(10) << "do_waiters -- start" << dendl;
7416 while (!finished.empty()) {
7417 OpRequestRef next = finished.front();
7418 finished.pop_front();
7419 dispatch_op(next);
7420 }
7421 dout(10) << "do_waiters -- finish" << dendl;
7422 }
7423
7424 void OSD::dispatch_op(OpRequestRef op)
7425 {
7426 switch (op->get_req()->get_type()) {
7427
7428 case MSG_OSD_PG_CREATE:
7429 handle_pg_create(op);
7430 break;
7431 }
7432 }
7433
7434 void OSD::_dispatch(Message *m)
7435 {
7436 ceph_assert(ceph_mutex_is_locked(osd_lock));
7437 dout(20) << "_dispatch " << m << " " << *m << dendl;
7438
7439 switch (m->get_type()) {
7440 // -- don't need OSDMap --
7441
7442 // map and replication
7443 case CEPH_MSG_OSD_MAP:
7444 handle_osd_map(static_cast<MOSDMap*>(m));
7445 break;
7446 case MSG_MON_GET_PURGED_SNAPS_REPLY:
7447 handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply*>(m));
7448 break;
7449
7450 // osd
7451 case MSG_OSD_SCRUB:
7452 handle_scrub(static_cast<MOSDScrub*>(m));
7453 break;
7454
7455 case MSG_COMMAND:
7456 handle_command(static_cast<MCommand*>(m));
7457 return;
7458
7459 // -- need OSDMap --
7460
7461 case MSG_OSD_PG_CREATE:
7462 {
7463 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7464 if (m->trace)
7465 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7466 // no map? starting up?
7467 if (!get_osdmap()) {
7468 dout(7) << "no OSDMap, not booted" << dendl;
7469 logger->inc(l_osd_waiting_for_map);
7470 waiting_for_osdmap.push_back(op);
7471 op->mark_delayed("no osdmap");
7472 break;
7473 }
7474
7475 // need OSDMap
7476 dispatch_op(op);
7477 }
7478 }
7479 }
7480
7481 // remove me post-nautilus
7482 void OSD::handle_scrub(MOSDScrub *m)
7483 {
7484 dout(10) << "handle_scrub " << *m << dendl;
7485 if (!require_mon_or_mgr_peer(m)) {
7486 m->put();
7487 return;
7488 }
7489 if (m->fsid != monc->get_fsid()) {
7490 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid()
7491 << dendl;
7492 m->put();
7493 return;
7494 }
7495
7496 vector<spg_t> spgs;
7497 _get_pgids(&spgs);
7498
7499 if (!m->scrub_pgs.empty()) {
7500 vector<spg_t> v;
7501 for (auto pgid : m->scrub_pgs) {
7502 spg_t pcand;
7503 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
7504 std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) {
7505 v.push_back(pcand);
7506 }
7507 }
7508 spgs.swap(v);
7509 }
7510
7511 for (auto pgid : spgs) {
7512 enqueue_peering_evt(
7513 pgid,
7514 PGPeeringEventRef(
7515 std::make_shared<PGPeeringEvent>(
7516 get_osdmap_epoch(),
7517 get_osdmap_epoch(),
7518 PeeringState::RequestScrub(m->deep, m->repair))));
7519 }
7520
7521 m->put();
7522 }
7523
7524 void OSD::handle_fast_scrub(MOSDScrub2 *m)
7525 {
7526 dout(10) << __func__ << " " << *m << dendl;
7527 if (!require_mon_or_mgr_peer(m)) {
7528 m->put();
7529 return;
7530 }
7531 if (m->fsid != monc->get_fsid()) {
7532 dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
7533 << dendl;
7534 m->put();
7535 return;
7536 }
7537 for (auto pgid : m->scrub_pgs) {
7538 enqueue_peering_evt(
7539 pgid,
7540 PGPeeringEventRef(
7541 std::make_shared<PGPeeringEvent>(
7542 m->epoch,
7543 m->epoch,
7544 PeeringState::RequestScrub(m->deep, m->repair))));
7545 }
7546 m->put();
7547 }
7548
7549 bool OSD::scrub_random_backoff()
7550 {
7551 bool coin_flip = (rand() / (double)RAND_MAX >=
7552 cct->_conf->osd_scrub_backoff_ratio);
7553 if (!coin_flip) {
7554 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7555 return true;
7556 }
7557 return false;
7558 }
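// [editor's note] a worked example of the backoff math above, assuming
// osd_scrub_backoff_ratio = 0.66 (the shipped default at the time of
// writing): rand()/RAND_MAX lands below 0.66 about two thirds of the time,
// so roughly 66% of ticks back off (return true) and only ~34% fall
// through to sched_scrub().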
7559
7560
7561 void OSD::sched_scrub()
7562 {
7563 auto& scrub_scheduler = service.get_scrub_services();
7564
7565 // fail fast if no resources are available
7566 if (!scrub_scheduler.can_inc_scrubs()) {
7567 dout(20) << __func__ << ": OSD cannot inc scrubs" << dendl;
7568 return;
7569 }
7570
7571 // if there is a PG that is just now trying to reserve scrub replica
7572 // resources, we should wait and not initiate a new scrub
7573 if (scrub_scheduler.is_reserving_now()) {
7574 dout(20) << __func__ << ": scrub resources reservation in progress" << dendl;
7575 return;
7576 }
7577
7578 Scrub::ScrubPreconds env_conditions;
7579
7580 if (service.is_recovery_active() && !cct->_conf->osd_scrub_during_recovery) {
7581 if (!cct->_conf->osd_repair_during_recovery) {
7582 dout(15) << __func__ << ": not scheduling scrubs due to active recovery"
7583 << dendl;
7584 return;
7585 }
7586 dout(10) << __func__
7587 << " will only schedule explicitly requested repair due to active recovery"
7588 << dendl;
7589 env_conditions.allow_requested_repair_only = true;
7590 }
7591
7592 if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
7593 dout(20) << __func__ << " sched_scrub starts" << dendl;
7594 auto all_jobs = scrub_scheduler.list_registered_jobs();
7595 for (const auto& sj : all_jobs) {
7596 dout(20) << "sched_scrub scrub-queue jobs: " << *sj << dendl;
7597 }
7598 }
7599
7600 auto was_started = scrub_scheduler.select_pg_and_scrub(env_conditions);
7601 dout(20) << "sched_scrub done (" << ScrubQueue::attempt_res_text(was_started)
7602 << ")" << dendl;
7603 }
7604
7605 Scrub::schedule_result_t OSDService::initiate_a_scrub(spg_t pgid,
7606 bool allow_requested_repair_only)
7607 {
7608 dout(20) << __func__ << " trying " << pgid << dendl;
7609
7610 // we have a candidate to scrub. We need some PG information to know if scrubbing is
7611 // allowed
7612
7613 PGRef pg = osd->lookup_lock_pg(pgid);
7614 if (!pg) {
7615 // the PG was dequeued in the short timespan between creating the candidates list
7616 // (collect_ripe_jobs()) and here
7617 dout(5) << __func__ << " pg " << pgid << " not found" << dendl;
7618 return Scrub::schedule_result_t::no_such_pg;
7619 }
7620
7621 // This has already started, so go on to the next scrub job
7622 if (pg->is_scrub_queued_or_active()) {
7623 pg->unlock();
7624 dout(20) << __func__ << ": already in progress pgid " << pgid << dendl;
7625 return Scrub::schedule_result_t::already_started;
7626 }
7627 // Skip other kinds of scrubbing if only explicitly requested repairing is allowed
7628 if (allow_requested_repair_only && !pg->m_planned_scrub.must_repair) {
7629 pg->unlock();
7630 dout(10) << __func__ << " skip " << pgid
7631 << " because repairing is not explicitly requested on it" << dendl;
7632 return Scrub::schedule_result_t::preconditions;
7633 }
7634
7635 auto scrub_attempt = pg->sched_scrub();
7636 pg->unlock();
7637 return scrub_attempt;
7638 }
7639
7640 void OSD::resched_all_scrubs()
7641 {
7642 dout(10) << __func__ << ": start" << dendl;
7643 auto all_jobs = service.get_scrub_services().list_registered_jobs();
7644 for (auto& e : all_jobs) {
7645
7646 auto& job = *e;
7647 dout(20) << __func__ << ": examine " << job.pgid << dendl;
7648
7649 PGRef pg = _lookup_lock_pg(job.pgid);
7650 if (!pg)
7651 continue;
7652
7653 if (!pg->m_planned_scrub.must_scrub && !pg->m_planned_scrub.need_auto) {
7654 dout(15) << __func__ << ": reschedule " << job.pgid << dendl;
7655 pg->reschedule_scrub();
7656 }
7657 pg->unlock();
7658 }
7659 dout(10) << __func__ << ": done" << dendl;
7660 }
7661
7662 MPGStats* OSD::collect_pg_stats()
7663 {
7664 dout(15) << __func__ << dendl;
7665 // This implementation unconditionally sends every is_primary PG's
7666 // stats every time we're called. This has equivalent cost to the
7667 // previous implementation's worst case where all PGs are busy and
7668 // their stats are always enqueued for sending.
7669 std::shared_lock l{map_lock};
7670
7671 osd_stat_t cur_stat = service.get_osd_stat();
7672 cur_stat.os_perf_stat = store->get_cur_stats();
7673
7674 auto m = new MPGStats(monc->get_fsid(), get_osdmap_epoch());
7675 m->osd_stat = cur_stat;
7676
7677 std::lock_guard lec{min_last_epoch_clean_lock};
7678 min_last_epoch_clean = get_osdmap_epoch();
7679 min_last_epoch_clean_pgs.clear();
7680
7681 std::set<int64_t> pool_set;
7682 vector<PGRef> pgs;
7683 _get_pgs(&pgs);
7684 for (auto& pg : pgs) {
7685 auto pool = pg->pg_id.pgid.pool();
7686 pool_set.emplace((int64_t)pool);
7687 if (!pg->is_primary()) {
7688 continue;
7689 }
7690 pg->with_pg_stats([&](const pg_stat_t& s, epoch_t lec) {
7691 m->pg_stat[pg->pg_id.pgid] = s;
7692 min_last_epoch_clean = std::min(min_last_epoch_clean, lec);
7693 min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
7694 });
7695 }
7696 store_statfs_t st;
7697 bool per_pool_stats = false;
7698 bool per_pool_omap_stats = false;
7699 for (auto p : pool_set) {
7700 int r = store->pool_statfs(p, &st, &per_pool_omap_stats);
7701 if (r == -ENOTSUP) {
7702 break;
7703 } else {
7704 assert(r >= 0);
7705 m->pool_stat[p] = st;
7706 per_pool_stats = true;
7707 }
7708 }
7709
7710 // indicate whether we are reporting per-pool stats
7711 m->osd_stat.num_osds = 1;
7712 m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;
7713 m->osd_stat.num_per_pool_omap_osds = per_pool_omap_stats ? 1 : 0;
7714
7715 return m;
7716 }
7717
7718 vector<DaemonHealthMetric> OSD::get_health_metrics()
7719 {
7720 vector<DaemonHealthMetric> metrics;
7721 {
7722 utime_t oldest_secs;
7723 const utime_t now = ceph_clock_now();
7724 auto too_old = now;
7725 too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
7726 int slow = 0;
7727 TrackedOpRef oldest_op;
7728 OSDMapRef osdmap = get_osdmap();
7729 // map of slow op counts by slow op event type, for aggregated logging to
7730 // the cluster log.
7731 map<uint8_t, int> slow_op_types;
7732 // map of slow op counts by pool, for reporting the pool with the highest
7733 // slow op count.
7734 map<uint64_t, int> slow_op_pools;
7735 bool log_aggregated_slow_op =
7736 cct->_conf.get_val<bool>("osd_aggregated_slow_ops_logging");
7737 auto count_slow_ops = [&](TrackedOp& op) {
7738 if (op.get_initiated() < too_old) {
7739 stringstream ss;
7740 ss << "slow request " << op.get_desc()
7741 << " initiated "
7742 << op.get_initiated()
7743 << " currently "
7744 << op.state_string();
7745 lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
7746 if (log_aggregated_slow_op) {
7747 if (const OpRequest *req = dynamic_cast<const OpRequest *>(&op)) {
7748 uint8_t op_type = req->state_flag();
7749 auto m = req->get_req<MOSDFastDispatchOp>();
7750 uint64_t poolid = m->get_spg().pgid.m_pool;
7751 slow_op_types[op_type]++;
7752 if (poolid > 0 && poolid <= (uint64_t) osdmap->get_pool_max()) {
7753 slow_op_pools[poolid]++;
7754 }
7755 }
7756 } else {
7757 clog->warn() << ss.str();
7758 }
7759 slow++;
7760 if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
7761 oldest_op = &op;
7762 }
7763 return true;
7764 } else {
7765 return false;
7766 }
7767 };
7768 if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
7769 if (slow) {
7770 derr << __func__ << " reporting " << slow << " slow ops, oldest is "
7771 << oldest_op->get_desc() << dendl;
7772 if (log_aggregated_slow_op &&
7773 slow_op_types.size() > 0) {
7774 stringstream ss;
7775 ss << slow << " slow requests (by type [ ";
7776 for (const auto& [op_type, count] : slow_op_types) {
7777 ss << "'" << OpRequest::get_state_string(op_type)
7778 << "' : " << count
7779 << " ";
7780 }
7781 auto slow_pool_it = std::max_element(slow_op_pools.begin(), slow_op_pools.end(),
7782 [](std::pair<uint64_t, int> p1, std::pair<uint64_t, int> p2) {
7783 return p1.second < p2.second;
7784 });
7785 if (slow_pool_it != slow_op_pools.end() && osdmap->get_pools().find(slow_pool_it->first) != osdmap->get_pools().end()) {
7786 string pool_name = osdmap->get_pool_name(slow_pool_it->first);
7787 ss << "] most affected pool [ '"
7788 << pool_name
7789 << "' : "
7790 << slow_pool_it->second
7791 << " ])";
7792 } else {
7793 ss << "])";
7794 }
7795 lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
7796 clog->warn() << ss.str();
7797 }
7798 }
7799 metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
7800 } else {
7801 // no news is not good news.
7802 metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
7803 }
7804 }
7805 {
7806 std::lock_guard l(pending_creates_lock);
7807 auto n_primaries = pending_creates_from_mon;
7808 for (const auto& create : pending_creates_from_osd) {
7809 if (create.second) {
7810 n_primaries++;
7811 }
7812 }
7813 metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
7814 }
7815 return metrics;
7816 }
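// [editor's note] with osd_aggregated_slow_ops_logging enabled, the code
// above emits one aggregated cluster-log line instead of a warning per op;
// a hypothetical example of its shape (types and counts illustrative):
//
//   11 slow requests (by type [ 'delayed' : 8 'queued for pg' : 3 ] most
//   affected pool [ 'rbd' : 6 ])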
7817
7818 // =====================================================
7819 // MAP
7820
7821 void OSD::wait_for_new_map(OpRequestRef op)
7822 {
7823 // ask?
7824 if (waiting_for_osdmap.empty()) {
7825 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7826 }
7827
7828 logger->inc(l_osd_waiting_for_map);
7829 waiting_for_osdmap.push_back(op);
7830 op->mark_delayed("wait for new map");
7831 }
7832
7833
7834 /** update_map
7835 * assimilate new OSDMap(s). scan pgs, etc.
7836 */
7837
7838 void OSD::note_down_osd(int peer)
7839 {
7840 ceph_assert(ceph_mutex_is_locked(osd_lock));
7841 cluster_messenger->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer));
7842
7843 std::lock_guard l{heartbeat_lock};
7844 failure_queue.erase(peer);
7845 failure_pending.erase(peer);
7846 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7847 if (p != heartbeat_peers.end()) {
7848 p->second.clear_mark_down();
7849 heartbeat_peers.erase(p);
7850 }
7851 }
7852
7853 void OSD::note_up_osd(int peer)
7854 {
7855 heartbeat_set_peers_need_update();
7856 }
7857
7858 struct C_OnMapCommit : public Context {
7859 OSD *osd;
7860 epoch_t first, last;
7861 MOSDMap *msg;
7862 C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
7863 : osd(o), first(f), last(l), msg(m) {}
7864 void finish(int r) override {
7865 osd->_committed_osd_maps(first, last, msg);
7866 msg->put();
7867 }
7868 };
7869
7870 void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7871 {
7872 std::lock_guard l(osdmap_subscribe_lock);
7873 if (latest_subscribed_epoch >= epoch && !force_request)
7874 return;
7875
7876 latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);
7877
7878 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7879 force_request) {
7880 monc->renew_subs();
7881 }
7882 }
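// [editor's note] osdmap_subscribe() is idempotent thanks to
// latest_subscribed_epoch, and sub_want_increment() only moves an existing
// "osdmap" subscription forward. Both call styles appear in
// handle_osd_map() below, roughly:
//
//   osdmap_subscribe(superblock.newest_map + 1, false); // ask for next maps
//   osdmap_subscribe(m->oldest_map - 1, true);          // force a re-request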
7883
7884 void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
7885 {
7886 epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
7887 if (min <= superblock.oldest_map)
7888 return;
7889
7890 int num = 0;
7891 ObjectStore::Transaction t;
7892 for (epoch_t e = superblock.oldest_map; e < min; ++e) {
7893 dout(20) << " removing old osdmap epoch " << e << dendl;
7894 t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
7895 t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
7896 superblock.oldest_map = e + 1;
7897 num++;
7898 if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
7899 service.publish_superblock(superblock);
7900 write_superblock(t);
7901 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7902 ceph_assert(tr == 0);
7903 num = 0;
7904 if (!skip_maps) {
7905 // skip_maps leaves us with a range of old maps if we fail to remove all
7906 // of them before moving superblock.oldest_map forward to the first map
7907 // in the incoming MOSDMap msg. so we should continue removing them in
7908 // this case, even though it means issuing a huge series of delete
7909 // transactions all at once.
7910 break;
7911 }
7912 }
7913 }
7914 if (num > 0) {
7915 service.publish_superblock(superblock);
7916 write_superblock(t);
7917 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7918 ceph_assert(tr == 0);
7919 }
7920 // we should not remove the cached maps
7921 ceph_assert(min <= service.map_cache.cached_key_lower_bound());
7922 }
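// [editor's note] a worked example of the batching in trim_maps(), with
// illustrative numbers: given osd_target_transaction_size = 30 and
// nreceived = 40, a batch is committed once num reaches 40 (the larger of
// the two bounds). In the common !skip_maps case the loop then breaks,
// trimming at most one batch per call; with skip_maps set it keeps
// committing batch after batch, and the trailing `if (num > 0)` flushes
// whatever remains.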
7923
7924 void OSD::handle_osd_map(MOSDMap *m)
7925 {
7926 // wait for pgs to catch up
7927 {
7928 // we extend the map cache pins to accommodate pgs slow to consume maps
7929 // for some period, until we hit the max_lag_factor bound, at which point
7930 // we block here to stop ingesting more maps than they are able to keep
7931 // up with.
7932 epoch_t max_lag = cct->_conf->osd_map_cache_size *
7933 m_osd_pg_epoch_max_lag_factor;
7934 ceph_assert(max_lag > 0);
7935 epoch_t osd_min = 0;
7936 for (auto shard : shards) {
7937 epoch_t min = shard->get_min_pg_epoch();
7938 if (osd_min == 0 || min < osd_min) {
7939 osd_min = min;
7940 }
7941 }
7942 epoch_t osdmap_epoch = get_osdmap_epoch();
7943 if (osd_min > 0 &&
7944 osdmap_epoch > max_lag &&
7945 osdmap_epoch - max_lag > osd_min) {
7946 epoch_t need = osdmap_epoch - max_lag;
7947 dout(10) << __func__ << " waiting for pgs to catch up (need " << need
7948 << " max_lag " << max_lag << ")" << dendl;
7949 for (auto shard : shards) {
7950 epoch_t min = shard->get_min_pg_epoch();
7951 if (need > min) {
7952 dout(10) << __func__ << " waiting for pgs to consume " << need
7953 << " (shard " << shard->shard_id << " min " << min
7954 << ", map cache is " << cct->_conf->osd_map_cache_size
7955 << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
7956 << ")" << dendl;
7957 unlock_guard unlock{osd_lock};
7958 shard->wait_min_pg_epoch(need);
7959 }
7960 }
7961 }
7962 }
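  // [editor's note] a worked example of the lag bound above, with
  // illustrative values: osd_map_cache_size = 50 and
  // m_osd_pg_epoch_max_lag_factor = 2 give max_lag = 100; if our osdmap
  // epoch is 1100 and the slowest shard reports a min PG epoch of 950, we
  // block until every shard's PGs reach epoch 1000 before ingesting more
  // maps.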
7963
7964 ceph_assert(ceph_mutex_is_locked(osd_lock));
7965 map<epoch_t,OSDMapRef> added_maps;
7966 map<epoch_t,bufferlist> added_maps_bl;
7967 if (m->fsid != monc->get_fsid()) {
7968 dout(0) << "handle_osd_map fsid " << m->fsid << " != "
7969 << monc->get_fsid() << dendl;
7970 m->put();
7971 return;
7972 }
7973 if (is_initializing()) {
7974 dout(0) << "ignoring osdmap until we have initialized" << dendl;
7975 m->put();
7976 return;
7977 }
7978
7979 auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
7980 if (session && !(session->entity_name.is_mon() ||
7981 session->entity_name.is_osd())) {
7982 //not enough perms!
7983 dout(10) << "got osd map from Session " << session
7984 << " which we can't take maps from (not a mon or osd)" << dendl;
7985 m->put();
7986 return;
7987 }
7988
7989 // share with the objecter
7990 if (!is_preboot())
7991 service.objecter->handle_osd_map(m);
7992
7993 epoch_t first = m->get_first();
7994 epoch_t last = m->get_last();
7995 dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
7996 << superblock.newest_map
7997 << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
7998 << dendl;
7999
8000 logger->inc(l_osd_map);
8001 logger->inc(l_osd_mape, last - first + 1);
8002 if (first <= superblock.newest_map)
8003 logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
8004 if (service.max_oldest_map < m->oldest_map) {
8005 service.max_oldest_map = m->oldest_map;
8006 ceph_assert(service.max_oldest_map >= superblock.oldest_map);
8007 }
8008
8009 // make sure there is something new, here, before we bother flushing
8010 // the queues and such
8011 if (last <= superblock.newest_map) {
8012 dout(10) << " no new maps here, dropping" << dendl;
8013 m->put();
8014 return;
8015 }
8016
8017 // missing some?
8018 bool skip_maps = false;
8019 if (first > superblock.newest_map + 1) {
8020 dout(10) << "handle_osd_map message skips epochs "
8021 << superblock.newest_map + 1 << ".." << (first-1) << dendl;
8022 if (m->oldest_map <= superblock.newest_map + 1) {
8023 osdmap_subscribe(superblock.newest_map + 1, false);
8024 m->put();
8025 return;
8026 }
8027 // always try to get the full range of maps--as many as we can. this
8028 // 1- is good to have
8029 // 2- is at present the only way to ensure that we get a *full* map as
8030 // the first map!
8031 if (m->oldest_map < first) {
8032 osdmap_subscribe(m->oldest_map - 1, true);
8033 m->put();
8034 return;
8035 }
8036 skip_maps = true;
8037 }
8038
8039 ObjectStore::Transaction t;
8040 uint64_t txn_size = 0;
8041
8042 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> purged_snaps;
8043
8044 // store new maps: queue for disk and put in the osdmap cache
8045 epoch_t start = std::max(superblock.newest_map + 1, first);
8046 for (epoch_t e = start; e <= last; e++) {
8047 if (txn_size >= t.get_num_bytes()) {
8048 derr << __func__ << " transaction size overflowed" << dendl;
8049 ceph_assert(txn_size < t.get_num_bytes());
8050 }
8051 txn_size = t.get_num_bytes();
8052 map<epoch_t,bufferlist>::iterator p;
8053 p = m->maps.find(e);
8054 if (p != m->maps.end()) {
8055 dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
8056 OSDMap *o = new OSDMap;
8057 bufferlist& bl = p->second;
8058
8059 o->decode(bl);
8060
8061 purged_snaps[e] = o->get_new_purged_snaps();
8062
8063 ghobject_t fulloid = get_osdmap_pobject_name(e);
8064 t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
8065 added_maps[e] = add_map(o);
8066 added_maps_bl[e] = bl;
8067 got_full_map(e);
8068 continue;
8069 }
8070
8071 p = m->incremental_maps.find(e);
8072 if (p != m->incremental_maps.end()) {
8073 dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
8074 bufferlist& bl = p->second;
8075 ghobject_t oid = get_inc_osdmap_pobject_name(e);
8076 t.write(coll_t::meta(), oid, 0, bl.length(), bl);
8077
8078 OSDMap *o = new OSDMap;
8079 if (e > 1) {
8080 bufferlist obl;
8081 bool got = get_map_bl(e - 1, obl);
8082 if (!got) {
8083 auto p = added_maps_bl.find(e - 1);
8084 ceph_assert(p != added_maps_bl.end());
8085 obl = p->second;
8086 }
8087 o->decode(obl);
8088 }
8089
8090 OSDMap::Incremental inc;
8091 auto p = bl.cbegin();
8092 inc.decode(p);
8093
8094 if (o->apply_incremental(inc) < 0) {
8095 derr << "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc.fsid << dendl;
8096 ceph_abort_msg("bad fsid");
8097 }
8098
8099 bufferlist fbl;
8100 o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
8101
8102 bool injected_failure = false;
8103 if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
8104 (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
8105 derr << __func__ << " injecting map crc failure" << dendl;
8106 injected_failure = true;
8107 }
8108
8109 if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
8110 dout(2) << "got incremental " << e
8111 << " but failed to encode full with correct crc; requesting"
8112 << dendl;
8113 clog->warn() << "failed to encode map e" << e << " with expected crc";
8114 dout(20) << "my encoded map was:\n";
8115 fbl.hexdump(*_dout);
8116 *_dout << dendl;
8117 delete o;
8118 request_full_map(e, last);
8119 last = e - 1;
8120
8121 // don't continue committing if we failed to encode the first inc map
8122 if (last < start) {
8123 dout(10) << __func__ << " bailing because last < start (" << last << "<" << start << ")" << dendl;
8124 m->put();
8125 return;
8126 }
8127 break;
8128 }
8129 got_full_map(e);
8130 purged_snaps[e] = o->get_new_purged_snaps();
8131
8132 ghobject_t fulloid = get_osdmap_pobject_name(e);
8133 t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
8134 added_maps[e] = add_map(o);
8135 added_maps_bl[e] = fbl;
8136 continue;
8137 }
8138
8139 ceph_abort_msg("MOSDMap lied about what maps it had?");
8140 }
8141
8142 // even if this map isn't from a mon, we may have satisfied our subscription
8143 monc->sub_got("osdmap", last);
8144
8145 if (!m->maps.empty() && requested_full_first) {
8146 dout(10) << __func__ << " still missing full maps " << requested_full_first
8147 << ".." << requested_full_last << dendl;
8148 rerequest_full_maps();
8149 }
8150
8151 if (superblock.oldest_map) {
8152 // make sure we at least keep pace with incoming maps
8153 trim_maps(m->oldest_map, last - first + 1, skip_maps);
8154 pg_num_history.prune(superblock.oldest_map);
8155 }
8156
8157 if (!superblock.oldest_map || skip_maps)
8158 superblock.oldest_map = first;
8159 superblock.newest_map = last;
8160 superblock.current_epoch = last;
8161
8162 // note in the superblock that we were clean thru the prior epoch
8163 epoch_t boot_epoch = service.get_boot_epoch();
8164 if (boot_epoch && boot_epoch >= superblock.mounted) {
8165 superblock.mounted = boot_epoch;
8166 superblock.clean_thru = last;
8167 }
8168
8169 // check for pg_num changes and deleted pools
8170 OSDMapRef lastmap;
8171 for (auto& i : added_maps) {
8172 if (!lastmap) {
8173 if (!(lastmap = service.try_get_map(i.first - 1))) {
8174 dout(10) << __func__ << " can't get previous map " << i.first - 1
8175 << " probably first start of this osd" << dendl;
8176 continue;
8177 }
8178 }
8179 ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
8180 for (auto& j : lastmap->get_pools()) {
8181 if (!i.second->have_pg_pool(j.first)) {
8182 pg_num_history.log_pool_delete(i.first, j.first);
8183 dout(10) << __func__ << " recording final pg_pool_t for pool "
8184 << j.first << dendl;
8185 // this information is needed by _make_pg() if we have to restart before
8186 // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
8187 ghobject_t obj = make_final_pool_info_oid(j.first);
8188 bufferlist bl;
8189 encode(j.second, bl, CEPH_FEATURES_ALL);
8190 string name = lastmap->get_pool_name(j.first);
8191 encode(name, bl);
8192 map<string,string> profile;
8193 if (lastmap->get_pg_pool(j.first)->is_erasure()) {
8194 profile = lastmap->get_erasure_code_profile(
8195 lastmap->get_pg_pool(j.first)->erasure_code_profile);
8196 }
8197 encode(profile, bl);
8198 t.write(coll_t::meta(), obj, 0, bl.length(), bl);
8199 } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
8200 new_pg_num != j.second.get_pg_num()) {
8201 dout(10) << __func__ << " recording pool " << j.first << " pg_num "
8202 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
8203 pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
8204 }
8205 }
8206 for (auto& j : i.second->get_pools()) {
8207 if (!lastmap->have_pg_pool(j.first)) {
8208 dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
8209 << j.second.get_pg_num() << dendl;
8210 pg_num_history.log_pg_num_change(i.first, j.first,
8211 j.second.get_pg_num());
8212 }
8213 }
8214 lastmap = i.second;
8215 }
8216 pg_num_history.epoch = last;
8217 {
8218 bufferlist bl;
8219 ::encode(pg_num_history, bl);
8220 t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
8221 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
8222 }
8223
8224 // record new purged_snaps
8225 if (superblock.purged_snaps_last == start - 1) {
8226 SnapMapper::record_purged_snaps(cct, store.get(), service.meta_ch,
8227 make_purged_snaps_oid(), &t,
8228 purged_snaps);
8229 superblock.purged_snaps_last = last;
8230 } else {
8231 dout(10) << __func__ << " superblock purged_snaps_last is "
8232 << superblock.purged_snaps_last
8233 << ", not recording new purged_snaps" << dendl;
8234 }
8235
8236 // superblock and commit
8237 write_superblock(t);
8238 t.register_on_commit(new C_OnMapCommit(this, start, last, m));
8239 store->queue_transaction(
8240 service.meta_ch,
8241 std::move(t));
8242 service.publish_superblock(superblock);
8243 }
8244
8245 void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
8246 {
8247 dout(10) << __func__ << " " << first << ".." << last << dendl;
8248 if (is_stopping()) {
8249 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8250 return;
8251 }
8252 std::lock_guard l(osd_lock);
8253 if (is_stopping()) {
8254 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8255 return;
8256 }
8257 map_lock.lock();
8258
8259 ceph_assert(first <= last);
8260
8261 bool do_shutdown = false;
8262 bool do_restart = false;
8263 bool network_error = false;
8264 OSDMapRef osdmap = get_osdmap();
8265
8266 // advance through the new maps
8267 for (epoch_t cur = first; cur <= last; cur++) {
8268 dout(10) << " advance to epoch " << cur
8269 << " (<= last " << last
8270 << " <= newest_map " << superblock.newest_map
8271 << ")" << dendl;
8272
8273 OSDMapRef newmap = get_map(cur);
8274 ceph_assert(newmap); // we just cached it above!
8275
8276 // start blocklisting messages sent to peers that go down.
8277 service.pre_publish_map(newmap);
8278
8279 // kill connections to newly down osds
8280 bool waited_for_reservations = false;
8281 set<int> old;
8282 osdmap = get_osdmap();
8283 osdmap->get_all_osds(old);
8284 for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
8285 if (*p != whoami &&
8286 osdmap->is_up(*p) && // in old map
8287 newmap->is_down(*p)) { // but not the new one
8288 if (!waited_for_reservations) {
8289 service.await_reserved_maps();
8290 waited_for_reservations = true;
8291 }
8292 note_down_osd(*p);
8293 } else if (*p != whoami &&
8294 osdmap->is_down(*p) &&
8295 newmap->is_up(*p)) {
8296 note_up_osd(*p);
8297 }
8298 }
8299
8300 if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
8301 dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
8302 << dendl;
8303 if (is_booting()) {
8304 // this captures the case where we sent the boot message while
8305 // NOUP was being set on the mon and our boot request was
8306 // dropped, and then later it is cleared. it imperfectly
8307 // handles the case where our original boot message was not
8308 // dropped and we restart even though we might have booted, but
8309 // that is harmless (boot will just take slightly longer).
8310 do_restart = true;
8311 }
8312 }
8313
8314 osdmap = std::move(newmap);
8315 set_osdmap(osdmap);
8316 epoch_t up_epoch;
8317 epoch_t boot_epoch;
8318 service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
8319 if (!up_epoch &&
8320 osdmap->is_up(whoami) &&
8321 osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
8322 up_epoch = osdmap->get_epoch();
8323 dout(10) << "up_epoch is " << up_epoch << dendl;
8324 if (!boot_epoch) {
8325 boot_epoch = osdmap->get_epoch();
8326 dout(10) << "boot_epoch is " << boot_epoch << dendl;
8327 }
8328 service.set_epochs(&boot_epoch, &up_epoch, NULL);
8329 }
8330 }
8331
8332 epoch_t _bind_epoch = service.get_bind_epoch();
8333 if (osdmap->is_up(whoami) &&
8334 osdmap->get_addrs(whoami).legacy_equals(
8335 client_messenger->get_myaddrs()) &&
8336 _bind_epoch < osdmap->get_up_from(whoami)) {
8337
8338 if (is_booting()) {
8339 dout(1) << "state: booting -> active" << dendl;
8340 set_state(STATE_ACTIVE);
8341 do_restart = false;
8342
8343 // set incarnation so that osd_reqid_t's we generate for our
8344 // objecter requests are unique across restarts.
8345 service.objecter->set_client_incarnation(osdmap->get_epoch());
8346 cancel_pending_failures();
8347 }
8348 }
8349
8350 if (osdmap->get_epoch() > 0 &&
8351 is_active()) {
8352 if (!osdmap->exists(whoami)) {
8353 derr << "map says i do not exist. shutting down." << dendl;
8354 do_shutdown = true; // don't call shutdown() while we have
8355 // everything paused
8356 } else if (osdmap->is_stop(whoami)) {
8357 derr << "map says i am stopped by admin. shutting down." << dendl;
8358 do_shutdown = true;
8359 } else if (!osdmap->is_up(whoami) ||
8360 !osdmap->get_addrs(whoami).legacy_equals(
8361 client_messenger->get_myaddrs()) ||
8362 !osdmap->get_cluster_addrs(whoami).legacy_equals(
8363 cluster_messenger->get_myaddrs()) ||
8364 !osdmap->get_hb_back_addrs(whoami).legacy_equals(
8365 hb_back_server_messenger->get_myaddrs()) ||
8366 !osdmap->get_hb_front_addrs(whoami).legacy_equals(
8367 hb_front_server_messenger->get_myaddrs())) {
8368 if (!osdmap->is_up(whoami)) {
8369 if (service.is_preparing_to_stop() || service.is_stopping()) {
8370 service.got_stop_ack();
8371 } else {
8372 clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
8373 "but it is still running";
8374 clog->debug() << "map e" << osdmap->get_epoch()
8375 << " wrongly marked me down at e"
8376 << osdmap->get_down_at(whoami);
8377 }
8378 if (monc->monmap.min_mon_release >= ceph_release_t::octopus) {
8379 // note that this is best-effort...
8380 monc->send_mon_message(
8381 new MOSDMarkMeDead(
8382 monc->get_fsid(),
8383 whoami,
8384 osdmap->get_epoch()));
8385 }
8386 } else if (!osdmap->get_addrs(whoami).legacy_equals(
8387 client_messenger->get_myaddrs())) {
8388 clog->error() << "map e" << osdmap->get_epoch()
8389 << " had wrong client addr (" << osdmap->get_addrs(whoami)
8390 << " != my " << client_messenger->get_myaddrs() << ")";
8391 } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
8392 cluster_messenger->get_myaddrs())) {
8393 clog->error() << "map e" << osdmap->get_epoch()
8394 << " had wrong cluster addr ("
8395 << osdmap->get_cluster_addrs(whoami)
8396 << " != my " << cluster_messenger->get_myaddrs() << ")";
8397 } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
8398 hb_back_server_messenger->get_myaddrs())) {
8399 clog->error() << "map e" << osdmap->get_epoch()
8400 << " had wrong heartbeat back addr ("
8401 << osdmap->get_hb_back_addrs(whoami)
8402 << " != my " << hb_back_server_messenger->get_myaddrs()
8403 << ")";
8404 } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
8405 hb_front_server_messenger->get_myaddrs())) {
8406 clog->error() << "map e" << osdmap->get_epoch()
8407 << " had wrong heartbeat front addr ("
8408 << osdmap->get_hb_front_addrs(whoami)
8409 << " != my " << hb_front_server_messenger->get_myaddrs()
8410 << ")";
8411 }
8412
8413 if (!service.is_stopping()) {
8414 epoch_t up_epoch = 0;
8415 epoch_t bind_epoch = osdmap->get_epoch();
8416 service.set_epochs(NULL,&up_epoch, &bind_epoch);
8417 do_restart = true;
8418
8419 //add markdown log
8420 utime_t now = ceph_clock_now();
8421 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
8422 osd_markdown_log.push_back(now);
8423 if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
8424 derr << __func__ << " marked down "
8425 << osd_markdown_log.size()
8426 << " > osd_max_markdown_count "
8427 << cct->_conf->osd_max_markdown_count
8428 << " in last " << grace << " seconds, shutting down"
8429 << dendl;
8430 do_restart = false;
8431 do_shutdown = true;
8432 }
8433
8434 start_waiting_for_healthy();
8435
8436 set<int> avoid_ports;
8437 #if defined(__FreeBSD__)
8438 // prevent FreeBSD from grabbing the client_messenger port during
8439 // rebinding; otherwise the cluster_messenger could also end up
8440 // connecting to the same port
8441 client_messenger->get_myaddrs().get_ports(&avoid_ports);
8442 #endif
8443 cluster_messenger->get_myaddrs().get_ports(&avoid_ports);
8444
8445 int r = cluster_messenger->rebind(avoid_ports);
8446 if (r != 0) {
8447 do_shutdown = true; // FIXME: do_restart?
8448 network_error = true;
8449 derr << __func__ << " marked down:"
8450 << " rebind cluster_messenger failed" << dendl;
8451 }
8452
8453 hb_back_server_messenger->mark_down_all();
8454 hb_front_server_messenger->mark_down_all();
8455 hb_front_client_messenger->mark_down_all();
8456 hb_back_client_messenger->mark_down_all();
8457
8458 reset_heartbeat_peers(true);
8459 }
8460 }
8461 } else if (osdmap->get_epoch() > 0 && osdmap->is_stop(whoami)) {
8462 derr << "map says i am stopped by admin. shutting down." << dendl;
8463 do_shutdown = true;
8464 }
8465
8466 map_lock.unlock();
8467
8468 check_osdmap_features();
8469
8470 // yay!
8471 consume_map();
8472
8473 if (is_active() || is_waiting_for_healthy())
8474 maybe_update_heartbeat_peers();
8475
8476 if (is_active()) {
8477 activate_map();
8478 }
8479
8480 if (do_shutdown) {
8481 if (network_error) {
8482 cancel_pending_failures();
8483 }
8484 // trigger shutdown in a different thread
8485 dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
8486 queue_async_signal(SIGINT);
8487 }
8488 else if (m->newest_map && m->newest_map > last) {
8489 dout(10) << " msg says newest map is " << m->newest_map
8490 << ", requesting more" << dendl;
8491 osdmap_subscribe(osdmap->get_epoch()+1, false);
8492 }
8493 else if (is_preboot()) {
8494 if (m->get_source().is_mon())
8495 _preboot(m->oldest_map, m->newest_map);
8496 else
8497 start_boot();
8498 }
8499 else if (do_restart)
8500 start_boot();
8501
8502 }
8503
8504 void OSD::check_osdmap_features()
8505 {
8506 // adjust required feature bits?
8507
8508 // we have to be a bit careful here, because we are accessing the
8509 // Policy structures without taking any lock. in particular, only
8510 // modify integer values that can safely be read by a racing CPU.
8511 // since we are only accessing existing Policy structures at their
8512 // current memory location, and setting or clearing bits in integer
8513 // fields, and we are the only writer, this is not a problem.
8514
8515 const auto osdmap = get_osdmap();
8516 {
8517 Messenger::Policy p = client_messenger->get_default_policy();
8518 uint64_t mask;
8519 uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
8520 if ((p.features_required & mask) != features) {
8521 dout(0) << "crush map has features " << features
8522 << ", adjusting msgr requires for clients" << dendl;
8523 p.features_required = (p.features_required & ~mask) | features;
8524 client_messenger->set_default_policy(p);
8525 }
8526 }
8527 {
8528 Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
8529 uint64_t mask;
8530 uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
8531 if ((p.features_required & mask) != features) {
8532 dout(0) << "crush map has features " << features
8533 << " was " << p.features_required
8534 << ", adjusting msgr requires for mons" << dendl;
8535 p.features_required = (p.features_required & ~mask) | features;
8536 client_messenger->set_policy(entity_name_t::TYPE_MON, p);
8537 }
8538 }
8539 {
8540 Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
8541 uint64_t mask;
8542 uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
8543
8544 if ((p.features_required & mask) != features) {
8545 dout(0) << "crush map has features " << features
8546 << ", adjusting msgr requires for osds" << dendl;
8547 p.features_required = (p.features_required & ~mask) | features;
8548 cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
8549 }
8550
8551 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
8552 dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
8553 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
8554 ObjectStore::Transaction t;
8555 write_superblock(t);
8556 int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
8557 ceph_assert(err == 0);
8558 }
8559 }
8560
8561 if (osdmap->require_osd_release < ceph_release_t::nautilus) {
8562 hb_front_server_messenger->set_require_authorizer(false);
8563 hb_back_server_messenger->set_require_authorizer(false);
8564 } else {
8565 hb_front_server_messenger->set_require_authorizer(true);
8566 hb_back_server_messenger->set_require_authorizer(true);
8567 }
8568
8569 if (osdmap->require_osd_release != last_require_osd_release) {
8570 dout(1) << __func__ << " require_osd_release " << last_require_osd_release
8571 << " -> " << to_string(osdmap->require_osd_release) << dendl;
8572 store->write_meta("require_osd_release",
8573 stringify((int)osdmap->require_osd_release));
8574 last_require_osd_release = osdmap->require_osd_release;
8575 }
8576 }
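// [editor's note] the policy adjustment in check_osdmap_features() is plain
// bit surgery. A worked example with hypothetical 8-bit values: if
// p.features_required is 0b1010, mask is 0b0110, and the map's features are
// 0b0100, then (required & ~mask) | features == (0b1010 & 0b1001) | 0b0100
// == 0b1100: bits outside the mask are preserved, bits inside it are
// replaced wholesale.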
8577
8578 struct C_FinishSplits : public Context {
8579 OSD *osd;
8580 set<PGRef> pgs;
8581 C_FinishSplits(OSD *osd, const set<PGRef> &in)
8582 : osd(osd), pgs(in) {}
8583 void finish(int r) override {
8584 osd->_finish_splits(pgs);
8585 }
8586 };
8587
8588 void OSD::_finish_splits(set<PGRef>& pgs)
8589 {
8590 dout(10) << __func__ << " " << pgs << dendl;
8591 if (is_stopping())
8592 return;
8593 for (set<PGRef>::iterator i = pgs.begin();
8594 i != pgs.end();
8595 ++i) {
8596 PG *pg = i->get();
8597
8598 PeeringCtx rctx;
8599 pg->lock();
8600 dout(10) << __func__ << " " << *pg << dendl;
8601 epoch_t e = pg->get_osdmap_epoch();
8602 pg->handle_initialize(rctx);
8603 pg->queue_null(e, e);
8604 dispatch_context(rctx, pg, service.get_osdmap());
8605 pg->unlock();
8606
8607 unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
8608 shards[shard_index]->register_and_wake_split_child(pg);
8609 }
8610 }
8611
8612 bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
8613 unsigned need)
8614 {
8615 std::lock_guard l(merge_lock);
8616 auto& p = merge_waiters[nextmap->get_epoch()][target];
8617 p[src->pg_id] = src;
8618 dout(10) << __func__ << " added merge_waiter " << src->pg_id
8619 << " for " << target << ", have " << p.size() << "/" << need
8620 << dendl;
8621 return p.size() == need;
8622 }
8623
8624 bool OSD::advance_pg(
8625 epoch_t osd_epoch,
8626 PG *pg,
8627 ThreadPool::TPHandle &handle,
8628 PeeringCtx &rctx)
8629 {
8630 if (osd_epoch <= pg->get_osdmap_epoch()) {
8631 return true;
8632 }
8633 ceph_assert(pg->is_locked());
8634 OSDMapRef lastmap = pg->get_osdmap();
8635 set<PGRef> new_pgs; // any split children
8636 bool ret = true;
8637
8638 unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
8639 lastmap->get_pg_num(pg->pg_id.pool()) : 0;
8640 for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
8641 next_epoch <= osd_epoch;
8642 ++next_epoch) {
8643 OSDMapRef nextmap = service.try_get_map(next_epoch);
8644 if (!nextmap) {
8645 dout(20) << __func__ << " missing map " << next_epoch << dendl;
8646 continue;
8647 }
8648
8649 unsigned new_pg_num =
8650 (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
8651 nextmap->get_pg_num(pg->pg_id.pool()) : 0;
8652 if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
8653 // check for merge
8654 if (nextmap->have_pg_pool(pg->pg_id.pool())) {
8655 spg_t parent;
8656 if (pg->pg_id.is_merge_source(
8657 old_pg_num,
8658 new_pg_num,
8659 &parent)) {
8660 // we are merge source
8661 PGRef spg = pg; // carry a ref
8662 dout(1) << __func__ << " " << pg->pg_id
8663 << " is merge source, target is " << parent
8664 << dendl;
8665 pg->write_if_dirty(rctx);
8666 if (!new_pgs.empty()) {
8667 rctx.transaction.register_on_applied(new C_FinishSplits(this,
8668 new_pgs));
8669 new_pgs.clear();
8670 }
8671 dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
8672 pg->ch->flush();
8673 // release backoffs explicitly, since the on_shutdown path
8674 // aggressively tears down backoff state.
8675 if (pg->is_primary()) {
8676 pg->release_pg_backoffs();
8677 }
8678 pg->on_shutdown();
8679 OSDShard *sdata = pg->osd_shard;
8680 {
8681 std::lock_guard l(sdata->shard_lock);
8682 if (pg->pg_slot) {
8683 sdata->_detach_pg(pg->pg_slot);
8684 // update pg count now since we might not get an osdmap
8685 // any time soon.
8686 if (pg->is_primary())
8687 logger->dec(l_osd_pg_primary);
8688 else if (pg->is_nonprimary())
8689 logger->dec(l_osd_pg_replica); // misnomer
8690 else
8691 logger->dec(l_osd_pg_stray);
8692 }
8693 }
8694 pg->unlock();
8695
8696 set<spg_t> children;
8697 parent.is_split(new_pg_num, old_pg_num, &children);
8698 if (add_merge_waiter(nextmap, parent, pg, children.size())) {
8699 enqueue_peering_evt(
8700 parent,
8701 PGPeeringEventRef(
8702 std::make_shared<PGPeeringEvent>(
8703 nextmap->get_epoch(),
8704 nextmap->get_epoch(),
8705 NullEvt())));
8706 }
8707 ret = false;
8708 goto out;
8709 } else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
8710 // we are merge target
8711 set<spg_t> children;
8712 pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
8713 dout(20) << __func__ << " " << pg->pg_id
8714 << " is merge target, sources are " << children
8715 << dendl;
8716 map<spg_t,PGRef> sources;
8717 {
8718 std::lock_guard l(merge_lock);
8719 auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
8720 unsigned need = children.size();
8721 dout(20) << __func__ << " have " << s.size() << "/"
8722 << need << dendl;
8723 if (s.size() == need) {
8724 sources.swap(s);
8725 merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
8726 if (merge_waiters[nextmap->get_epoch()].empty()) {
8727 merge_waiters.erase(nextmap->get_epoch());
8728 }
8729 }
8730 }
8731 if (!sources.empty()) {
8732 unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
8733 unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
8734 dout(1) << __func__ << " merging " << pg->pg_id << dendl;
8735 pg->merge_from(
8736 sources, rctx, split_bits,
8737 nextmap->get_pg_pool(
8738 pg->pg_id.pool())->last_pg_merge_meta);
8739 pg->pg_slot->waiting_for_merge_epoch = 0;
8740 } else {
8741 dout(20) << __func__ << " not ready to merge yet" << dendl;
8742 pg->write_if_dirty(rctx);
8743 if (!new_pgs.empty()) {
8744 rctx.transaction.register_on_applied(new C_FinishSplits(this,
8745 new_pgs));
8746 new_pgs.clear();
8747 }
8748 dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
8749 pg->unlock();
8750 // kick source(s) to get them ready
8751 for (auto& i : children) {
8752 dout(20) << __func__ << " kicking source " << i << dendl;
8753 enqueue_peering_evt(
8754 i,
8755 PGPeeringEventRef(
8756 std::make_shared<PGPeeringEvent>(
8757 nextmap->get_epoch(),
8758 nextmap->get_epoch(),
8759 NullEvt())));
8760 }
8761 ret = false;
8762 goto out;
8763 }
8764 }
8765 }
8766 }
8767
8768 vector<int> newup, newacting;
8769 int up_primary, acting_primary;
8770 nextmap->pg_to_up_acting_osds(
8771 pg->pg_id.pgid,
8772 &newup, &up_primary,
8773 &newacting, &acting_primary);
8774 pg->handle_advance_map(
8775 nextmap, lastmap, newup, up_primary,
8776 newacting, acting_primary, rctx);
8777
8778 auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
8779 auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
8780 if (oldpool != lastmap->get_pools().end()
8781 && newpool != nextmap->get_pools().end()) {
8782 dout(20) << __func__
8783 << " new pool opts " << newpool->second.opts
8784 << " old pool opts " << oldpool->second.opts
8785 << dendl;
8786
8787 double old_min_interval = 0, new_min_interval = 0;
8788 oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
8789 newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);
8790
8791 double old_max_interval = 0, new_max_interval = 0;
8792 oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
8793 newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);
8794
8795 // Assume that if an interval changes from set to unset, or vice versa,
8796 // the effective config is different. Keep it simple, even if that means
8797 // calling resched_all_scrubs() unnecessarily.
8798 if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
8799 pg->on_info_history_change();
8800 }
8801 }
8802
8803 if (new_pg_num && old_pg_num != new_pg_num) {
8804 // check for split
8805 set<spg_t> children;
8806 if (pg->pg_id.is_split(
8807 old_pg_num,
8808 new_pg_num,
8809 &children)) {
8810 split_pgs(
8811 pg, children, &new_pgs, lastmap, nextmap,
8812 rctx);
8813 }
8814 }
8815
8816 lastmap = nextmap;
8817 old_pg_num = new_pg_num;
8818 handle.reset_tp_timeout();
8819 }
8820 pg->handle_activate_map(rctx);
8821
8822 ret = true;
8823 out:
8824 if (!new_pgs.empty()) {
8825 rctx.transaction.register_on_applied(new C_FinishSplits(this, new_pgs));
8826 }
8827 return ret;
8828 }
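// [editor's note] a worked example of the split/merge arithmetic that
// advance_pg() relies on, using a hypothetical pool 1: growing pg_num
// 8 -> 16 splits pg 1.3 into 1.3 and 1.b (0x3 + 8 == 0xb), so is_split()
// reports {1.b} as the children of 1.3. Shrinking 16 -> 8 reverses this:
// 1.b becomes a merge source (is_merge_source() yields parent 1.3) and 1.3
// the merge target, which waits via add_merge_waiter() until every source
// has checked in before merge_from() runs.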
8829
8830 void OSD::consume_map()
8831 {
8832 ceph_assert(ceph_mutex_is_locked(osd_lock));
8833 auto osdmap = get_osdmap();
8834 dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
8835
8836 /** make sure the cluster is speaking in SORTBITWISE, because we don't
8837 * speak the older sorting version any more. Be careful not to force
8838 * a shutdown if we are merely processing old maps, though.
8839 */
8840 if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
8841 derr << __func__ << " SORTBITWISE flag is not set" << dendl;
8842 ceph_abort();
8843 }
8844
8845 service.pre_publish_map(osdmap);
8846 service.await_reserved_maps();
8847 service.publish_map(osdmap);
8848
8849 // prime splits and merges
8850 set<pair<spg_t,epoch_t>> newly_split; // splits, and when
8851 set<pair<spg_t,epoch_t>> merge_pgs; // merge participants, and when
8852 for (auto& shard : shards) {
8853 shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
8854 }
8855 if (!newly_split.empty()) {
8856 for (auto& shard : shards) {
8857 shard->prime_splits(osdmap, &newly_split);
8858 }
8859 ceph_assert(newly_split.empty());
8860 }
8861
8862 // prune sent_ready_to_merge
8863 service.prune_sent_ready_to_merge(osdmap);
8864
8865 // FIXME, maybe: We could race against an incoming peering message
8866 // that instantiates a merge PG after identify_merges() below and
8867 // never set up its peer to complete the merge. An OSD restart
8868 // would clear it up. This is a hard race to resolve,
8869 // extraordinarily rare (we only merge PGs that are stable and
8870 // clean, so it'd have to be an imported PG to an OSD with a
8871 // slightly stale OSDMap...), so I'm ignoring it for now. We plan to
8872 // replace all of this with seastar-based code soon anyway.
8873 if (!merge_pgs.empty()) {
8874 // mark the pgs we already have, or create new and empty merge
8875 // participants for those we are missing. do this all under the
8876 // shard lock so we don't have to worry about racing pg creates
8877 // via _process.
8878 for (auto& shard : shards) {
8879 shard->prime_merges(osdmap, &merge_pgs);
8880 }
8881 ceph_assert(merge_pgs.empty());
8882 }
8883
8884 service.prune_pg_created();
8885
8886 unsigned pushes_to_free = 0;
8887 for (auto& shard : shards) {
8888 shard->consume_map(osdmap, &pushes_to_free);
8889 }
8890
8891 vector<spg_t> pgids;
8892 _get_pgids(&pgids);
8893
8894 // count (FIXME, probably during seastar rewrite)
8895 int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
8896 vector<PGRef> pgs;
8897 _get_pgs(&pgs);
8898 for (auto& pg : pgs) {
8899 // FIXME (probably during seastar rewrite): this is lockless and
8900 // racy, but we don't want to take pg lock here.
8901 if (pg->is_primary())
8902 num_pg_primary++;
8903 else if (pg->is_nonprimary())
8904 num_pg_replica++; // misnomer
8905 else
8906 num_pg_stray++;
8907 }
8908
8909 {
8910 // FIXME (as part of seastar rewrite): move to OSDShard
8911 std::lock_guard l(pending_creates_lock);
8912 for (auto pg = pending_creates_from_osd.begin();
8913 pg != pending_creates_from_osd.end();) {
8914 if (osdmap->get_pg_acting_role(pg->first, whoami) < 0) {
8915 dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
8916 << "discarding pending_create_from_osd" << dendl;
8917 pg = pending_creates_from_osd.erase(pg);
8918 } else {
8919 ++pg;
8920 }
8921 }
8922 }
8923
8924 service.maybe_inject_dispatch_delay();
8925
8926 dispatch_sessions_waiting_on_map();
8927
8928 service.maybe_inject_dispatch_delay();
8929
8930 service.release_reserved_pushes(pushes_to_free);
8931
8932 // queue null events to push maps down to individual PGs
8933 for (auto pgid : pgids) {
8934 enqueue_peering_evt(
8935 pgid,
8936 PGPeeringEventRef(
8937 std::make_shared<PGPeeringEvent>(
8938 osdmap->get_epoch(),
8939 osdmap->get_epoch(),
8940 NullEvt())));
8941 }
8942 logger->set(l_osd_pg, pgids.size());
8943 logger->set(l_osd_pg_primary, num_pg_primary);
8944 logger->set(l_osd_pg_replica, num_pg_replica);
8945 logger->set(l_osd_pg_stray, num_pg_stray);
8946 }
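// The NullEvt kick used above is the generic way to nudge a PG into
// re-examining the latest published OSDMap; advance_pg() then does the real
// work of walking the PG from its current epoch up to the new one. A minimal
// sketch of the pattern (hypothetical call site; `pgid` and `osdmap` are
// assumed to be in scope):
//
//   epoch_t e = osdmap->get_epoch();
//   enqueue_peering_evt(
//     pgid,
//     PGPeeringEventRef(
//       std::make_shared<PGPeeringEvent>(e, e, NullEvt())));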
8947
8948 void OSD::activate_map()
8949 {
8950 ceph_assert(ceph_mutex_is_locked(osd_lock));
8951 auto osdmap = get_osdmap();
8952
8953 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8954
8955 // norecover?
8956 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8957 if (!service.recovery_is_paused()) {
8958 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8959 service.pause_recovery();
8960 }
8961 } else {
8962 if (service.recovery_is_paused()) {
8963 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8964 service.unpause_recovery();
8965 }
8966 }
8967
8968 service.activate_map();
8969
8970 // process waiters
8971 take_waiters(waiting_for_osdmap);
8972 }
8973
8974 bool OSD::require_mon_peer(const Message *m)
8975 {
8976 if (!m->get_connection()->peer_is_mon()) {
8977 dout(0) << "require_mon_peer received from non-mon "
8978 << m->get_connection()->get_peer_addr()
8979 << " " << *m << dendl;
8980 return false;
8981 }
8982 return true;
8983 }
8984
8985 bool OSD::require_mon_or_mgr_peer(const Message *m)
8986 {
8987 if (!m->get_connection()->peer_is_mon() &&
8988 !m->get_connection()->peer_is_mgr()) {
8989 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8990 << m->get_connection()->get_peer_addr()
8991 << " " << *m << dendl;
8992 return false;
8993 }
8994 return true;
8995 }
8996
8997 bool OSD::require_osd_peer(const Message *m)
8998 {
8999 if (!m->get_connection()->peer_is_osd()) {
9000 dout(0) << "require_osd_peer received from non-osd "
9001 << m->get_connection()->get_peer_addr()
9002 << " " << *m << dendl;
9003 return false;
9004 }
9005 return true;
9006 }
9007
9008 bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
9009 {
9010 epoch_t up_epoch = service.get_up_epoch();
9011 if (epoch < up_epoch) {
9012 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
9013 return false;
9014 }
9015
9016 if (!is_active()) {
9017 dout(7) << "still in boot state, dropping message " << *m << dendl;
9018 return false;
9019 }
9020
9021 return true;
9022 }
9023
9024 bool OSD::require_same_peer_instance(const Message *m, const OSDMapRef& map,
9025 bool is_fast_dispatch)
9026 {
9027 int from = m->get_source().num();
9028
9029 if (map->is_down(from) ||
9030 (map->get_cluster_addrs(from) != m->get_source_addrs())) {
9031 dout(5) << "from dead osd." << from << ", marking down, "
9032 << " msg was " << m->get_source_inst().addr
9033 << " expected "
9034 << (map->is_up(from) ?
9035 map->get_cluster_addrs(from) : entity_addrvec_t())
9036 << dendl;
9037 ConnectionRef con = m->get_connection();
9038 con->mark_down();
9039 if (auto s = ceph::ref_cast<Session>(con->get_priv()); s) {
9040 if (!is_fast_dispatch)
9041 s->session_dispatch_lock.lock();
9042 clear_session_waiting_on_map(s);
9043 con->set_priv(nullptr); // break ref <-> session cycle, if any
9044 s->con.reset();
9045 if (!is_fast_dispatch)
9046 s->session_dispatch_lock.unlock();
9047 }
9048 return false;
9049 }
9050 return true;
9051 }
9052
9053
9054 /*
9055 * require that we have the same (or newer) map, that we are still
9056 * alive, and that the source's peer instance is still valid.
9057 */
9058 bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
9059 bool is_fast_dispatch)
9060 {
9061 const Message *m = op->get_req();
9062 const auto osdmap = get_osdmap();
9063 dout(15) << "require_same_or_newer_map " << epoch
9064 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
9065
9066 ceph_assert(ceph_mutex_is_locked(osd_lock));
9067
9068 // do they have a newer map?
9069 if (epoch > osdmap->get_epoch()) {
9070 dout(7) << "waiting for newer map epoch " << epoch
9071 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
9072 wait_for_new_map(op);
9073 return false;
9074 }
9075
9076 if (!require_self_aliveness(op->get_req(), epoch)) {
9077 return false;
9078 }
9079
9080 // ok, our map is same or newer.. do they still exist?
9081 if (m->get_connection()->get_messenger() == cluster_messenger &&
9082 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
9083 return false;
9084 }
9085
9086 return true;
9087 }
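// Sketch of how these require_* guards compose in a slow-dispatch handler
// (this mirrors handle_pg_create() below; it is not a new code path):
//
//   if (!require_mon_peer(op->get_req()))
//     return;                                   // wrong peer type
//   if (!require_same_or_newer_map(op, m->epoch, false))
//     return;                                   // queued to wait for a map
//   op->mark_started();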
9088
9089
9090
9091
9092
9093 // ----------------------------------------
9094 // pg creation
9095
9096 void OSD::split_pgs(
9097 PG *parent,
9098 const set<spg_t> &childpgids, set<PGRef> *out_pgs,
9099 OSDMapRef curmap,
9100 OSDMapRef nextmap,
9101 PeeringCtx &rctx)
9102 {
9103 unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
9104 parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));
9105
9106 vector<object_stat_sum_t> updated_stats;
9107 parent->start_split_stats(childpgids, &updated_stats);
9108
9109 vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
9110 for (set<spg_t>::const_iterator i = childpgids.begin();
9111 i != childpgids.end();
9112 ++i, ++stat_iter) {
9113 ceph_assert(stat_iter != updated_stats.end());
9114 dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
9115 PG* child = _make_pg(nextmap, *i);
9116 child->lock(true);
9117 out_pgs->insert(child);
9118 child->ch = store->create_new_collection(child->coll);
9119
9120 {
9121 uint32_t shard_index = i->hash_to_shard(shards.size());
9122 assert(NULL != shards[shard_index]);
9123 store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
9124 }
9125
9126 unsigned split_bits = i->get_split_bits(pg_num);
9127 dout(10) << " pg_num is " << pg_num
9128 << ", m_seed " << i->ps()
9129 << ", split_bits is " << split_bits << dendl;
9130 parent->split_colls(
9131 *i,
9132 split_bits,
9133 i->ps(),
9134 &child->get_pool().info,
9135 rctx.transaction);
9136 parent->split_into(
9137 i->pgid,
9138 child,
9139 split_bits);
9140
9141 child->init_collection_pool_opts();
9142
9143 child->finish_split_stats(*stat_iter, rctx.transaction);
9144 child->unlock();
9145 }
9146 ceph_assert(stat_iter != updated_stats.end());
9147 parent->finish_split_stats(*stat_iter, rctx.transaction);
9148 }
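// Sketch of how a split is detected before split_pgs() is invoked (this
// mirrors the advance_pg() logic earlier in this file; old_pg_num and
// new_pg_num come from two consecutive OSDMaps):
//
//   set<spg_t> children;
//   if (pg->pg_id.is_split(old_pg_num, new_pg_num, &children)) {
//     split_pgs(pg, children, &new_pgs, lastmap, nextmap, rctx);
//   }
//
// Each child then claims get_split_bits(pg_num) bits of the hash space, and
// split_colls() moves the matching objects into the child's new collection.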
9149
9150 /*
9151 * holding osd_lock
9152 */
9153 void OSD::handle_pg_create(OpRequestRef op)
9154 {
9155 // NOTE: this can be removed in P release (mimic is the last version to
9156 // send MOSDPGCreate messages).
9157
9158 auto m = op->get_req<MOSDPGCreate>();
9159 ceph_assert(m->get_type() == MSG_OSD_PG_CREATE);
9160
9161 dout(10) << "handle_pg_create " << *m << dendl;
9162
9163 if (!require_mon_peer(op->get_req())) {
9164 return;
9165 }
9166
9167 if (!require_same_or_newer_map(op, m->epoch, false))
9168 return;
9169
9170 op->mark_started();
9171
9172 const auto osdmap = get_osdmap();
9173 map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
9174 for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
9175 p != m->mkpg.end();
9176 ++p, ++ci) {
9177 ceph_assert(ci != m->ctimes.end() && ci->first == p->first);
9178 epoch_t created = p->second.created;
9179 if (p->second.split_bits) // Skip split pgs
9180 continue;
9181 pg_t on = p->first;
9182
9183 if (!osdmap->have_pg_pool(on.pool())) {
9184 dout(20) << "ignoring pg on deleted pool " << on << dendl;
9185 continue;
9186 }
9187
9188 dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;
9189
9190 spg_t pgid;
9191 bool mapped = osdmap->get_primary_shard(on, &pgid);
9192 ceph_assert(mapped);
9193
9194 // is it still ours?
9195 vector<int> up, acting;
9196 int up_primary = -1;
9197 int acting_primary = -1;
9198 osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
9199 int role = osdmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
9200
9201 if (acting_primary != whoami) {
9202 dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
9203 << "), my role=" << role << ", skipping" << dendl;
9204 continue;
9205 }
9206
9207
9208 PastIntervals pi;
9209 pg_history_t history;
9210 build_initial_pg_history(pgid, created, ci->second, &history, &pi);
9211
9212 // The mon won't resend unless the primary changed, so we ignore
9213 // same_interval_since. We'll pass this history with the current
9214 // epoch as the event.
9215 if (history.same_primary_since > m->epoch) {
9216 dout(10) << __func__ << ": got obsolete pg create on pgid "
9217 << pgid << " from epoch " << m->epoch
9218 << ", primary changed in " << history.same_primary_since
9219 << dendl;
9220 continue;
9221 }
9222 enqueue_peering_evt(
9223 pgid,
9224 PGPeeringEventRef(
9225 std::make_shared<PGPeeringEvent>(
9226 osdmap->get_epoch(),
9227 osdmap->get_epoch(),
9228 NullEvt(),
9229 true,
9230 new PGCreateInfo(
9231 pgid,
9232 osdmap->get_epoch(),
9233 history,
9234 pi,
9235 true)
9236 )));
9237 }
9238
9239 {
9240 std::lock_guard l(pending_creates_lock);
9241 if (pending_creates_from_mon == 0) {
9242 last_pg_create_epoch = m->epoch;
9243 }
9244 }
9245
9246 maybe_update_heartbeat_peers();
9247 }
9248
9249
9250 // ----------------------------------------
9251 // peering and recovery
9252
9253 void OSD::dispatch_context(PeeringCtx &ctx, PG *pg, OSDMapRef curmap,
9254 ThreadPool::TPHandle *handle)
9255 {
9256 if (!service.get_osdmap()->is_up(whoami)) {
9257 dout(20) << __func__ << " not up in osdmap" << dendl;
9258 } else if (!is_active()) {
9259 dout(20) << __func__ << " not active" << dendl;
9260 } else {
9261 for (auto& [osd, ls] : ctx.message_map) {
9262 if (!curmap->is_up(osd)) {
9263 dout(20) << __func__ << " skipping down osd." << osd << dendl;
9264 continue;
9265 }
9266 ConnectionRef con = service.get_con_osd_cluster(
9267 osd, curmap->get_epoch());
9268 if (!con) {
9269 dout(20) << __func__ << " skipping osd." << osd << " (NULL con)"
9270 << dendl;
9271 continue;
9272 }
9273 service.maybe_share_map(con.get(), curmap);
9274 for (auto m : ls) {
9275 con->send_message2(m);
9276 }
9277 ls.clear();
9278 }
9279 }
9280 if ((!ctx.transaction.empty() || ctx.transaction.has_contexts()) && pg) {
9281 int tr = store->queue_transaction(
9282 pg->ch,
9283 std::move(ctx.transaction), TrackedOpRef(),
9284 handle);
9285 ceph_assert(tr == 0);
9286 }
9287 }
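// PeeringCtx lifecycle (sketch): callers accumulate outgoing messages in
// ctx.message_map and store updates in ctx.transaction while holding the pg
// lock, then flush both through dispatch_context(). Mirroring
// dequeue_peering_evt() below:
//
//   PeeringCtx rctx;
//   pg->do_peering_event(evt, rctx);   // may queue messages and txn ops
//   dispatch_context(rctx, pg, curmap, &handle);
//   pg->unlock();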
9288
9289 void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
9290 {
9291 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9292 if (!require_mon_peer(m)) {
9293 m->put();
9294 return;
9295 }
9296 for (auto& p : m->pgs) {
9297 spg_t pgid = p.first;
9298 epoch_t created = p.second.first;
9299 utime_t created_stamp = p.second.second;
9300 auto q = m->pg_extra.find(pgid);
9301 if (q == m->pg_extra.end()) {
9302 dout(20) << __func__ << " " << pgid << " e" << created
9303 << "@" << created_stamp
9304 << " (no history or past_intervals)" << dendl;
9305 // pre-octopus ... no pg history. this can be removed in Q release.
9306 enqueue_peering_evt(
9307 pgid,
9308 PGPeeringEventRef(
9309 std::make_shared<PGPeeringEvent>(
9310 m->epoch,
9311 m->epoch,
9312 NullEvt(),
9313 true,
9314 new PGCreateInfo(
9315 pgid,
9316 created,
9317 pg_history_t(created, created_stamp),
9318 PastIntervals(),
9319 true)
9320 )));
9321 } else {
9322 dout(20) << __func__ << " " << pgid << " e" << created
9323 << "@" << created_stamp
9324 << " history " << q->second.first
9325 << " pi " << q->second.second << dendl;
9326 if (!q->second.second.empty() &&
9327 m->epoch < q->second.second.get_bounds().second) {
9328 clog->error() << "got pg_create on " << pgid << " epoch " << m->epoch
9329 << " and unmatched past_intervals " << q->second.second
9330 << " (history " << q->second.first << ")";
9331 } else {
9332 enqueue_peering_evt(
9333 pgid,
9334 PGPeeringEventRef(
9335 std::make_shared<PGPeeringEvent>(
9336 m->epoch,
9337 m->epoch,
9338 NullEvt(),
9339 true,
9340 new PGCreateInfo(
9341 pgid,
9342 m->epoch,
9343 q->second.first,
9344 q->second.second,
9345 true)
9346 )));
9347 }
9348 }
9349 }
9350
9351 {
9352 std::lock_guard l(pending_creates_lock);
9353 if (pending_creates_from_mon == 0) {
9354 last_pg_create_epoch = m->epoch;
9355 }
9356 }
9357
9358 m->put();
9359 }
9360
9361 void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
9362 {
9363 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9364 if (!require_osd_peer(m)) {
9365 m->put();
9366 return;
9367 }
9368 int from = m->get_source().num();
9369 for (auto& p : m->get_pg_list()) {
9370 spg_t pgid(p.info.pgid.pgid, p.to);
9371 enqueue_peering_evt(
9372 pgid,
9373 PGPeeringEventRef(
9374 std::make_shared<PGPeeringEvent>(
9375 p.epoch_sent,
9376 p.query_epoch,
9377 MNotifyRec(
9378 pgid, pg_shard_t(from, p.from),
9379 p,
9380 m->get_connection()->get_features()),
9381 true,
9382 new PGCreateInfo(
9383 pgid,
9384 p.query_epoch,
9385 p.info.history,
9386 p.past_intervals,
9387 false)
9388 )));
9389 }
9390 m->put();
9391 }
9392
9393 void OSD::handle_fast_pg_info(MOSDPGInfo* m)
9394 {
9395 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9396 if (!require_osd_peer(m)) {
9397 m->put();
9398 return;
9399 }
9400 int from = m->get_source().num();
9401 for (auto& p : m->pg_list) {
9402 enqueue_peering_evt(
9403 spg_t(p.info.pgid.pgid, p.to),
9404 PGPeeringEventRef(
9405 std::make_shared<PGPeeringEvent>(
9406 p.epoch_sent, p.query_epoch,
9407 MInfoRec(
9408 pg_shard_t(from, p.from),
9409 p.info,
9410 p.epoch_sent)))
9411 );
9412 }
9413 m->put();
9414 }
9415
9416 void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
9417 {
9418 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9419 if (!require_osd_peer(m)) {
9420 m->put();
9421 return;
9422 }
9423 for (auto& pgid : m->pg_list) {
9424 enqueue_peering_evt(
9425 pgid,
9426 PGPeeringEventRef(
9427 std::make_shared<PGPeeringEvent>(
9428 m->get_epoch(), m->get_epoch(),
9429 PeeringState::DeleteStart())));
9430 }
9431 m->put();
9432 }
9433
9434 void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
9435 {
9436 dout(10) << __func__ << " " << *m << dendl;
9437 if (!require_mon_or_mgr_peer(m)) {
9438 m->put();
9439 return;
9440 }
9441 epoch_t epoch = get_osdmap_epoch();
9442 for (auto pgid : m->forced_pgs) {
9443 if (m->options & OFR_BACKFILL) {
9444 if (m->options & OFR_CANCEL) {
9445 enqueue_peering_evt(
9446 pgid,
9447 PGPeeringEventRef(
9448 std::make_shared<PGPeeringEvent>(
9449 epoch, epoch,
9450 PeeringState::UnsetForceBackfill())));
9451 } else {
9452 enqueue_peering_evt(
9453 pgid,
9454 PGPeeringEventRef(
9455 std::make_shared<PGPeeringEvent>(
9456 epoch, epoch,
9457 PeeringState::SetForceBackfill())));
9458 }
9459 } else if (m->options & OFR_RECOVERY) {
9460 if (m->options & OFR_CANCEL) {
9461 enqueue_peering_evt(
9462 pgid,
9463 PGPeeringEventRef(
9464 std::make_shared<PGPeeringEvent>(
9465 epoch, epoch,
9466 PeeringState::UnsetForceRecovery())));
9467 } else {
9468 enqueue_peering_evt(
9469 pgid,
9470 PGPeeringEventRef(
9471 std::make_shared<PGPeeringEvent>(
9472 epoch, epoch,
9473 PeeringState::SetForceRecovery())));
9474 }
9475 }
9476 }
9477 m->put();
9478 }
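// Summary of the flag combinations handled above (OFR_* bits carried by
// MOSDForceRecovery):
//
//   OFR_BACKFILL               -> SetForceBackfill
//   OFR_BACKFILL | OFR_CANCEL  -> UnsetForceBackfill
//   OFR_RECOVERY               -> SetForceRecovery
//   OFR_RECOVERY | OFR_CANCEL  -> UnsetForceRecovery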
9479
9480 void OSD::handle_pg_query_nopg(const MQuery& q)
9481 {
9482 spg_t pgid = q.pgid;
9483 dout(10) << __func__ << " " << pgid << dendl;
9484
9485 OSDMapRef osdmap = get_osdmap();
9486 if (!osdmap->have_pg_pool(pgid.pool()))
9487 return;
9488
9489 dout(10) << " pg " << pgid << " dne" << dendl;
9490 pg_info_t empty(spg_t(pgid.pgid, q.query.to));
9491 ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
9492 if (con) {
9493 Message *m;
9494 if (q.query.type == pg_query_t::LOG ||
9495 q.query.type == pg_query_t::FULLLOG) {
9496 m = new MOSDPGLog(
9497 q.query.from, q.query.to,
9498 osdmap->get_epoch(), empty,
9499 q.query.epoch_sent);
9500 } else {
9501 pg_notify_t notify{q.query.from, q.query.to,
9502 q.query.epoch_sent,
9503 osdmap->get_epoch(),
9504 empty,
9505 PastIntervals()};
9506 m = new MOSDPGNotify2(spg_t{pgid.pgid, q.query.from},
9507 std::move(notify));
9508 }
9509 service.maybe_share_map(con.get(), osdmap);
9510 con->send_message(m);
9511 }
9512 }
9513
9514 void OSDService::queue_check_readable(spg_t spgid,
9515 epoch_t lpr,
9516 ceph::signedspan delay)
9517 {
9518 if (delay == ceph::signedspan::zero()) {
9519 osd->enqueue_peering_evt(
9520 spgid,
9521 PGPeeringEventRef(
9522 std::make_shared<PGPeeringEvent>(
9523 lpr, lpr,
9524 PeeringState::CheckReadable())));
9525 } else {
9526 mono_timer.add_event(
9527 delay,
9528 [this, spgid, lpr]() {
9529 queue_check_readable(spgid, lpr);
9530 });
9531 }
9532 }
9533
9534
9535 // =========================================================
9536 // RECOVERY
9537
9538 void OSDService::_maybe_queue_recovery() {
9539 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
9540 uint64_t available_pushes;
9541 while (!awaiting_throttle.empty() &&
9542 _recover_now(&available_pushes)) {
9543 uint64_t to_start = std::min(
9544 available_pushes,
9545 cct->_conf->osd_recovery_max_single_start);
9546 _queue_for_recovery(awaiting_throttle.front(), to_start);
9547 awaiting_throttle.pop_front();
9548 dout(10) << __func__ << " starting " << to_start
9549 << ", recovery_ops_reserved " << recovery_ops_reserved
9550 << " -> " << (recovery_ops_reserved + to_start) << dendl;
9551 recovery_ops_reserved += to_start;
9552 }
9553 }
9554
9555 bool OSDService::_recover_now(uint64_t *available_pushes)
9556 {
9557 if (available_pushes)
9558 *available_pushes = 0;
9559
9560 if (ceph_clock_now() < defer_recovery_until) {
9561 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9562 return false;
9563 }
9564
9565 if (recovery_paused) {
9566 dout(15) << __func__ << " paused" << dendl;
9567 return false;
9568 }
9569
9570 uint64_t max = osd->get_recovery_max_active();
9571 if (max <= recovery_ops_active + recovery_ops_reserved) {
9572 dout(15) << __func__ << " active " << recovery_ops_active
9573 << " + reserved " << recovery_ops_reserved
9574 << " >= max " << max << dendl;
9575 return false;
9576 }
9577
9578 if (available_pushes)
9579 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9580
9581 return true;
9582 }
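// Worked example (hypothetical numbers): with get_recovery_max_active() == 3,
// recovery_ops_active == 1 and recovery_ops_reserved == 1, the check passes
// and *available_pushes = 3 - 1 - 1 = 1, so _maybe_queue_recovery() can
// reserve at most one more op (further capped by
// osd_recovery_max_single_start).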
9583
9584 unsigned OSDService::get_target_pg_log_entries() const
9585 {
9586 auto num_pgs = osd->get_num_pgs();
9587 auto target = cct->_conf->osd_target_pg_log_entries_per_osd;
9588 if (num_pgs > 0 && target > 0) {
9589 // target an even spread of our budgeted log entries across all
9590 // PGs. note that while we only get to control the entry count
9591 // for primary PGs, we'll normally be responsible for a mix of
9592 // primary and replica PGs (for the same pool(s) even), so this
9593 // will work out.
9594 return std::max<unsigned>(
9595 std::min<unsigned>(target / num_pgs,
9596 cct->_conf->osd_max_pg_log_entries),
9597 cct->_conf->osd_min_pg_log_entries);
9598 } else {
9599 // fall back to a per-pg value.
9600 return cct->_conf->osd_min_pg_log_entries;
9601 }
9602 }
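// Worked example (hypothetical config values): with
// osd_target_pg_log_entries_per_osd = 300000, 200 PGs on this OSD,
// osd_min_pg_log_entries = 250 and osd_max_pg_log_entries = 10000:
//
//   300000 / 200 = 1500  ->  clamp(1500, 250, 10000) = 1500 entries per PG
//
// With 2000 PGs the per-PG share (150) would be raised to the 250 floor.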
9603
9604 void OSD::do_recovery(
9605 PG *pg, epoch_t queued, uint64_t reserved_pushes,
9606 ThreadPool::TPHandle &handle)
9607 {
9608 uint64_t started = 0;
9609
9610 /*
9611 * When osd_recovery_sleep is set to a value greater than zero, recovery
9612 * ops are scheduled osd_recovery_sleep seconds after the previous
9613 * recovery event's scheduled time. This is done by adding a
9614 * recovery_requeue_callback event, which re-queues the recovery op using
9615 * queue_recovery_after_sleep.
9616 */
9617 float recovery_sleep = get_osd_recovery_sleep();
9618 {
9619 std::lock_guard l(service.sleep_lock);
9620 if (recovery_sleep > 0 && service.recovery_needs_sleep) {
9621 PGRef pgref(pg);
9622 auto recovery_requeue_callback = new LambdaContext([this, pgref, queued, reserved_pushes](int r) {
9623 dout(20) << "do_recovery wake up at "
9624 << ceph_clock_now()
9625 << ", re-queuing recovery" << dendl;
9626 std::lock_guard l(service.sleep_lock);
9627 service.recovery_needs_sleep = false;
9628 service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
9629 });
9630
9631 // The check below is true for the first recovery op, and also when the
9632 // previous recovery op was scheduled in the past. In either case the next
9633 // recovery op is scheduled to run after the sleep period, counted from now.
9634
9635 if (auto now = ceph::real_clock::now();
9636 service.recovery_schedule_time < now) {
9637 service.recovery_schedule_time = now;
9638 }
9639 service.recovery_schedule_time += ceph::make_timespan(recovery_sleep);
9640 service.sleep_timer.add_event_at(service.recovery_schedule_time,
9641 recovery_requeue_callback);
9642 dout(20) << "Recovery event scheduled at "
9643 << service.recovery_schedule_time << dendl;
9644 return;
9645 }
9646 }
9647
9648 {
9649 {
9650 std::lock_guard l(service.sleep_lock);
9651 service.recovery_needs_sleep = true;
9652 }
9653
9654 if (pg->pg_has_reset_since(queued)) {
9655 goto out;
9656 }
9657
9658 dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
9659 #ifdef DEBUG_RECOVERY_OIDS
9660 dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
9661 #endif
9662
9663 bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
9664 dout(10) << "do_recovery started " << started << "/" << reserved_pushes
9665 << " on " << *pg << dendl;
9666
9667 if (do_unfound) {
9668 PeeringCtx rctx;
9669 rctx.handle = &handle;
9670 pg->find_unfound(queued, rctx);
9671 dispatch_context(rctx, pg, pg->get_osdmap());
9672 }
9673 }
9674
9675 out:
9676 ceph_assert(started <= reserved_pushes);
9677 service.release_reserved_pushes(reserved_pushes);
9678 }
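// Timeline sketch for the sleep logic above (hypothetical values, with
// osd_recovery_sleep = 0.1s): if the previous event was scheduled at t=10.00
// and we arrive at t=10.03, recovery_schedule_time is first bumped to now
// (10.03) and the requeue fires at 10.13. If the previous event is still in
// the future (say 10.20), the sleep simply stacks on top of it and the
// requeue fires at 10.30.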
9679
9680 void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
9681 {
9682 std::lock_guard l(recovery_lock);
9683 dout(10) << "start_recovery_op " << *pg << " " << soid
9684 << " (" << recovery_ops_active << "/"
9685 << osd->get_recovery_max_active() << " rops)"
9686 << dendl;
9687 recovery_ops_active++;
9688
9689 #ifdef DEBUG_RECOVERY_OIDS
9690 dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
9691 ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
9692 recovery_oids[pg->pg_id].insert(soid);
9693 #endif
9694 }
9695
9696 void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
9697 {
9698 std::lock_guard l(recovery_lock);
9699 dout(10) << "finish_recovery_op " << *pg << " " << soid
9700 << " dequeue=" << dequeue
9701 << " (" << recovery_ops_active << "/"
9702 << osd->get_recovery_max_active() << " rops)"
9703 << dendl;
9704
9705 // adjust count
9706 ceph_assert(recovery_ops_active > 0);
9707 recovery_ops_active--;
9708
9709 #ifdef DEBUG_RECOVERY_OIDS
9710 dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
9711 ceph_assert(recovery_oids[pg->pg_id].count(soid));
9712 recovery_oids[pg->pg_id].erase(soid);
9713 #endif
9714
9715 _maybe_queue_recovery();
9716 }
9717
9718 bool OSDService::is_recovery_active()
9719 {
9720 if (cct->_conf->osd_debug_pretend_recovery_active) {
9721 return true;
9722 }
9723 return local_reserver.has_reservation() || remote_reserver.has_reservation();
9724 }
9725
9726 void OSDService::release_reserved_pushes(uint64_t pushes)
9727 {
9728 std::lock_guard l(recovery_lock);
9729 dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
9730 << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
9731 << dendl;
9732 ceph_assert(recovery_ops_reserved >= pushes);
9733 recovery_ops_reserved -= pushes;
9734 _maybe_queue_recovery();
9735 }
9736
9737 // =========================================================
9738 // OPS
9739
9740 bool OSD::op_is_discardable(const MOSDOp *op)
9741 {
9742 // drop the client request if the client is no longer connected and
9743 // therefore cannot receive the reply anyway.
9744 if (!op->get_connection()->is_connected()) {
9745 return true;
9746 }
9747 return false;
9748 }
9749
9750 void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
9751 {
9752 const utime_t stamp = op->get_req()->get_recv_stamp();
9753 const utime_t latency = ceph_clock_now() - stamp;
9754 const unsigned priority = op->get_req()->get_priority();
9755 const int cost = op->get_req()->get_cost();
9756 const uint64_t owner = op->get_req()->get_source().num();
9757 const int type = op->get_req()->get_type();
9758
9759 dout(15) << "enqueue_op " << op << " prio " << priority
9760 << " type " << type
9761 << " cost " << cost
9762 << " latency " << latency
9763 << " epoch " << epoch
9764 << " " << *(op->get_req()) << dendl;
9765 op->osd_trace.event("enqueue op");
9766 op->osd_trace.keyval("priority", priority);
9767 op->osd_trace.keyval("cost", cost);
9768
9769 auto enqueue_span = tracing::osd::tracer.add_span(__func__, op->osd_parent_span);
9770 enqueue_span->AddEvent(__func__, {
9771 {"priority", priority},
9772 {"cost", cost},
9773 {"epoch", epoch},
9774 {"owner", owner},
9775 {"type", type}
9776 });
9777
9778 op->mark_queued_for_pg();
9779 logger->tinc(l_osd_op_before_queue_op_lat, latency);
9780 if (type == MSG_OSD_PG_PUSH ||
9781 type == MSG_OSD_PG_PUSH_REPLY) {
9782 op_shardedwq.queue(
9783 OpSchedulerItem(
9784 unique_ptr<OpSchedulerItem::OpQueueable>(new PGRecoveryMsg(pg, std::move(op))),
9785 cost, priority, stamp, owner, epoch));
9786 } else {
9787 op_shardedwq.queue(
9788 OpSchedulerItem(
9789 unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
9790 cost, priority, stamp, owner, epoch));
9791 }
9792 }
9793
9794 void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
9795 {
9796 dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
9797 op_shardedwq.queue(
9798 OpSchedulerItem(
9799 unique_ptr<OpSchedulerItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
9800 10,
9801 cct->_conf->osd_peering_op_priority,
9802 utime_t(),
9803 0,
9804 evt->get_epoch_sent()));
9805 }
9806
9807 /*
9808 * NOTE: dequeue called in worker thread, with pg lock
9809 */
9810 void OSD::dequeue_op(
9811 PGRef pg, OpRequestRef op,
9812 ThreadPool::TPHandle &handle)
9813 {
9814 const Message *m = op->get_req();
9815
9816 FUNCTRACE(cct);
9817 OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_BEGIN", false);
9818
9819 utime_t now = ceph_clock_now();
9820 op->set_dequeued_time(now);
9821
9822 utime_t latency = now - m->get_recv_stamp();
9823 dout(10) << "dequeue_op " << op << " prio " << m->get_priority()
9824 << " cost " << m->get_cost()
9825 << " latency " << latency
9826 << " " << *m
9827 << " pg " << *pg << dendl;
9828
9829 logger->tinc(l_osd_op_before_dequeue_op_lat, latency);
9830
9831 service.maybe_share_map(m->get_connection().get(),
9832 pg->get_osdmap(),
9833 op->sent_epoch);
9834
9835 if (pg->is_deleting())
9836 return;
9837
9838 op->mark_reached_pg();
9839 op->osd_trace.event("dequeue_op");
9840
9841 pg->do_request(op, handle);
9842
9843 // finish
9844 dout(10) << "dequeue_op " << op << " finish" << dendl;
9845 OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_END", false);
9846 }
9847
9848
9849 void OSD::dequeue_peering_evt(
9850 OSDShard *sdata,
9851 PG *pg,
9852 PGPeeringEventRef evt,
9853 ThreadPool::TPHandle& handle)
9854 {
9855 auto curmap = sdata->get_osdmap();
9856 bool need_up_thru = false;
9857 epoch_t same_interval_since = 0;
9858 if (!pg) {
9859 if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
9860 handle_pg_query_nopg(*q);
9861 } else {
9862 derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
9863 ceph_abort();
9864 }
9865 } else if (PeeringCtx rctx;
9866 advance_pg(curmap->get_epoch(), pg, handle, rctx)) {
9867 pg->do_peering_event(evt, rctx);
9868 if (pg->is_deleted()) {
9869 pg->unlock();
9870 return;
9871 }
9872 dispatch_context(rctx, pg, curmap, &handle);
9873 need_up_thru = pg->get_need_up_thru();
9874 same_interval_since = pg->get_same_interval_since();
9875 pg->unlock();
9876 }
9877
9878 if (need_up_thru) {
9879 queue_want_up_thru(same_interval_since);
9880 }
9881
9882 service.send_pg_temp();
9883 }
9884
9885 void OSD::dequeue_delete(
9886 OSDShard *sdata,
9887 PG *pg,
9888 epoch_t e,
9889 ThreadPool::TPHandle& handle)
9890 {
9891 dequeue_peering_evt(
9892 sdata,
9893 pg,
9894 PGPeeringEventRef(
9895 std::make_shared<PGPeeringEvent>(
9896 e, e,
9897 PeeringState::DeleteSome())),
9898 handle);
9899 }
9900
9901
9902
9903 // --------------------------------
9904
9905 const char** OSD::get_tracked_conf_keys() const
9906 {
9907 static const char* KEYS[] = {
9908 "osd_max_backfills",
9909 "osd_min_recovery_priority",
9910 "osd_max_trimming_pgs",
9911 "osd_op_complaint_time",
9912 "osd_op_log_threshold",
9913 "osd_op_history_size",
9914 "osd_op_history_duration",
9915 "osd_op_history_slow_op_size",
9916 "osd_op_history_slow_op_threshold",
9917 "osd_enable_op_tracker",
9918 "osd_map_cache_size",
9919 "osd_pg_epoch_max_lag_factor",
9920 "osd_pg_epoch_persisted_max_stale",
9921 "osd_recovery_sleep",
9922 "osd_recovery_sleep_hdd",
9923 "osd_recovery_sleep_ssd",
9924 "osd_recovery_sleep_hybrid",
9925 "osd_delete_sleep",
9926 "osd_delete_sleep_hdd",
9927 "osd_delete_sleep_ssd",
9928 "osd_delete_sleep_hybrid",
9929 "osd_snap_trim_sleep",
9930 "osd_snap_trim_sleep_hdd",
9931 "osd_snap_trim_sleep_ssd",
9932 "osd_snap_trim_sleep_hybrid",
9933 "osd_scrub_sleep",
9934 "osd_recovery_max_active",
9935 "osd_recovery_max_active_hdd",
9936 "osd_recovery_max_active_ssd",
9937 // clog & admin clog
9938 "clog_to_monitors",
9939 "clog_to_syslog",
9940 "clog_to_syslog_facility",
9941 "clog_to_syslog_level",
9942 "osd_objectstore_fuse",
9943 "clog_to_graylog",
9944 "clog_to_graylog_host",
9945 "clog_to_graylog_port",
9946 "host",
9947 "fsid",
9948 "osd_recovery_delay_start",
9949 "osd_client_message_size_cap",
9950 "osd_client_message_cap",
9951 "osd_heartbeat_min_size",
9952 "osd_heartbeat_interval",
9953 "osd_object_clean_region_max_num_intervals",
9954 "osd_scrub_min_interval",
9955 "osd_scrub_max_interval",
9956 NULL
9957 };
9958 return KEYS;
9959 }
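// These keys feed the config-observer machinery: when any of them changes,
// handle_conf_change() below is invoked with the set of changed keys.
// Sketch of the consumer side (mirrors the code below):
//
//   if (changed.count("osd_max_trimming_pgs")) {
//     service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
//   }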
9960
9961 void OSD::handle_conf_change(const ConfigProxy& conf,
9962 const std::set <std::string> &changed)
9963 {
9964 std::lock_guard l{osd_lock};
9965
9966 if (changed.count("osd_max_backfills") ||
9967 changed.count("osd_delete_sleep") ||
9968 changed.count("osd_delete_sleep_hdd") ||
9969 changed.count("osd_delete_sleep_ssd") ||
9970 changed.count("osd_delete_sleep_hybrid") ||
9971 changed.count("osd_snap_trim_sleep") ||
9972 changed.count("osd_snap_trim_sleep_hdd") ||
9973 changed.count("osd_snap_trim_sleep_ssd") ||
9974 changed.count("osd_snap_trim_sleep_hybrid") ||
9975 changed.count("osd_scrub_sleep") ||
9976 changed.count("osd_recovery_sleep") ||
9977 changed.count("osd_recovery_sleep_hdd") ||
9978 changed.count("osd_recovery_sleep_ssd") ||
9979 changed.count("osd_recovery_sleep_hybrid") ||
9980 changed.count("osd_recovery_max_active") ||
9981 changed.count("osd_recovery_max_active_hdd") ||
9982 changed.count("osd_recovery_max_active_ssd")) {
9983 if (!maybe_override_options_for_qos() &&
9984 changed.count("osd_max_backfills")) {
9985 // Scheduler is not "mclock". Fallback to earlier behavior
9986 service.local_reserver.set_max(cct->_conf->osd_max_backfills);
9987 service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
9988 }
9989 }
9990 if (changed.count("osd_min_recovery_priority")) {
9991 service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9992 service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9993 }
9994 if (changed.count("osd_max_trimming_pgs")) {
9995 service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
9996 }
9997 if (changed.count("osd_op_complaint_time") ||
9998 changed.count("osd_op_log_threshold")) {
9999 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
10000 cct->_conf->osd_op_log_threshold);
10001 }
10002 if (changed.count("osd_op_history_size") ||
10003 changed.count("osd_op_history_duration")) {
10004 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
10005 cct->_conf->osd_op_history_duration);
10006 }
10007 if (changed.count("osd_op_history_slow_op_size") ||
10008 changed.count("osd_op_history_slow_op_threshold")) {
10009 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
10010 cct->_conf->osd_op_history_slow_op_threshold);
10011 }
10012 if (changed.count("osd_enable_op_tracker")) {
10013 op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
10014 }
10015 if (changed.count("osd_map_cache_size")) {
10016 service.map_cache.set_size(cct->_conf->osd_map_cache_size);
10017 service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
10018 service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
10019 }
10020 if (changed.count("clog_to_monitors") ||
10021 changed.count("clog_to_syslog") ||
10022 changed.count("clog_to_syslog_level") ||
10023 changed.count("clog_to_syslog_facility") ||
10024 changed.count("clog_to_graylog") ||
10025 changed.count("clog_to_graylog_host") ||
10026 changed.count("clog_to_graylog_port") ||
10027 changed.count("host") ||
10028 changed.count("fsid")) {
10029 update_log_config();
10030 }
10031 if (changed.count("osd_pg_epoch_max_lag_factor")) {
10032 m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
10033 "osd_pg_epoch_max_lag_factor");
10034 }
10035
10036 #ifdef HAVE_LIBFUSE
10037 if (changed.count("osd_objectstore_fuse")) {
10038 if (store) {
10039 enable_disable_fuse(false);
10040 }
10041 }
10042 #endif
10043
10044 if (changed.count("osd_recovery_delay_start")) {
10045 service.defer_recovery(cct->_conf->osd_recovery_delay_start);
10046 service.kick_recovery_queue();
10047 }
10048
10049 if (changed.count("osd_client_message_cap")) {
10050 uint64_t newval = cct->_conf->osd_client_message_cap;
10051 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
10052 if (pol.throttler_messages) {
10053 pol.throttler_messages->reset_max(newval);
10054 }
10055 }
10056 if (changed.count("osd_client_message_size_cap")) {
10057 uint64_t newval = cct->_conf->osd_client_message_size_cap;
10058 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
10059 if (pol.throttler_bytes) {
10060 pol.throttler_bytes->reset_max(newval);
10061 }
10062 }
10063 if (changed.count("osd_object_clean_region_max_num_intervals")) {
10064 ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
10065 }
10066
10067 if (changed.count("osd_scrub_min_interval") ||
10068 changed.count("osd_scrub_max_interval")) {
10069 resched_all_scrubs();
10070 dout(0) << __func__ << ": scrub interval change" << dendl;
10071 }
10072 check_config();
10073 if (changed.count("osd_asio_thread_count")) {
10074 service.poolctx.stop();
10075 service.poolctx.start(conf.get_val<std::uint64_t>("osd_asio_thread_count"));
10076 }
10077 }
10078
10079 void OSD::maybe_override_max_osd_capacity_for_qos()
10080 {
10081 // If the scheduler enabled is mclock, override the default
10082 // osd capacity with the value obtained from running the
10083 // osd bench test. This is later used to setup mclock.
10084 if ((cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler") &&
10085 (cct->_conf.get_val<bool>("osd_mclock_skip_benchmark") == false) &&
10086 (!unsupported_objstore_for_qos())) {
10087 std::string max_capacity_iops_config;
10088 bool force_run_benchmark =
10089 cct->_conf.get_val<bool>("osd_mclock_force_run_benchmark_on_init");
10090
10091 if (store_is_rotational) {
10092 max_capacity_iops_config = "osd_mclock_max_capacity_iops_hdd";
10093 } else {
10094 max_capacity_iops_config = "osd_mclock_max_capacity_iops_ssd";
10095 }
10096
10097 if (!force_run_benchmark) {
10098 double default_iops = 0.0;
10099
10100 // Get the current osd iops capacity
10101 double cur_iops = cct->_conf.get_val<double>(max_capacity_iops_config);
10102
10103 // Get the default max iops capacity
10104 auto val = cct->_conf.get_val_default(max_capacity_iops_config);
10105 if (!val.has_value()) {
10106 derr << __func__ << " Unable to determine default value of "
10107 << max_capacity_iops_config << dendl;
10108 // Cannot determine default iops. Force a run of the OSD benchmark.
10109 force_run_benchmark = true;
10110 } else {
10111 // Default iops
10112 default_iops = std::stod(val.value());
10113 }
10114
10115 // Determine if we really need to run the osd benchmark
10116 if (!force_run_benchmark && (default_iops != cur_iops)) {
10117 dout(1) << __func__ << std::fixed << std::setprecision(2)
10118 << " default_iops: " << default_iops
10119 << " cur_iops: " << cur_iops
10120 << ". Skip OSD benchmark test." << dendl;
10121 return;
10122 }
10123 }
10124
10125 // Run osd bench: write 12288000 bytes in 4KiB blocks, spread over 100 4MiB objects
10126 int64_t count = 12288000; // Count of bytes to write
10127 int64_t bsize = 4096; // Block size
10128 int64_t osize = 4194304; // Object size
10129 int64_t onum = 100; // Count of objects to write
10130 double elapsed = 0.0; // Time taken to complete the test
10131 double iops = 0.0;
10132 stringstream ss;
10133 int ret = run_osd_bench_test(count, bsize, osize, onum, &elapsed, ss);
10134 if (ret != 0) {
10135 derr << __func__
10136 << " osd bench err: " << ret
10137 << " osd bench errstr: " << ss.str()
10138 << dendl;
10139 return;
10140 }
10141
10142 double rate = count / elapsed;
10143 iops = rate / bsize;
10144 dout(1) << __func__
10145 << " osd bench result -"
10146 << std::fixed << std::setprecision(3)
10147 << " bandwidth (MiB/sec): " << rate / (1024 * 1024)
10148 << " iops: " << iops
10149 << " elapsed_sec: " << elapsed
10150 << dendl;
10151
10152 // Persist iops to the MON store
10153 ret = mon_cmd_set_config(max_capacity_iops_config, std::to_string(iops));
10154 if (ret < 0) {
10155 // Fallback to setting the config within the in-memory "values" map.
10156 cct->_conf.set_val(max_capacity_iops_config, std::to_string(iops));
10157 }
10158
10159 // Override the max osd capacity for all shards
10160 for (auto& shard : shards) {
10161 shard->update_scheduler_config();
10162 }
10163 }
10164 }
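// Worked example of the conversion above (hypothetical timing): if the bench
// writes count = 12288000 bytes in elapsed = 10 s, then
//
//   rate = 12288000 / 10   = 1228800 B/s (~1.17 MiB/s)
//   iops = 1228800 / 4096  = 300
//
// and the chosen osd_mclock_max_capacity_iops_[hdd|ssd] key is set to 300.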
10165
10166 bool OSD::maybe_override_options_for_qos()
10167 {
10168 // If the scheduler enabled is mclock, override the recovery, backfill
10169 // and sleep options so that mclock can meet the QoS goals.
10170 if (cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler" &&
10171 !unsupported_objstore_for_qos()) {
10172 dout(1) << __func__
10173 << ": Changing recovery/backfill/sleep settings for QoS" << dendl;
10174
10175 // Set high value for recovery max active
10176 uint32_t rec_max_active = 1000;
10177 cct->_conf.set_val(
10178 "osd_recovery_max_active", std::to_string(rec_max_active));
10179 cct->_conf.set_val(
10180 "osd_recovery_max_active_hdd", std::to_string(rec_max_active));
10181 cct->_conf.set_val(
10182 "osd_recovery_max_active_ssd", std::to_string(rec_max_active));
10183
10184 // Set high value for osd_max_backfill
10185 uint32_t max_backfills = 1000;
10186 cct->_conf.set_val("osd_max_backfills", std::to_string(max_backfills));
10187 service.local_reserver.set_max(max_backfills);
10188 service.remote_reserver.set_max(max_backfills);
10189
10190 // Disable recovery sleep
10191 cct->_conf.set_val("osd_recovery_sleep", std::to_string(0));
10192 cct->_conf.set_val("osd_recovery_sleep_hdd", std::to_string(0));
10193 cct->_conf.set_val("osd_recovery_sleep_ssd", std::to_string(0));
10194 cct->_conf.set_val("osd_recovery_sleep_hybrid", std::to_string(0));
10195
10196 // Disable delete sleep
10197 cct->_conf.set_val("osd_delete_sleep", std::to_string(0));
10198 cct->_conf.set_val("osd_delete_sleep_hdd", std::to_string(0));
10199 cct->_conf.set_val("osd_delete_sleep_ssd", std::to_string(0));
10200 cct->_conf.set_val("osd_delete_sleep_hybrid", std::to_string(0));
10201
10202 // Disable snap trim sleep
10203 cct->_conf.set_val("osd_snap_trim_sleep", std::to_string(0));
10204 cct->_conf.set_val("osd_snap_trim_sleep_hdd", std::to_string(0));
10205 cct->_conf.set_val("osd_snap_trim_sleep_ssd", std::to_string(0));
10206 cct->_conf.set_val("osd_snap_trim_sleep_hybrid", std::to_string(0));
10207
10208 // Disable scrub sleep
10209 cct->_conf.set_val("osd_scrub_sleep", std::to_string(0));
10210 return true;
10211 }
10212 return false;
10213 }
10214
10215 int OSD::mon_cmd_set_config(const std::string &key, const std::string &val)
10216 {
10217 std::string cmd =
10218 "{"
10219 "\"prefix\": \"config set\", "
10220 "\"who\": \"osd." + std::to_string(whoami) + "\", "
10221 "\"name\": \"" + key + "\", "
10222 "\"value\": \"" + val + "\""
10223 "}";
10224
10225 vector<std::string> vcmd{cmd};
10226 bufferlist inbl;
10227 std::string outs;
10228 C_SaferCond cond;
10229 monc->start_mon_command(vcmd, inbl, nullptr, &outs, &cond);
10230 int r = cond.wait();
10231 if (r < 0) {
10232 derr << __func__ << " Failed to set config key " << key
10233 << " err: " << cpp_strerror(r)
10234 << " errstr: " << outs << dendl;
10235 return r;
10236 }
10237
10238 return 0;
10239 }
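// For example (illustrative values), setting key
// "osd_mclock_max_capacity_iops_hdd" to "315.000000" on osd.3 builds:
//
//   {"prefix": "config set", "who": "osd.3",
//    "name": "osd_mclock_max_capacity_iops_hdd", "value": "315.000000"}
//
// and sends it via MonClient::start_mon_command(), blocking on the result.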
10240
10241 bool OSD::unsupported_objstore_for_qos()
10242 {
10243 static const std::vector<std::string> unsupported_objstores = { "filestore" };
10244 return std::find(unsupported_objstores.begin(),
10245 unsupported_objstores.end(),
10246 store->get_type()) != unsupported_objstores.end();
10247 }
10248
10249 void OSD::update_log_config()
10250 {
10251 auto parsed_options = clog->parse_client_options(cct);
10252 derr << "log_to_monitors " << parsed_options.log_to_monitors << dendl;
10253 }
10254
10255 void OSD::check_config()
10256 {
10257 // some sanity checks
10258 if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
10259 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
10260 << " is not > osd_pg_epoch_persisted_max_stale ("
10261 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
10262 }
10263 if (cct->_conf->osd_object_clean_region_max_num_intervals < 0) {
10264 clog->warn() << "osd_object_clean_region_max_num_intervals ("
10265 << cct->_conf->osd_object_clean_region_max_num_intervals
10266 << ") is < 0";
10267 }
10268 }
10269
10270 // --------------------------------
10271
10272 void OSD::get_latest_osdmap()
10273 {
10274 dout(10) << __func__ << " -- start" << dendl;
10275
10276 boost::system::error_code ec;
10277 service.objecter->wait_for_latest_osdmap(ceph::async::use_blocked[ec]);
10278
10279 dout(10) << __func__ << " -- finish" << dendl;
10280 }
10281
10282 // --------------------------------
10283
10284 void OSD::set_perf_queries(const ConfigPayload &config_payload) {
10285 const OSDConfigPayload &osd_config_payload = boost::get<OSDConfigPayload>(config_payload);
10286 const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries = osd_config_payload.config;
10287 dout(10) << "setting " << queries.size() << " queries" << dendl;
10288
10289 std::list<OSDPerfMetricQuery> supported_queries;
10290 for (auto &it : queries) {
10291 auto &query = it.first;
10292 if (!query.key_descriptor.empty()) {
10293 supported_queries.push_back(query);
10294 }
10295 }
10296 if (supported_queries.size() < queries.size()) {
10297 dout(1) << queries.size() - supported_queries.size()
10298 << " unsupported queries" << dendl;
10299 }
10300 {
10301 std::lock_guard locker{m_perf_queries_lock};
10302 m_perf_queries = supported_queries;
10303 m_perf_limits = queries;
10304 }
10305 std::vector<PGRef> pgs;
10306 _get_pgs(&pgs);
10307 for (auto& pg : pgs) {
10308 std::scoped_lock l{*pg};
10309 pg->set_dynamic_perf_stats_queries(supported_queries);
10310 }
10311 }
10312
10313 MetricPayload OSD::get_perf_reports() {
10314 OSDMetricPayload payload;
10315 std::map<OSDPerfMetricQuery, OSDPerfMetricReport> &reports = payload.report;
10316
10317 std::vector<PGRef> pgs;
10318 _get_pgs(&pgs);
10319 DynamicPerfStats dps;
10320 for (auto& pg : pgs) {
10321 // m_perf_queries can be modified only in set_perf_queries by an mgr client
10322 // request, and it is protected by the mgr client's lock, which is held
10323 // while set_perf_queries/get_perf_reports are called, so we do not need
10324 // to hold m_perf_queries_lock here.
10325 DynamicPerfStats pg_dps(m_perf_queries);
10326 pg->lock();
10327 pg->get_dynamic_perf_stats(&pg_dps);
10328 pg->unlock();
10329 dps.merge(pg_dps);
10330 }
10331 dps.add_to_reports(m_perf_limits, &reports);
10332 dout(20) << "reports for " << reports.size() << " queries" << dendl;
10333
10334 return payload;
10335 }
10336
10337 // =============================================================
10338
10339 #undef dout_context
10340 #define dout_context cct
10341 #undef dout_prefix
10342 #define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
10343
10344 void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
10345 {
10346 dout(10) << pg->pg_id << " " << pg << dendl;
10347 slot->pg = pg;
10348 pg->osd_shard = this;
10349 pg->pg_slot = slot;
10350 osd->inc_num_pgs();
10351
10352 slot->epoch = pg->get_osdmap_epoch();
10353 pg_slots_by_epoch.insert(*slot);
10354 }
10355
10356 void OSDShard::_detach_pg(OSDShardPGSlot *slot)
10357 {
10358 dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
10359 slot->pg->osd_shard = nullptr;
10360 slot->pg->pg_slot = nullptr;
10361 slot->pg = nullptr;
10362 osd->dec_num_pgs();
10363
10364 pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
10365 slot->epoch = 0;
10366 if (waiting_for_min_pg_epoch) {
10367 min_pg_epoch_cond.notify_all();
10368 }
10369 }
10370
10371 void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
10372 {
10373 std::lock_guard l(shard_lock);
10374 dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
10375 << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
10376 pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
10377 dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
10378 slot->epoch = e;
10379 pg_slots_by_epoch.insert(*slot);
10380 dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
10381 << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
10382 if (waiting_for_min_pg_epoch) {
10383 min_pg_epoch_cond.notify_all();
10384 }
10385 }
10386
10387 epoch_t OSDShard::get_min_pg_epoch()
10388 {
10389 std::lock_guard l(shard_lock);
10390 auto p = pg_slots_by_epoch.begin();
10391 if (p == pg_slots_by_epoch.end()) {
10392 return 0;
10393 }
10394 return p->epoch;
10395 }
10396
10397 void OSDShard::wait_min_pg_epoch(epoch_t need)
10398 {
10399 std::unique_lock l{shard_lock};
10400 ++waiting_for_min_pg_epoch;
10401 min_pg_epoch_cond.wait(l, [need, this] {
10402 if (pg_slots_by_epoch.empty()) {
10403 return true;
10404 } else if (pg_slots_by_epoch.begin()->epoch >= need) {
10405 return true;
10406 } else {
10407 dout(10) << need << " waiting on "
10408 << pg_slots_by_epoch.begin()->epoch << dendl;
10409 return false;
10410 }
10411 });
10412 --waiting_for_min_pg_epoch;
10413 }
10414
10415 epoch_t OSDShard::get_max_waiting_epoch()
10416 {
10417 std::lock_guard l(shard_lock);
10418 epoch_t r = 0;
10419 for (auto& i : pg_slots) {
10420 if (!i.second->waiting_peering.empty()) {
10421 r = std::max(r, i.second->waiting_peering.rbegin()->first);
10422 }
10423 }
10424 return r;
10425 }
10426
10427 void OSDShard::consume_map(
10428 const OSDMapRef& new_osdmap,
10429 unsigned *pushes_to_free)
10430 {
10431 std::lock_guard l(shard_lock);
10432 OSDMapRef old_osdmap;
10433 {
10434 std::lock_guard l(osdmap_lock);
10435 old_osdmap = std::move(shard_osdmap);
10436 shard_osdmap = new_osdmap;
10437 }
10438 dout(10) << new_osdmap->get_epoch()
10439 << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
10440 << dendl;
10441 int queued = 0;
10442
10443 // check slots
10444 auto p = pg_slots.begin();
10445 while (p != pg_slots.end()) {
10446 OSDShardPGSlot *slot = p->second.get();
10447 const spg_t& pgid = p->first;
10448 dout(20) << __func__ << " " << pgid << dendl;
10449 if (!slot->waiting_for_split.empty()) {
10450 dout(20) << __func__ << " " << pgid
10451 << " waiting for split " << slot->waiting_for_split << dendl;
10452 ++p;
10453 continue;
10454 }
10455 if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
10456 dout(20) << __func__ << " " << pgid
10457 << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
10458 << dendl;
10459 ++p;
10460 continue;
10461 }
10462 if (!slot->waiting_peering.empty()) {
10463 epoch_t first = slot->waiting_peering.begin()->first;
10464 if (first <= new_osdmap->get_epoch()) {
10465 dout(20) << __func__ << " " << pgid
10466 << " pending_peering first epoch " << first
10467 << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
10468 queued += _wake_pg_slot(pgid, slot);
10469 }
10470 ++p;
10471 continue;
10472 }
10473 if (!slot->waiting.empty()) {
10474 if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
10475 dout(20) << __func__ << " " << pgid << " maps to us, keeping"
10476 << dendl;
10477 ++p;
10478 continue;
10479 }
10480 while (!slot->waiting.empty() &&
10481 slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
10482 auto& qi = slot->waiting.front();
10483 dout(20) << __func__ << " " << pgid
10484 << " waiting item " << qi
10485 << " epoch " << qi.get_map_epoch()
10486 << " <= " << new_osdmap->get_epoch()
10487 << ", "
10488 << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
10489 "misdirected")
10490 << ", dropping" << dendl;
10491 *pushes_to_free += qi.get_reserved_pushes();
10492 slot->waiting.pop_front();
10493 }
10494 }
10495 if (slot->waiting.empty() &&
10496 slot->num_running == 0 &&
10497 slot->waiting_for_split.empty() &&
10498 !slot->pg) {
10499 dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
10500 p = pg_slots.erase(p);
10501 continue;
10502 }
10503
10504 ++p;
10505 }
10506 if (queued) {
10507 std::lock_guard l{sdata_wait_lock};
10508 if (queued == 1)
10509 sdata_cond.notify_one();
10510 else
10511 sdata_cond.notify_all();
10512 }
10513 }
10514
10515 int OSDShard::_wake_pg_slot(
10516 spg_t pgid,
10517 OSDShardPGSlot *slot)
10518 {
10519 int count = 0;
10520 dout(20) << __func__ << " " << pgid
10521 << " to_process " << slot->to_process
10522 << " waiting " << slot->waiting
10523 << " waiting_peering " << slot->waiting_peering << dendl;
10524 for (auto i = slot->to_process.rbegin();
10525 i != slot->to_process.rend();
10526 ++i) {
10527 scheduler->enqueue_front(std::move(*i));
10528 count++;
10529 }
10530 slot->to_process.clear();
10531 for (auto i = slot->waiting.rbegin();
10532 i != slot->waiting.rend();
10533 ++i) {
10534 scheduler->enqueue_front(std::move(*i));
10535 count++;
10536 }
10537 slot->waiting.clear();
10538 for (auto i = slot->waiting_peering.rbegin();
10539 i != slot->waiting_peering.rend();
10540 ++i) {
10541 // this is overkill; we requeue everything, even if some of these
10542 // items are waiting for maps we don't have yet. FIXME, maybe,
10543 // someday, if we decide this inefficiency matters
10544 for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
10545 scheduler->enqueue_front(std::move(*j));
10546 count++;
10547 }
10548 }
10549 slot->waiting_peering.clear();
10550 ++slot->requeue_seq;
10551 return count;
10552 }
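// Note the rbegin()/rend() + enqueue_front() pairing above: pushing items to
// the *front* of the scheduler in *reverse* order preserves their original
// FIFO order. E.g. waiting = [a, b, c] is enqueued as c, b, a, leaving the
// scheduler to dequeue a, b, c.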
10553
10554 void OSDShard::identify_splits_and_merges(
10555 const OSDMapRef& as_of_osdmap,
10556 set<pair<spg_t,epoch_t>> *split_pgs,
10557 set<pair<spg_t,epoch_t>> *merge_pgs)
10558 {
10559 std::lock_guard l(shard_lock);
10560 if (shard_osdmap) {
10561 for (auto& i : pg_slots) {
10562 const spg_t& pgid = i.first;
10563 auto *slot = i.second.get();
10564 if (slot->pg) {
10565 osd->service.identify_splits_and_merges(
10566 shard_osdmap, as_of_osdmap, pgid,
10567 split_pgs, merge_pgs);
10568 } else if (!slot->waiting_for_split.empty()) {
10569 osd->service.identify_splits_and_merges(
10570 shard_osdmap, as_of_osdmap, pgid,
10571 split_pgs, nullptr);
10572 } else {
10573 dout(20) << __func__ << " slot " << pgid
10574 << " has no pg and waiting_for_split " << dendl;
10575 }
10576 }
10577 }
10578 }
10579
10580 void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
10581 set<pair<spg_t,epoch_t>> *pgids)
10582 {
10583 std::lock_guard l(shard_lock);
10584 _prime_splits(pgids);
10585 if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
10586 set<pair<spg_t,epoch_t>> newer_children;
10587 for (auto i : *pgids) {
10588 osd->service.identify_splits_and_merges(
10589 as_of_osdmap, shard_osdmap, i.first,
10590 &newer_children, nullptr);
10591 }
10592 newer_children.insert(pgids->begin(), pgids->end());
10593 dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
10594 << shard_osdmap->get_epoch() << ", new children " << newer_children
10595 << dendl;
10596 _prime_splits(&newer_children);
10597 // note: we don't care what is left over here for other shards.
10598 // if this shard is ahead of another one -- e.g., one thread is
10599 // calling into prime_splits via _process (due to a newly created
10600 // pg) while this shard has a newer map due to a racing consume_map --
10601 // then any grandchildren left here will be identified (or were
10602 // identified) when the slower shard's osdmap is advanced.
10603 // _prime_splits() will tolerate the case where the pgid is
10604 // already primed.
10605 }
10606 }
10607
10608 void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
10609 {
10610 dout(10) << *pgids << dendl;
10611 auto p = pgids->begin();
10612 while (p != pgids->end()) {
10613 unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
10614 if (shard_index == shard_id) {
10615 auto r = pg_slots.emplace(p->first, nullptr);
10616 if (r.second) {
10617 dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
10618 r.first->second = make_unique<OSDShardPGSlot>();
10619 r.first->second->waiting_for_split.insert(p->second);
10620 } else {
10621 auto q = r.first;
10622 ceph_assert(q != pg_slots.end());
10623 dout(10) << "priming (existing) slot " << p->first << " e" << p->second
10624 << dendl;
10625 q->second->waiting_for_split.insert(p->second);
10626 }
10627 p = pgids->erase(p);
10628 } else {
10629 ++p;
10630 }
10631 }
10632 }
10633
10634 void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
10635 set<pair<spg_t,epoch_t>> *merge_pgs)
10636 {
10637 std::lock_guard l(shard_lock);
10638 dout(20) << __func__ << " checking shard " << shard_id
10639 << " for remaining merge pgs " << *merge_pgs << dendl;
10640 auto p = merge_pgs->begin();
10641 while (p != merge_pgs->end()) {
10642 spg_t pgid = p->first;
10643 epoch_t epoch = p->second;
10644 unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
10645 if (shard_index != shard_id) {
10646 ++p;
10647 continue;
10648 }
10649 OSDShardPGSlot *slot;
10650 auto r = pg_slots.emplace(pgid, nullptr);
10651 if (r.second) {
10652 r.first->second = make_unique<OSDShardPGSlot>();
10653 }
10654 slot = r.first->second.get();
10655 if (slot->pg) {
10656 // already have pg
10657 dout(20) << __func__ << " have merge participant pg " << pgid
10658 << " " << slot->pg << dendl;
10659 } else if (!slot->waiting_for_split.empty() &&
10660 *slot->waiting_for_split.begin() < epoch) {
10661 dout(20) << __func__ << " pending split on merge participant pg " << pgid
10662 << " " << slot->waiting_for_split << dendl;
10663 } else {
10664 dout(20) << __func__ << " creating empty merge participant " << pgid
10665 << " for merge in " << epoch << dendl;
10666 // leave history zeroed; PG::merge_from() will fill it in.
10667 pg_history_t history;
10668 PGCreateInfo cinfo(pgid, epoch - 1,
10669 history, PastIntervals(), false);
10670 PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
10671 _attach_pg(r.first->second.get(), pg.get());
10672 _wake_pg_slot(pgid, slot);
10673 pg->unlock();
10674 }
10675 // mark slot for merge
10676 dout(20) << __func__ << " marking merge participant " << pgid << dendl;
10677 slot->waiting_for_merge_epoch = epoch;
10678 p = merge_pgs->erase(p);
10679 }
10680 }
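// Note: the placeholder participant above is created at (epoch - 1),
// the last epoch before the merge takes effect, so that advancing it to
// `epoch` drives it through the merge machinery just like a
// pre-existing PG.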
10681
10682 void OSDShard::register_and_wake_split_child(PG *pg)
10683 {
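// pg_slots.size() below is read before shard_lock is taken; the value
// feeds only this debug line, so a momentarily stale count is harmless.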
10684 dout(15) << __func__ << ": " << pg << " #:" << pg_slots.size() << dendl;
10685 epoch_t epoch;
10686 {
10687 std::lock_guard l(shard_lock);
10688 dout(10) << __func__ << ": " << pg->pg_id << " " << pg << dendl;
10689 auto p = pg_slots.find(pg->pg_id);
10690 ceph_assert(p != pg_slots.end());
10691 auto *slot = p->second.get();
10692 dout(20) << __func__ << ": " << pg->pg_id << " waiting_for_split "
10693 << slot->waiting_for_split << dendl;
10694 ceph_assert(!slot->pg);
10695 ceph_assert(!slot->waiting_for_split.empty());
10696 _attach_pg(slot, pg);
10697
10698 epoch = pg->get_osdmap_epoch();
10699 ceph_assert(slot->waiting_for_split.count(epoch));
10700 slot->waiting_for_split.erase(epoch);
10701 if (slot->waiting_for_split.empty()) {
10702 _wake_pg_slot(pg->pg_id, slot);
10703 } else {
10704 dout(10) << __func__ << " still waiting for split on "
10705 << slot->waiting_for_split << dendl;
10706 }
10707 }
10708
10709 // kick child to ensure it pulls up to the latest osdmap
10710 osd->enqueue_peering_evt(
10711 pg->pg_id,
10712 PGPeeringEventRef(
10713 std::make_shared<PGPeeringEvent>(
10714 epoch,
10715 epoch,
10716 NullEvt())));
10717
10718 std::lock_guard l{sdata_wait_lock};
10719 sdata_cond.notify_one();
10720 }
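// The NullEvt above carries no state change of its own; queuing it at
// the child's creation epoch simply forces the new PG through the
// peering queue so it advances to the shard's current osdmap.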
10721
10722 void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
10723 {
10724 std::lock_guard l(shard_lock);
10725 vector<spg_t> to_delete;
10726 for (auto& i : pg_slots) {
10727 if (i.first != parent &&
10728 i.first.get_ancestor(old_pg_num) == parent) {
10729 dout(10) << __func__ << " parent " << parent << " clearing " << i.first
10730 << dendl;
10731 _wake_pg_slot(i.first, i.second.get());
10732 to_delete.push_back(i.first);
10733 }
10734 }
10735 for (auto pgid : to_delete) {
10736 pg_slots.erase(pgid);
10737 }
10738 }
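// The parent slot itself is deliberately left in place above; only the
// primed child slots under old_pg_num are woken and removed, since the
// split they were created for is no longer going to happen in that form.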
10739
10740 void OSDShard::update_scheduler_config()
10741 {
10742 std::lock_guard l(shard_lock);
10743 scheduler->update_configuration();
10744 }
10745
10746 std::string OSDShard::get_scheduler_type()
10747 {
10748 std::ostringstream scheduler_type;
10749 scheduler_type << *scheduler;
10750 return scheduler_type.str();
10751 }
10752
10753 OSDShard::OSDShard(
10754 int id,
10755 CephContext *cct,
10756 OSD *osd)
10757 : shard_id(id),
10758 cct(cct),
10759 osd(osd),
10760 shard_name(string("OSDShard.") + stringify(id)),
10761 sdata_wait_lock_name(shard_name + "::sdata_wait_lock"),
10762 sdata_wait_lock{make_mutex(sdata_wait_lock_name)},
10763 osdmap_lock{make_mutex(shard_name + "::osdmap_lock")},
10764 shard_lock_name(shard_name + "::shard_lock"),
10765 shard_lock{make_mutex(shard_lock_name)},
10766 scheduler(ceph::osd::scheduler::make_scheduler(
10767 cct, osd->num_shards, osd->store->is_rotational(),
10768 osd->store->get_type())),
10769 context_queue(sdata_wait_lock, sdata_cond)
10770 {
10771 dout(0) << "using op scheduler " << *scheduler << dendl;
10772 }
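// The concrete scheduler (e.g. mClockScheduler vs. a weighted priority
// queue) is chosen by make_scheduler() from configuration and the
// store's device characteristics passed above; the dout(0) in the
// constructor records the choice once per shard at startup.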
10773
10774
10775 // =============================================================
10776
10777 #undef dout_context
10778 #define dout_context osd->cct
10779 #undef dout_prefix
10780 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10781
10782 void OSD::ShardedOpWQ::_add_slot_waiter(
10783 spg_t pgid,
10784 OSDShardPGSlot *slot,
10785 OpSchedulerItem&& qi)
10786 {
10787 if (qi.is_peering()) {
10788 dout(20) << __func__ << " " << pgid
10789 << " peering, item epoch is "
10790 << qi.get_map_epoch()
10791 << ", will wait on " << qi << dendl;
10792 slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
10793 } else {
10794 dout(20) << __func__ << " " << pgid
10795 << " item epoch is "
10796 << qi.get_map_epoch()
10797 << ", will wait on " << qi << dendl;
10798 slot->waiting.push_back(std::move(qi));
10799 }
10800 }
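// The two wait lists drain differently: waiting_peering is keyed by the
// epoch each item needs, so entries can be released as the shard's
// osdmap advances, while waiting is a single FIFO that is requeued
// wholesale when the slot is woken (see _wake_pg_slot()).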
10801
10802 #undef dout_prefix
10803 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10804
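// _process() pulls one work item per invocation: pick this thread's
// shard, sleep until work is available, resolve the item's pg slot
// (possibly creating or attaching a PG first), and finally run the item
// after dropping shard_lock, with the PG lock held where a PG exists.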
10805 void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
10806 {
10807 uint32_t shard_index = thread_index % osd->num_shards;
10808 auto& sdata = osd->shards[shard_index];
10809 ceph_assert(sdata);
10810
10811 // If every thread of every shard ran oncommit callbacks, they could
10812 // complete out of order. So for each shard we designate the thread
10813 // with the smallest thread_index (thread_index < num_shards) to run
10814 // the oncommit callbacks.
10815 bool is_smallest_thread_index = thread_index < osd->num_shards;
10816
10817 // peek at spg_t
10818 sdata->shard_lock.lock();
10819 if (sdata->scheduler->empty() &&
10820 (!is_smallest_thread_index || sdata->context_queue.empty())) {
10821 std::unique_lock wait_lock{sdata->sdata_wait_lock};
10822 if (is_smallest_thread_index && !sdata->context_queue.empty()) {
10823 // we raced with a context_queue addition, don't wait
10824 wait_lock.unlock();
10825 } else if (!sdata->stop_waiting) {
10826 dout(20) << __func__ << " empty q, waiting" << dendl;
10827 osd->cct->get_heartbeat_map()->clear_timeout(hb);
10828 sdata->shard_lock.unlock();
10829 sdata->sdata_cond.wait(wait_lock);
10830 wait_lock.unlock();
10831 sdata->shard_lock.lock();
10832 if (sdata->scheduler->empty() &&
10833 !(is_smallest_thread_index && !sdata->context_queue.empty())) {
10834 sdata->shard_lock.unlock();
10835 return;
10836 }
10837 // found a work item; reapply default wq timeouts
10838 osd->cct->get_heartbeat_map()->reset_timeout(hb,
10839 timeout_interval, suicide_interval);
10840 } else {
10841 dout(20) << __func__ << " stop_waiting set, returning immediately" << dendl;
10842 wait_lock.unlock();
10843 sdata->shard_lock.unlock();
10844 return;
10845 }
10846 }
10847
10848 list<Context *> oncommits;
10849 if (is_smallest_thread_index) {
10850 sdata->context_queue.move_to(oncommits);
10851 }
10852
10853 WorkItem work_item;
10854 while (!std::get_if<OpSchedulerItem>(&work_item)) {
10855 if (sdata->scheduler->empty()) {
10856 if (osd->is_stopping()) {
10857 sdata->shard_lock.unlock();
10858 for (auto c : oncommits) {
10859 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10860 delete c;
10861 }
10862 return; // OSD shutdown, discard.
10863 }
10864 sdata->shard_lock.unlock();
10865 handle_oncommits(oncommits);
10866 return;
10867 }
10868
10869 work_item = sdata->scheduler->dequeue();
10870 if (osd->is_stopping()) {
10871 sdata->shard_lock.unlock();
10872 for (auto c : oncommits) {
10873 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10874 delete c;
10875 }
10876 return; // OSD shutdown, discard.
10877 }
10878
10879 // If the work item is scheduled in the future, wait until
10880 // the time returned in the dequeue response before retrying.
10881 if (auto when_ready = std::get_if<double>(&work_item)) {
10882 if (is_smallest_thread_index) {
10883 sdata->shard_lock.unlock();
10884 handle_oncommits(oncommits);
10885 sdata->shard_lock.lock();
10886 }
10887 std::unique_lock wait_lock{sdata->sdata_wait_lock};
10888 auto future_time = ceph::real_clock::from_double(*when_ready);
10889 dout(10) << __func__ << " dequeued future work item; waiting until " << future_time << dendl;
10890 // Disable heartbeat timeout until we find a non-future work item to process.
10891 osd->cct->get_heartbeat_map()->clear_timeout(hb);
10892 sdata->shard_lock.unlock();
10893 ++sdata->waiting_threads;
10894 sdata->sdata_cond.wait_until(wait_lock, future_time);
10895 --sdata->waiting_threads;
10896 wait_lock.unlock();
10897 sdata->shard_lock.lock();
10898 // Reapply default wq timeouts
10899 osd->cct->get_heartbeat_map()->reset_timeout(hb,
10900 timeout_interval, suicide_interval);
10901 // Populate the oncommits list if there were any additions
10902 // to the context_queue while we were waiting
10903 if (is_smallest_thread_index) {
10904 sdata->context_queue.move_to(oncommits);
10905 }
10906 }
10907 } // while
10908
10909 // Access the stored item
10910 auto item = std::move(std::get<OpSchedulerItem>(work_item));
10911 if (osd->is_stopping()) {
10912 sdata->shard_lock.unlock();
10913 for (auto c : oncommits) {
10914 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10915 delete c;
10916 }
10917 return; // OSD shutdown, discard.
10918 }
10919
10920 const auto token = item.get_ordering_token();
10921 auto r = sdata->pg_slots.emplace(token, nullptr);
10922 if (r.second) {
10923 r.first->second = make_unique<OSDShardPGSlot>();
10924 }
10925 OSDShardPGSlot *slot = r.first->second.get();
10926 dout(20) << __func__ << " " << token
10927 << (r.second ? " (new)" : "")
10928 << " to_process " << slot->to_process
10929 << " waiting " << slot->waiting
10930 << " waiting_peering " << slot->waiting_peering
10931 << dendl;
10932 slot->to_process.push_back(std::move(item));
10933 dout(20) << __func__ << " " << slot->to_process.back()
10934 << " queued" << dendl;
10935
10936 retry_pg:
10937 PGRef pg = slot->pg;
10938
10939 // lock pg (if we have it)
10940 if (pg) {
10941 // note the requeue seq now...
10942 uint64_t requeue_seq = slot->requeue_seq;
10943 ++slot->num_running;
10944
10945 sdata->shard_lock.unlock();
10946 osd->service.maybe_inject_dispatch_delay();
10947 pg->lock();
10948 osd->service.maybe_inject_dispatch_delay();
10949 sdata->shard_lock.lock();
10950
10951 auto q = sdata->pg_slots.find(token);
10952 if (q == sdata->pg_slots.end()) {
10953 // this can happen if we race with pg removal.
10954 dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
10955 pg->unlock();
10956 sdata->shard_lock.unlock();
10957 handle_oncommits(oncommits);
10958 return;
10959 }
10960 slot = q->second.get();
10961 --slot->num_running;
10962
10963 if (slot->to_process.empty()) {
10964 // raced with _wake_pg_slot or consume_map
10965 dout(20) << __func__ << " " << token
10966 << " nothing queued" << dendl;
10967 pg->unlock();
10968 sdata->shard_lock.unlock();
10969 handle_oncommits(oncommits);
10970 return;
10971 }
10972 if (requeue_seq != slot->requeue_seq) {
10973 dout(20) << __func__ << " " << token
10974 << " requeue_seq " << slot->requeue_seq << " > our "
10975 << requeue_seq << ", we raced with _wake_pg_slot"
10976 << dendl;
10977 pg->unlock();
10978 sdata->shard_lock.unlock();
10979 handle_oncommits(oncommits);
10980 return;
10981 }
10982 if (slot->pg != pg) {
10983 // this can happen if we race with pg removal.
10984 dout(20) << __func__ << " slot " << token << " no longer attached to "
10985 << pg << dendl;
10986 pg->unlock();
10987 goto retry_pg;
10988 }
10989 }
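// At this point the slot is validated against current shard state: it
// still exists, was not requeued behind our back, and still refers to
// the same PG (otherwise we looped back to retry_pg above).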
10990
10991 dout(20) << __func__ << " " << token
10992 << " to_process " << slot->to_process
10993 << " waiting " << slot->waiting
10994 << " waiting_peering " << slot->waiting_peering << dendl;
10995
10996 ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
10997 suicide_interval);
10998
10999 // take next item
11000 auto qi = std::move(slot->to_process.front());
11001 slot->to_process.pop_front();
11002 dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
11003 set<pair<spg_t,epoch_t>> new_children;
11004 OSDMapRef osdmap;
11005
11006 while (!pg) {
11007 // should this pg shard exist on this osd in this (or a later) epoch?
11008 osdmap = sdata->shard_osdmap;
11009 const PGCreateInfo *create_info = qi.creates_pg();
11010 if (!slot->waiting_for_split.empty()) {
11011 dout(20) << __func__ << " " << token
11012 << " splitting " << slot->waiting_for_split << dendl;
11013 _add_slot_waiter(token, slot, std::move(qi));
11014 } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
11015 dout(20) << __func__ << " " << token
11016 << " map " << qi.get_map_epoch() << " > "
11017 << osdmap->get_epoch() << dendl;
11018 _add_slot_waiter(token, slot, std::move(qi));
11019 } else if (qi.is_peering()) {
11020 if (!qi.peering_requires_pg()) {
11021 // for pg-less events, we run them under the ordering lock, since
11022 // we don't have the pg lock to keep them ordered.
11023 qi.run(osd, sdata, pg, tp_handle);
11024 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
11025 if (create_info) {
11026 if (create_info->by_mon &&
11027 osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
11028 dout(20) << __func__ << " " << token
11029 << " no pg, no longer primary, ignoring mon create on "
11030 << qi << dendl;
11031 } else {
11032 dout(20) << __func__ << " " << token
11033 << " no pg, should create on " << qi << dendl;
11034 pg = osd->handle_pg_create_info(osdmap, create_info);
11035 if (pg) {
11036 // we created the pg! drop out and continue "normally"!
11037 sdata->_attach_pg(slot, pg.get());
11038 sdata->_wake_pg_slot(token, slot);
11039
11040 // identify split children between create epoch and shard epoch.
11041 osd->service.identify_splits_and_merges(
11042 pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
11043 sdata->_prime_splits(&new_children);
11044 // distribute remaining split children to other shards below!
11045 break;
11046 }
11047 dout(20) << __func__ << " ignored create on " << qi << dendl;
11048 }
11049 } else {
11050 dout(20) << __func__ << " " << token
11051 << " no pg, peering, !create, discarding " << qi << dendl;
11052 }
11053 } else {
11054 dout(20) << __func__ << " " << token
11055 << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
11056 << ", discarding " << qi
11057 << dendl;
11058 }
11059 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
11060 dout(20) << __func__ << " " << token
11061 << " no pg, should exist e" << osdmap->get_epoch()
11062 << ", will wait on " << qi << dendl;
11063 _add_slot_waiter(token, slot, std::move(qi));
11064 } else {
11065 dout(20) << __func__ << " " << token
11066 << " no pg, shouldn't exist e" << osdmap->get_epoch()
11067 << ", dropping " << qi << dendl;
11068 // share map with client?
11069 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
11070 osd->service.maybe_share_map((*_op)->get_req()->get_connection().get(),
11071 sdata->shard_osdmap,
11072 (*_op)->sent_epoch);
11073 }
11074 unsigned pushes_to_free = qi.get_reserved_pushes();
11075 if (pushes_to_free > 0) {
11076 sdata->shard_lock.unlock();
11077 osd->service.release_reserved_pushes(pushes_to_free);
11078 handle_oncommits(oncommits);
11079 return;
11080 }
11081 }
11082 sdata->shard_lock.unlock();
11083 handle_oncommits(oncommits);
11084 return;
11085 }
11086 if (qi.is_peering()) {
11087 OSDMapRef osdmap = sdata->shard_osdmap;
11088 if (qi.get_map_epoch() > osdmap->get_epoch()) {
11089 _add_slot_waiter(token, slot, std::move(qi));
11090 sdata->shard_lock.unlock();
11091 pg->unlock();
11092 handle_oncommits(oncommits);
11093 return;
11094 }
11095 }
11096 sdata->shard_lock.unlock();
11097
11098 if (!new_children.empty()) {
11099 for (auto shard : osd->shards) {
11100 shard->prime_splits(osdmap, &new_children);
11101 }
11102 ceph_assert(new_children.empty());
11103 }
11104
11105 // osd_opwq_process marks the point at which an operation has been dequeued
11106 // and will begin to be handled by a worker thread.
11107 {
11108 #ifdef WITH_LTTNG
11109 osd_reqid_t reqid;
11110 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
11111 reqid = (*_op)->get_reqid();
11112 }
11113 #endif
11114 tracepoint(osd, opwq_process_start, reqid.name._type,
11115 reqid.name._num, reqid.tid, reqid.inc);
11116 }
11117
11118 lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
11119 Formatter *f = Formatter::create("json");
11120 f->open_object_section("q");
11121 dump(f);
11122 f->close_section();
11123 f->flush(*_dout);
11124 delete f;
11125 *_dout << dendl;
11126
11127 qi.run(osd, sdata, pg, tp_handle);
11128
11129 {
11130 #ifdef WITH_LTTNG
11131 osd_reqid_t reqid;
11132 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
11133 reqid = (*_op)->get_reqid();
11134 }
11135 #endif
11136 tracepoint(osd, opwq_process_finish, reqid.name._type,
11137 reqid.name._num, reqid.tid, reqid.inc);
11138 }
11139
11140 handle_oncommits(oncommits);
11141 }
11142
11143 void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
11144 if (unlikely(m_fast_shutdown)) {
11145 // stop enqueuing when we are in the middle of a fast shutdown
11146 return;
11147 }
11148
11149 uint32_t shard_index =
11150 item.get_ordering_token().hash_to_shard(osd->shards.size());
11151
11152 OSDShard* sdata = osd->shards[shard_index];
11153 ceph_assert(sdata);
11154 if (sdata->get_scheduler_type() == "mClockScheduler") {
11155 item.maybe_set_is_qos_item();
11156 }
11157
11158 dout(20) << __func__ << " " << item << dendl;
11159
11160 bool empty = true;
11161 {
11162 std::lock_guard l{sdata->shard_lock};
11163 empty = sdata->scheduler->empty();
11164 sdata->scheduler->enqueue(std::move(item));
11165 }
11166
11167 {
11168 std::lock_guard l{sdata->sdata_wait_lock};
11169 if (empty) {
11170 sdata->sdata_cond.notify_all();
11171 } else if (sdata->waiting_threads) {
11172 sdata->sdata_cond.notify_one();
11173 }
11174 }
11175 }
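// Wake-up policy above: if the scheduler was empty, every worker may be
// parked, so notify_all(); otherwise notify_one() is enough, and only
// needed when some thread is parked waiting on a future-scheduled item
// (waiting_threads > 0).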
11176
11177 void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
11178 {
11179 if (unlikely(m_fast_shutdown)) {
11180 // stop enqueuing when we are in the middle of a fast shutdown
11181 return;
11182 }
11183
11184 auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
11185 auto& sdata = osd->shards[shard_index];
11186 ceph_assert(sdata);
11187 sdata->shard_lock.lock();
11188 auto p = sdata->pg_slots.find(item.get_ordering_token());
11189 if (p != sdata->pg_slots.end() &&
11190 !p->second->to_process.empty()) {
11191 // we may be racing with _process, which has dequeued a new item
11192 // from scheduler, put it on to_process, and is now busy taking the
11193 // pg lock. ensure this old requeued item is ordered before any
11194 // such newer item in to_process.
11195 p->second->to_process.push_front(std::move(item));
11196 item = std::move(p->second->to_process.back());
11197 p->second->to_process.pop_back();
11198 dout(20) << __func__
11199 << " " << p->second->to_process.front()
11200 << " shuffled w/ " << item << dendl;
11201 } else {
11202 dout(20) << __func__ << " " << item << dendl;
11203 }
11204 sdata->scheduler->enqueue_front(std::move(item));
11205 sdata->shard_lock.unlock();
11206 std::lock_guard l{sdata->sdata_wait_lock};
11207 sdata->sdata_cond.notify_one();
11208 }
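// Shuffle example: suppose a racing _process() thread has just moved a
// newer item N onto to_process and we requeue older item R. After the
// swap above, to_process = [R] and N is pushed back onto the scheduler
// front, so R is still handled before N.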
11209
11210 void OSD::ShardedOpWQ::stop_for_fast_shutdown()
11211 {
11212 m_fast_shutdown = true;
11213
11214 // Drain and discard any queued work items on every shard. Nothing
11215 // further will be enqueued once m_fast_shutdown is set, so the
11216 // queues stay empty for the remainder of shutdown.
11217 for (uint32_t shard_index = 0; shard_index < osd->num_shards; shard_index++) {
11218 auto& sdata = osd->shards[shard_index];
11219 ceph_assert(sdata);
11220 sdata->shard_lock.lock();
11221 while (!sdata->scheduler->empty()) {
11222 (void)sdata->scheduler->dequeue();
11223 }
11224 sdata->shard_lock.unlock();
11225 }
11226 }
11227
11228 namespace ceph::osd_cmds {
11229
11230 int heap(CephContext& cct,
11231 const cmdmap_t& cmdmap,
11232 std::ostream& outos,
11233 std::ostream& erros)
11234 {
11235 if (!ceph_using_tcmalloc()) {
11236 erros << "could not issue heap profiler command -- not using tcmalloc!";
11237 return -EOPNOTSUPP;
11238 }
11239
11240 string cmd;
11241 if (!cmd_getval(cmdmap, "heapcmd", cmd)) {
11242 erros << "unable to get value for command \"heapcmd\"";
11243 return -EINVAL;
11244 }
11245
11246 std::vector<std::string> cmd_vec;
11247 get_str_vec(cmd, cmd_vec);
11248
11249 string val;
11250 if (cmd_getval(cmdmap, "value", val)) {
11251 cmd_vec.push_back(val);
11252 }
11253
11254 ceph_heap_profiler_handle_command(cmd_vec, outos);
11255
11256 return 0;
11257 }
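// This handler is normally reached via the tell/admin-socket interface,
// e.g. `ceph tell osd.0 heap stats` or `ceph tell osd.0 heap
// start_profiler`; the subcommand arrives as "heapcmd" with an optional
// "value" argument appended to the vector above.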
11258
11259 } // namespace ceph::osd_cmds