]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/OSD.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / osd / OSD.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
8 *
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
13 *
14 */
15
16 #include "acconfig.h"
17
18 #include <cctype>
19 #include <fstream>
20 #include <iostream>
21 #include <iterator>
22
23 #include <unistd.h>
24 #include <sys/stat.h>
25 #include <signal.h>
26 #include <time.h>
27 #include <boost/range/adaptor/reversed.hpp>
28
29 #ifdef HAVE_SYS_PARAM_H
30 #include <sys/param.h>
31 #endif
32
33 #ifdef HAVE_SYS_MOUNT_H
34 #include <sys/mount.h>
35 #endif
36
37 #include "osd/PG.h"
38 #include "osd/scrubber/scrub_machine.h"
39 #include "osd/scrubber/pg_scrubber.h"
40
41 #include "include/types.h"
42 #include "include/compat.h"
43 #include "include/random.h"
44 #include "include/scope_guard.h"
45
46 #include "OSD.h"
47 #include "OSDMap.h"
48 #include "Watch.h"
49 #include "osdc/Objecter.h"
50
51 #include "common/errno.h"
52 #include "common/ceph_argparse.h"
53 #include "common/ceph_releases.h"
54 #include "common/ceph_time.h"
55 #include "common/version.h"
56 #include "common/async/blocked_completion.h"
57 #include "common/pick_address.h"
58 #include "common/blkdev.h"
59 #include "common/numa.h"
60
61 #include "os/ObjectStore.h"
62 #ifdef HAVE_LIBFUSE
63 #include "os/FuseStore.h"
64 #endif
65
66 #include "PrimaryLogPG.h"
67
68 #include "msg/Messenger.h"
69 #include "msg/Message.h"
70
71 #include "mon/MonClient.h"
72
73 #include "messages/MLog.h"
74
75 #include "messages/MGenericMessage.h"
76 #include "messages/MOSDPing.h"
77 #include "messages/MOSDFailure.h"
78 #include "messages/MOSDMarkMeDown.h"
79 #include "messages/MOSDMarkMeDead.h"
80 #include "messages/MOSDFull.h"
81 #include "messages/MOSDOp.h"
82 #include "messages/MOSDOpReply.h"
83 #include "messages/MOSDBackoff.h"
84 #include "messages/MOSDBeacon.h"
85 #include "messages/MOSDRepOp.h"
86 #include "messages/MOSDRepOpReply.h"
87 #include "messages/MOSDBoot.h"
88 #include "messages/MOSDPGTemp.h"
89 #include "messages/MOSDPGReadyToMerge.h"
90
91 #include "messages/MOSDMap.h"
92 #include "messages/MMonGetOSDMap.h"
93 #include "messages/MOSDPGNotify.h"
94 #include "messages/MOSDPGNotify2.h"
95 #include "messages/MOSDPGQuery2.h"
96 #include "messages/MOSDPGLog.h"
97 #include "messages/MOSDPGRemove.h"
98 #include "messages/MOSDPGInfo.h"
99 #include "messages/MOSDPGInfo2.h"
100 #include "messages/MOSDPGCreate2.h"
101 #include "messages/MBackfillReserve.h"
102 #include "messages/MRecoveryReserve.h"
103 #include "messages/MOSDForceRecovery.h"
104 #include "messages/MOSDECSubOpWrite.h"
105 #include "messages/MOSDECSubOpWriteReply.h"
106 #include "messages/MOSDECSubOpRead.h"
107 #include "messages/MOSDECSubOpReadReply.h"
108 #include "messages/MOSDPGCreated.h"
109 #include "messages/MOSDPGUpdateLogMissing.h"
110 #include "messages/MOSDPGUpdateLogMissingReply.h"
111
112 #include "messages/MOSDPeeringOp.h"
113
114 #include "messages/MOSDAlive.h"
115
116 #include "messages/MOSDScrub2.h"
117
118 #include "messages/MCommand.h"
119 #include "messages/MCommandReply.h"
120
121 #include "messages/MPGStats.h"
122
123 #include "messages/MMonGetPurgedSnaps.h"
124 #include "messages/MMonGetPurgedSnapsReply.h"
125
126 #include "common/perf_counters.h"
127 #include "common/Timer.h"
128 #include "common/LogClient.h"
129 #include "common/AsyncReserver.h"
130 #include "common/HeartbeatMap.h"
131 #include "common/admin_socket.h"
132 #include "common/ceph_context.h"
133
134 #include "global/signal_handler.h"
135 #include "global/pidfile.h"
136
137 #include "include/color.h"
138 #include "perfglue/cpu_profiler.h"
139 #include "perfglue/heap_profiler.h"
140
141 #include "osd/ClassHandler.h"
142 #include "osd/OpRequest.h"
143
144 #include "auth/AuthAuthorizeHandler.h"
145 #include "auth/RotatingKeyRing.h"
146
147 #include "objclass/objclass.h"
148
149 #include "common/cmdparse.h"
150 #include "include/str_list.h"
151 #include "include/util.h"
152
153 #include "include/ceph_assert.h"
154 #include "common/config.h"
155 #include "common/EventTrace.h"
156
157 #include "json_spirit/json_spirit_reader.h"
158 #include "json_spirit/json_spirit_writer.h"
159
160 #ifdef WITH_LTTNG
161 #define TRACEPOINT_DEFINE
162 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
163 #include "tracing/osd.h"
164 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
165 #undef TRACEPOINT_DEFINE
166 #else
167 #define tracepoint(...)
168 #endif
169
170 #include "osd_tracer.h"
171
172
173 #define dout_context cct
174 #define dout_subsys ceph_subsys_osd
175 #undef dout_prefix
176 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
177
178 using std::deque;
179 using std::list;
180 using std::lock_guard;
181 using std::make_pair;
182 using std::make_tuple;
183 using std::make_unique;
184 using std::map;
185 using std::ostream;
186 using std::ostringstream;
187 using std::pair;
188 using std::set;
189 using std::string;
190 using std::stringstream;
191 using std::to_string;
192 using std::unique_ptr;
193 using std::vector;
194
195 using ceph::bufferlist;
196 using ceph::bufferptr;
197 using ceph::decode;
198 using ceph::encode;
199 using ceph::fixed_u_to_string;
200 using ceph::Formatter;
201 using ceph::heartbeat_handle_d;
202 using ceph::make_mutex;
203
204 using namespace ceph::osd::scheduler;
205 using TOPNSPC::common::cmd_getval;
206 using TOPNSPC::common::cmd_getval_or;
207
208 static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
209 return *_dout << "osd." << whoami << " " << epoch << " ";
210 }
211
212
213 //Initial features in new superblock.
214 //Features here are also automatically upgraded
215 CompatSet OSD::get_osd_initial_compat_set() {
216 CompatSet::FeatureSet ceph_osd_feature_compat;
217 CompatSet::FeatureSet ceph_osd_feature_ro_compat;
218 CompatSet::FeatureSet ceph_osd_feature_incompat;
219 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
220 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
221 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
222 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
223 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
224 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
225 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
226 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
227 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
228 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
229 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
230 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
231 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
232 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
233 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
234 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2);
235 return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
236 ceph_osd_feature_incompat);
237 }
238
239 //Features are added here that this OSD supports.
240 CompatSet OSD::get_osd_compat_set() {
241 CompatSet compat = get_osd_initial_compat_set();
242 //Any features here can be set in code, but not in initial superblock
243 compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
244 return compat;
245 }
246
// OSDService holds the state shared between the OSD and its PGs.  The
// constructor only wires up references back to the owning OSD and
// value-initializes timers, reservers and caches; threads and timers are
// actually started later (see init()/final_init()).
OSDService::OSDService(OSD *osd, ceph::async::io_context_pool& poolctx) :
  osd(osd),
  cct(osd->cct),
  whoami(osd->whoami), store(osd->store.get()),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  // config-backed values, keyed by option name
  osd_max_object_size(cct->_conf, "osd_max_object_size"),
  osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
  publish_lock{ceph::make_mutex("OSDService::publish_lock")},
  pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
  m_scrub_queue{cct, *this},
  // cache-tiering agent state (see agent_entry())
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  // promotion throttle state (see promote_throttle_recalibrate())
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  poolctx(poolctx),
  objecter(make_unique<Objecter>(osd->client_messenger->cct,
				 osd->objecter_messenger,
				 osd->monc, poolctx)),
  m_objecter_finishers(cct->_conf->osd_objecter_finishers),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_timer(cct, recovery_request_lock, false),
  sleep_timer(cct, sleep_lock, false),
  reserver_finisher(cct),
  // local/remote backfill reservations and snap-trim reservations all
  // share the same finisher thread
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		  cct->_conf->osd_min_recovery_priority),
  snap_reserver(cct, &reserver_finisher,
		cct->_conf->osd_max_trimming_pgs),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  // OSDMap caches: decoded maps, encoded full maps, encoded incrementals
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  // fullness tracking starts empty until the first statfs/map arrives
  cur_state(NONE),
  cur_ratio(0), physical_ratio(0),
  boot_epoch(0), up_epoch(0), bind_epoch(0)
{
  objecter->init();

  // One named finisher per objecter shard (count is configurable).
  for (int i = 0; i < m_objecter_finishers; i++) {
    ostringstream str;
    str << "objecter-finisher-" << i;
    auto fin = make_unique<Finisher>(osd->client_messenger->cct, str.str(), "finisher");
    objecter_finishers.push_back(std::move(fin));
  }
}
308
309 #ifdef PG_DEBUG_REFS
// Debug-ref tracking: count one more live reference to pgid, remembering
// the PG* the first time so dump_live_pgids() can find it later.
void OSDService::add_pgid(spg_t pgid, PG *pg) {
  std::lock_guard l(pgid_lock);
  if (!pgid_tracker.count(pgid)) {
    live_pgs[pgid] = pg;
  }
  pgid_tracker[pgid]++;
}
// Debug-ref tracking: drop one reference for pgid; forget the PG entirely
// once the count reaches zero.  Asserts if the pgid was never tracked.
void OSDService::remove_pgid(spg_t pgid, PG *pg)
{
  std::lock_guard l(pgid_lock);
  ceph_assert(pgid_tracker.count(pgid));
  ceph_assert(pgid_tracker[pgid] > 0);
  pgid_tracker[pgid]--;
  if (pgid_tracker[pgid] == 0) {
    pgid_tracker.erase(pgid);
    live_pgs.erase(pgid);
  }
}
328 void OSDService::dump_live_pgids()
329 {
330 std::lock_guard l(pgid_lock);
331 derr << "live pgids:" << dendl;
332 for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
333 i != pgid_tracker.cend();
334 ++i) {
335 derr << "\t" << *i << dendl;
336 live_pgs[i->first]->dump_live_ids();
337 }
338 }
339 #endif
340
341
// Monotonic time elapsed since this OSD process started.
ceph::signedspan OSDService::get_mnow() const
{
  return ceph::mono_clock::now() - osd->startup_time;
}
346
/**
 * Replay the pool's recorded pg_num changes between old_map and new_map
 * and collect every split and merge that involves `pgid` (or any PG it
 * splits into / merges with along the way).
 *
 * @param old_map        map we are advancing from
 * @param new_map        map we are advancing to
 * @param pgid           starting PG
 * @param split_children filled with (child pgid, epoch) for each split
 * @param merge_pgs      if non-null, filled with (pgid, epoch) for every
 *                       participant (sources and target) of each merge
 */
void OSDService::identify_splits_and_merges(
  OSDMapRef old_map,
  OSDMapRef new_map,
  spg_t pgid,
  set<pair<spg_t,epoch_t>> *split_children,
  set<pair<spg_t,epoch_t>> *merge_pgs)
{
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
	   << " to e" << new_map->get_epoch() << dendl;
  if (!old_map->have_pg_pool(pgid.pool())) {
    dout(20) << __func__ << " " << pgid << " pool " << pgid.pool()
	     << " does not exist in old map" << dendl;
    return;
  }
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
  if (p == osd->pg_num_history.pg_nums.end()) {
    dout(20) << __func__ << " " << pgid << " pool " << pgid.pool()
	     << " has no history" << dendl;
    return;
  }
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
	   << " to e" << new_map->get_epoch()
	   << " pg_nums " << p->second << dendl;
  // Breadth-first walk over pgids: each pgid discovered as a split child
  // or merge participant is itself re-scanned against the full history.
  deque<spg_t> queue;
  queue.push_back(pgid);
  set<spg_t> did;
  while (!queue.empty()) {
    auto cur = queue.front();
    queue.pop_front();
    did.insert(cur);
    unsigned pgnum = old_pgnum;
    // Replay pg_num changes from the first one at/after old_map's epoch
    // through new_map's epoch.
    for (auto q = p->second.lower_bound(old_map->get_epoch());
	 q != p->second.end() &&
	   q->first <= new_map->get_epoch();
	 ++q) {
      if (pgnum < q->second) {
	// split?
	if (cur.ps() < pgnum) {
	  set<spg_t> children;
	  if (cur.is_split(pgnum, q->second, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " children " << children << dendl;
	    for (auto i : children) {
	      split_children->insert(make_pair(i, q->first));
	      if (!did.count(i))
		queue.push_back(i);
	    }
	  }
	} else if (cur.ps() < q->second) {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is a child" << dendl;
	  // normally we'd capture this from the parent, but it's
	  // possible the parent doesn't exist yet (it will be
	  // fabricated to allow an intervening merge). note this PG
	  // as a split child here to be sure we catch it.
	  split_children->insert(make_pair(cur, q->first));
	} else {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is post-split, skipping" << dendl;
	}
      } else if (merge_pgs) {
	// merge?
	if (cur.ps() >= q->second) {
	  if (cur.ps() < pgnum) {
	    spg_t parent;
	    if (cur.is_merge_source(pgnum, q->second, &parent)) {
	      set<spg_t> children;
	      parent.is_split(q->second, pgnum, &children);
	      dout(20) << __func__ << " " << cur << " e" << q->first
		       << " pg_num " << pgnum << " -> " << q->second
		       << " is merge source, target " << parent
		       << ", source(s) " << children << dendl;
	      merge_pgs->insert(make_pair(parent, q->first));
	      if (!did.count(parent)) {
		// queue (and re-scan) parent in case it might not exist yet
		// and there are some future splits pending on it
		queue.push_back(parent);
	      }
	      for (auto c : children) {
		merge_pgs->insert(make_pair(c, q->first));
		if (!did.count(c))
		  queue.push_back(c);
	      }
	    }
	  } else {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is beyond old pgnum, skipping" << dendl;
	  }
	} else {
	  // cur survives the merge as the target: record it and all of
	  // the sources that collapse into it.
	  set<spg_t> children;
	  if (cur.is_split(q->second, pgnum, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is merge target, source " << children << dendl;
	    for (auto c : children) {
	      merge_pgs->insert(make_pair(c, q->first));
	      if (!did.count(c))
		queue.push_back(c);
	    }
	    merge_pgs->insert(make_pair(cur, q->first));
	  }
	}
      }
      pgnum = q->second;
    }
  }
}
459
// Forward the notification to the owning OSD, which maintains the
// heartbeat peer set.
void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}
464
465 HeartbeatStampsRef OSDService::get_hb_stamps(unsigned peer)
466 {
467 std::lock_guard l(hb_stamp_lock);
468 if (peer >= hb_stamps.size()) {
469 hb_stamps.resize(peer + 1);
470 }
471 if (!hb_stamps[peer]) {
472 hb_stamps[peer] = ceph::make_ref<HeartbeatStamps>(peer);
473 }
474 return hb_stamps[peer];
475 }
476
477 void OSDService::queue_renew_lease(epoch_t epoch, spg_t spgid)
478 {
479 osd->enqueue_peering_evt(
480 spgid,
481 PGPeeringEventRef(
482 std::make_shared<PGPeeringEvent>(
483 epoch, epoch,
484 RenewLease())));
485 }
486
// First phase of shutdown: stop the timers (agent, sleep, recovery
// request) so no further delayed callbacks get queued while the OSD
// tears down.  Each timer is shut down under its own lock.
void OSDService::start_shutdown()
{
  {
    std::lock_guard l(agent_timer_lock);
    agent_timer.shutdown();
  }

  {
    std::lock_guard l(sleep_lock);
    sleep_timer.shutdown();
  }

  {
    std::lock_guard l(recovery_request_lock);
    recovery_request_timer.shutdown();
  }
}
504
// Drain any outstanding reservation callbacks, then stop their finisher.
void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}
510
// Final shutdown: stop the mono timer and watch timer, shut down the
// objecter and drain its finishers, then drop our OSDMap references so
// nothing can be published again.
void OSDService::shutdown()
{
  mono_timer.suspend();

  {
    std::lock_guard l(watch_lock);
    watch_timer.shutdown();
  }

  objecter->shutdown();
  for (auto& f : objecter_finishers) {
    f->wait_for_empty();
    f->stop();
  }

  // Clear both the published and the pre-published map references.
  publish_map(OSDMapRef());
  next_osdmap = OSDMapRef();
}
529
// Start the service's threads and timers (counterpart of the lightweight
// constructor): finishers, objecter, watch/agent timers, agent thread.
void OSDService::init()
{
  reserver_finisher.start();
  for (auto& f : objecter_finishers) {
    f->start();
  }
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();
  mono_timer.resume();

  agent_thread.create("osd_srv_agent");

  // Optionally hold off recovery for a configured number of seconds
  // after startup.
  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}
550
// Last stage of initialization: start the objecter once an osdmap is
// available.
void OSDService::final_init()
{
  objecter->start(osdmap.get());
}
555
556 void OSDService::activate_map()
557 {
558 // wake/unwake the tiering agent
559 std::lock_guard l{agent_lock};
560 agent_active =
561 !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
562 osd->is_active();
563 agent_cond.notify_all();
564 }
565
566 OSDMapRef OSDService::get_nextmap_reserved() {
567 std::lock_guard l(pre_publish_lock);
568
569 epoch_t e = next_osdmap->get_epoch();
570
571 std::map<epoch_t, unsigned>::iterator i =
572 map_reservations.insert(std::make_pair(e, 0)).first;
573 i->second++;
574 dout(20) << __func__ << " map_reservations: " << map_reservations << dendl;
575 return next_osdmap;
576 }
577
578 /// releases reservation on map
579 void OSDService::release_map(OSDMapRef osdmap) {
580 std::lock_guard l(pre_publish_lock);
581 dout(20) << __func__ << " epoch: " << osdmap->get_epoch() << dendl;
582 std::map<epoch_t, unsigned>::iterator i =
583 map_reservations.find(osdmap->get_epoch());
584 ceph_assert(i != map_reservations.end());
585 ceph_assert(i->second > 0);
586 if (--(i->second) == 0) {
587 map_reservations.erase(i);
588 }
589 if (pre_publish_waiter) {
590 dout(20) << __func__ << " notify all." << dendl;
591 pre_publish_cond.notify_all();
592 }
593 }
594
595 /// blocks until there are no reserved maps prior to next_osdmap
void OSDService::await_reserved_maps() {
  std::unique_lock l{pre_publish_lock};
  dout(20) << __func__ << " epoch:" << next_osdmap->get_epoch() << dendl;

  ceph_assert(next_osdmap);
  // Register as a waiter so release_map() knows to notify us, then wait
  // until the oldest remaining reservation (map_reservations is sorted by
  // epoch) is at or past next_osdmap's epoch.
  pre_publish_waiter++;
  pre_publish_cond.wait(l, [this] {
    auto i = map_reservations.cbegin();
    return (i == map_reservations.cend() ||
	    i->first >= next_osdmap->get_epoch());
  });
  pre_publish_waiter--;
  dout(20) << __func__ << " done " << pre_publish_waiter << dendl;
}
610
// Ask the OSD to subscribe for osdmap epoch e.
// NOTE(review): second argument's meaning is defined by
// OSD::osdmap_subscribe (not visible here) — confirm before documenting.
void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}
615
616
617 class AgentTimeoutCB : public Context {
618 PGRef pg;
619 public:
620 explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
621 void finish(int) override {
622 pg->agent_choose_mode_restart();
623 }
624 };
625
// Body of the tiering agent thread: repeatedly pick the highest-priority
// tier in agent_queue and let one of its PGs do flush/evict work, sleeping
// on agent_cond whenever there is nothing to do or no op budget left.
void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock agent_locker{agent_lock};

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.wait(agent_locker);
      continue;
    }
    // The highest level (rbegin) is serviced first.
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
	     << " tiers " << agent_queue.size()
	     << ", top is " << level
	     << " with pgs " << top.size()
	     << ", ops " << agent_ops << "/"
	     << cct->_conf->osd_agent_max_ops
	     << (agent_active ? " active" : " NOT ACTIVE")
	     << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    // Op budget: the lower limit applies unless some PG is currently in
    // high-speed flush mode.
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.wait(agent_locker);
      continue;
    }

    // Round-robin across the PGs of the top tier; the iterator is
    // invalidated whenever the queue is modified (agent_valid_iterator).
    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
	     << " agent_ops " << agent_ops
	     << " flush_quota " << agent_flush_quota << dendl;
    // Drop the lock while the PG does (potentially slow) agent work.
    agent_locker.unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->pg_id
	<< " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
	<< " seconds" << dendl;

      logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg in 5 seconds
      std::lock_guard timer_locker{agent_timer_lock};
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
    }
    agent_locker.lock();
  }
  dout(10) << __func__ << " finish" << dendl;
}
681
// Stop the tiering agent thread.  Expects all agent ops to be cancelled
// and all PGs to have been dequeued already; aborts otherwise.
void OSDService::agent_stop()
{
  {
    std::lock_guard l(agent_lock);

    // By this time all ops should be cancelled
    ceph_assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
      ceph_abort_msg("agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.notify_all();
  }
  // Join outside the lock so agent_entry() can finish its loop.
  agent_thread.join();
}
701
702 // -------------------------------------
703
// Recalibrate promote_probability_millis (a probability expressed in
// thousandths, clamped to [min_prob, 1000]) so that the observed
// promotion rate tracks the configured objects/sec and bytes/sec targets,
// then set hard per-interval caps to mitigate stampedes.
void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
	   << obj << " objects and " << byte_u_t(bytes) << "; target "
	   << target_obj_sec << " obj/sec or "
	   << byte_u_t(target_bytes_sec) << "/sec"
	   << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = std::max<uint64_t>(bytes / obj, 1);
    // po/pb: per-mille probabilities implied by the object and byte
    // targets respectively, given the observed attempt rate.
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
	     << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = std::min(po, pb);
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;
  } else {
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = std::max(new_prob, min_prob);
  new_prob = std::min(new_prob, 1000u);

  // adjust
  // Blend halfway toward the computed value to damp oscillation.
  prob = (prob + new_prob) / 2;
  prob = std::max(prob, min_prob);
  prob = std::min(prob, 1000u);
  dout(10) << __func__ << " actual " << actual
	   << ", actual/prob ratio " << ratio
	   << ", adjusted new_prob " << new_prob
	   << ", prob " << promote_probability_millis << " -> " << prob
	   << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
}
774
775 // -------------------------------------
776
777 float OSDService::get_failsafe_full_ratio()
778 {
779 float full_ratio = cct->_conf->osd_failsafe_full_ratio;
780 if (full_ratio > 1.0) full_ratio /= 100.0;
781 return full_ratio;
782 }
783
// Map the usage ratios onto the fullness ladder (NONE .. FAILSAFE).
// `ratio` is the (possibly adjusted) usage ratio, `pratio` the physical
// one; `inject` is set to a marker when an injected (test) state wins.
OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
{
  // The OSDMap ratios take precendence. So if the failsafe is .95 and
  // the admin sets the cluster full to .96, the failsafe moves up to .96
  // too. (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the clusters appears full.)
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
    // No usable map yet, so no thresholds to compare against.
    return NONE;
  }
  // Force nearfull <= backfillfull <= full <= failsafe.
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < ceph_release_t::luminous) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
	     backfillfull_ratio <= 0 ||
	     nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use failsafe flag. ick. the monitor did something wrong or the user
    // did something stupid.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  }

  // Injection (test hook) wins; otherwise note that failsafe and nearfull
  // are judged on the physical ratio, full/backfillfull on `ratio`.
  if (injectfull_state > NONE && injectfull) {
    inject = "(Injected)";
    return injectfull_state;
  } else if (pratio > failsafe_ratio) {
    return FAILSAFE;
  } else if (ratio > full_ratio) {
    return FULL;
  } else if (ratio > backfillfull_ratio) {
    return BACKFILLFULL;
  } else if (pratio > nearfull_ratio) {
    return NEARFULL;
  }
  return NONE;
}
830
// Recompute the cached fullness state from the latest usage ratios, log
// state transitions, and shout to the cluster log when crossing into or
// out of FAILSAFE (where updates are dropped).
void OSDService::check_full_status(float ratio, float pratio)
{
  std::lock_guard l(full_status_lock);

  // Cache the latest ratios for _check_full() logging.
  cur_ratio = ratio;
  physical_ratio = pratio;

  string inject;
  s_names new_state;
  new_state = recalc_full_state(ratio, pratio, inject);

  dout(20) << __func__ << " cur ratio " << ratio
	   << ", physical ratio " << pratio
	   << ", new state " << get_full_state_name(new_state)
	   << " " << inject
	   << dendl;

  // warn
  if (cur_state != new_state) {
    dout(10) << __func__ << " " << get_full_state_name(cur_state)
	     << " -> " << get_full_state_name(new_state) << dendl;
    if (new_state == FAILSAFE) {
      clog->error() << "full status failsafe engaged, dropping updates, now "
		    << (int)roundf(ratio * 100) << "% full";
    } else if (cur_state == FAILSAFE) {
      clog->error() << "full status failsafe disengaged, no longer dropping "
		    << "updates, now " << (int)roundf(ratio * 100) << "% full";
    }
    cur_state = new_state;
  }
}
862
863 bool OSDService::need_fullness_update()
864 {
865 OSDMapRef osdmap = get_osdmap();
866 s_names cur = NONE;
867 if (osdmap->exists(whoami)) {
868 if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
869 cur = FULL;
870 } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
871 cur = BACKFILLFULL;
872 } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
873 cur = NEARFULL;
874 }
875 }
876 s_names want = NONE;
877 if (is_full())
878 want = FULL;
879 else if (is_backfillfull())
880 want = BACKFILLFULL;
881 else if (is_nearfull())
882 want = NEARFULL;
883 return want != cur;
884 }
885
// Test hook: report fullness of at least `type` when an injection is
// armed (see set_injectfull()); decrements the remaining-use counter.
// Callers hold full_status_lock.
bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
{
  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to return failsafe full
    // or if -1 then always return full
    if (injectfull > 0)
      --injectfull;
    ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
		       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
		       << dendl;
    return true;
  }
  return false;
}
900
901 bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
902 {
903 std::lock_guard l(full_status_lock);
904
905 if (_check_inject_full(dpp, type))
906 return true;
907
908 if (cur_state >= type)
909 ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
910 << " physical " << physical_ratio << dendl;
911
912 return cur_state >= type;
913 }
914
// Like _check_full(), but evaluates what the fullness state WOULD be if
// `adjust_used` additional bytes were consumed, using the caller-supplied
// stats snapshot (used e.g. before accepting a backfill).
bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
{
  ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
  {
    // Injected (test) state short-circuits the computation.
    std::lock_guard l(full_status_lock);
    if (_check_inject_full(dpp, type)) {
      return true;
    }
  }

  float pratio;
  float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);

  string notused;
  s_names tentative_state = recalc_full_state(ratio, pratio, notused);

  if (tentative_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;

  return tentative_state >= type;
}
936
// At or above the FAILSAFE threshold (or injected)?
bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FAILSAFE);
}
941
// At or above the FULL threshold (or injected)?
bool OSDService::check_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FULL);
}
946
// Would we be at or above BACKFILLFULL if `adjust_used` more bytes were
// consumed, given the supplied stats snapshot?
bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
{
  return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
}
951
// At or above the BACKFILLFULL threshold (or injected)?
bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, BACKFILLFULL);
}
956
// At or above the NEARFULL threshold (or injected)?
bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, NEARFULL);
}
961
// Cached state is exactly FAILSAFE.
bool OSDService::is_failsafe_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state == FAILSAFE;
}
967
// Cached state is FULL or worse.
bool OSDService::is_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= FULL;
}
973
// Cached state is BACKFILLFULL or worse.
bool OSDService::is_backfillfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}
979
// Cached state is NEARFULL or worse.
bool OSDService::is_nearfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= NEARFULL;
}
985
// Test hook: arm fullness injection.  `count` is the number of times the
// injected state will be reported (per _check_inject_full, -1 = always).
void OSDService::set_injectfull(s_names type, int64_t count)
{
  std::lock_guard l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}
992
// Record a fresh statfs sample from the object store: update the perf
// counters and the cached osd_stat used for reporting.  `alerts` carries
// store health alerts to attach to this OSD's stats.
void OSDService::set_statfs(const struct store_statfs_t &stbuf,
			    osd_alert_list_t& alerts)
{
  uint64_t bytes = stbuf.total;
  uint64_t avail = stbuf.available;
  uint64_t used = stbuf.get_used_raw();

  // For testing fake statfs values so it doesn't matter if all
  // OSDs are using the same partition.
  if (cct->_conf->fake_statfs_for_testing) {
    // Derive "available" from the fake total minus the bytes our PGs
    // claim to be using.
    uint64_t total_num_bytes = 0;
    vector<PGRef> pgs;
    osd->_get_pgs(&pgs);
    for (auto p : pgs) {
      total_num_bytes += p->get_stats_num_bytes();
    }
    bytes = cct->_conf->fake_statfs_for_testing;
    if (total_num_bytes < bytes)
      avail = bytes - total_num_bytes;
    else
      avail = 0;
    dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
	    << " adjust available " << avail
	    << dendl;
    used = bytes - avail;
  }

  logger->set(l_osd_stat_bytes, bytes);
  logger->set(l_osd_stat_bytes_used, used);
  logger->set(l_osd_stat_bytes_avail, avail);

  // Only the osd_stat updates below need the lock; the perf counters
  // above are set outside it.
  std::lock_guard l(stat_lock);
  osd_stat.statfs = stbuf;
  osd_stat.os_alerts.clear();
  osd_stat.os_alerts[whoami].swap(alerts);
  if (cct->_conf->fake_statfs_for_testing) {
    osd_stat.statfs.total = bytes;
    osd_stat.statfs.available = avail;
    // For testing don't want used to go negative, so clear reserved
    osd_stat.statfs.internally_reserved = 0;
  }
}
1035
1036 osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
1037 int num_pgs)
1038 {
1039 utime_t now = ceph_clock_now();
1040 auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
1041 std::lock_guard l(stat_lock);
1042 osd_stat.hb_peers.swap(hb_peers);
1043 osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
1044 osd_stat.num_pgs = num_pgs;
1045 // Clean entries that aren't updated
1046 // This is called often enough that we can just remove 1 at a time
1047 for (auto i: osd_stat.hb_pingtime) {
1048 if (i.second.last_update == 0)
1049 continue;
1050 if (stale_time && now.sec() - i.second.last_update > stale_time) {
1051 dout(20) << __func__ << " time out heartbeat for osd " << i.first
1052 << " last_update " << i.second.last_update << dendl;
1053 osd_stat.hb_pingtime.erase(i.first);
1054 break;
1055 }
1056 }
1057 return osd_stat;
1058 }
1059
1060 void OSDService::inc_osd_stat_repaired()
1061 {
1062 std::lock_guard l(stat_lock);
1063 osd_stat.num_shards_repaired++;
1064 return;
1065 }
1066
1067 float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
1068 uint64_t adjust_used)
1069 {
1070 *pratio =
1071 ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);
1072
1073 if (adjust_used) {
1074 dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
1075 if (new_stat.statfs.available > adjust_used)
1076 new_stat.statfs.available -= adjust_used;
1077 else
1078 new_stat.statfs.available = 0;
1079 dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
1080 }
1081
1082 // Check all pgs and adjust kb_used to include all pending backfill data
1083 int backfill_adjusted = 0;
1084 vector<PGRef> pgs;
1085 osd->_get_pgs(&pgs);
1086 for (auto p : pgs) {
1087 backfill_adjusted += p->pg_stat_adjust(&new_stat);
1088 }
1089 if (backfill_adjusted) {
1090 dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
1091 }
1092 return ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);
1093 }
1094
// Send 'm' to osd.<peer> over the cluster messenger, but only if the peer
// is still up and has not restarted since 'from_epoch'.  On the drop path
// the message is consumed (put()).  The reserved "next" map is released on
// every exit path.
void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
{
  dout(20) << __func__ << " " << m->get_type_name() << " to osd." << peer
           << " from_epoch " << from_epoch << dendl;
  // pin the next map so it cannot be trimmed out from under us
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    // peer went down (or rebooted) since the caller decided to send
    m->put();
    release_map(next_map);
    return;
  }
  ConnectionRef peer_con;
  if (peer == whoami) {
    // self-send short-circuits through the loopback connection
    peer_con = osd->cluster_messenger->get_loopback_connection();
  } else {
    peer_con = osd->cluster_messenger->connect_to_osd(
      next_map->get_cluster_addrs(peer), false, true);
  }
  maybe_share_map(peer_con.get(), next_map);
  peer_con->send_message(m);
  release_map(next_map);
}
1120
// Batched variant of send_message_osd_cluster(): deliver each
// (peer, message) pair under a single map reservation.  Messages whose
// peer is down or restarted since 'from_epoch' are dropped (put()).
void OSDService::send_message_osd_cluster(std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch)
{
  dout(20) << __func__ << " from_epoch " << from_epoch << dendl;
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  for (auto& iter : messages) {
    if (next_map->is_down(iter.first) ||
        next_map->get_info(iter.first).up_from > from_epoch) {
      // this peer is gone; consume the message and keep going
      iter.second->put();
      continue;
    }
    ConnectionRef peer_con;
    if (iter.first == whoami) {
      peer_con = osd->cluster_messenger->get_loopback_connection();
    } else {
      peer_con = osd->cluster_messenger->connect_to_osd(
        next_map->get_cluster_addrs(iter.first), false, true);
    }
    maybe_share_map(peer_con.get(), next_map);
    peer_con->send_message(iter.second);
  }
  release_map(next_map);
}
// Return a cluster-messenger connection to osd.<peer>, or NULL if the
// peer is down or has restarted since 'from_epoch'.
ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
{
  dout(20) << __func__ << " to osd." << peer
           << " from_epoch " << from_epoch << dendl;
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return NULL;
  }
  ConnectionRef con;
  if (peer == whoami) {
    // connection to ourselves uses the loopback path
    con = osd->cluster_messenger->get_loopback_connection();
  } else {
    con = osd->cluster_messenger->connect_to_osd(
      next_map->get_cluster_addrs(peer), false, true);
  }
  release_map(next_map);
  return con;
}
1169
// Return the heartbeat connections (back, front) to osd.<peer>, or a pair
// of null refs if the peer is down or restarted since 'from_epoch'.
pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
{
  dout(20) << __func__ << " to osd." << peer
           << " from_epoch " << from_epoch << dendl;
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  pair<ConnectionRef,ConnectionRef> ret;
  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return ret;  // both refs null
  }
  ret.first = osd->hb_back_client_messenger->connect_to_osd(
    next_map->get_hb_back_addrs(peer));
  ret.second = osd->hb_front_client_messenger->connect_to_osd(
    next_map->get_hb_front_addrs(peer));
  release_map(next_map);
  return ret;
}
1191
// Name this OSD uses on the cluster (backend) messenger.
entity_name_t OSDService::get_cluster_msgr_name() const
{
  return cluster_messenger->get_myname();
}
1196
// Record that we want the monitor to install a pg_temp mapping of 'want'
// for 'pgid'.  The request is suppressed only when an identical request
// is already pending with the mon and the caller did not force it.
void OSDService::queue_want_pg_temp(pg_t pgid,
                                    const vector<int>& want,
                                    bool forced)
{
  std::lock_guard l(pg_temp_lock);
  auto p = pg_temp_pending.find(pgid);
  if (p == pg_temp_pending.end() ||
      p->second.acting != want ||
      forced) {
    pg_temp_wanted[pgid] = {want, forced};
  }
}
1209
// Drop any wanted or mon-pending pg_temp request for 'pgid'.
void OSDService::remove_want_pg_temp(pg_t pgid)
{
  std::lock_guard l(pg_temp_lock);
  pg_temp_wanted.erase(pgid);
  pg_temp_pending.erase(pgid);
}
1216
// Move everything we just sent to the mon from "wanted" to "pending"
// (awaiting the mon's response).  Caller must hold pg_temp_lock — see
// send_pg_temp() / requeue_pg_temp().
void OSDService::_sent_pg_temp()
{
#ifdef HAVE_STDLIB_MAP_SPLICING
  // splice nodes without reallocating
  pg_temp_pending.merge(pg_temp_wanted);
#else
  // fallback: move-insert element by element
  pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
                         make_move_iterator(end(pg_temp_wanted)));
#endif
  pg_temp_wanted.clear();
}
1227
// Re-arm every pg_temp request (e.g. after losing the mon session):
// merge wanted into pending, then swap the union back into "wanted" so
// the next send_pg_temp() retransmits all of it.
void OSDService::requeue_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  // wanted overrides pending. note that remove_want_pg_temp
  // clears the item out of both.
  unsigned old_wanted = pg_temp_wanted.size();
  unsigned old_pending = pg_temp_pending.size();
  _sent_pg_temp();
  pg_temp_wanted.swap(pg_temp_pending);
  dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
           << pg_temp_wanted.size() << dendl;
}
1240
1241 std::ostream& operator<<(std::ostream& out,
1242 const OSDService::pg_temp_t& pg_temp)
1243 {
1244 out << pg_temp.acting;
1245 if (pg_temp.forced) {
1246 out << " (forced)";
1247 }
1248 return out;
1249 }
1250
// Flush all wanted pg_temp requests to the monitor.  Forced and
// non-forced requests travel in separate MOSDPGTemp messages; ms[] is
// indexed by the boolean 'forced' flag (0 = normal, 1 = forced).
// Caller-visible state transition is handled by _sent_pg_temp().
void OSDService::send_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  if (pg_temp_wanted.empty())
    return;
  dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
  MOSDPGTemp *ms[2] = {nullptr, nullptr};
  for (auto& [pgid, pg_temp] : pg_temp_wanted) {
    // lazily create the message for this forced-ness bucket
    auto& m = ms[pg_temp.forced];
    if (!m) {
      m = new MOSDPGTemp(osdmap->get_epoch());
      m->forced = pg_temp.forced;
    }
    m->pg_temp.emplace(pgid, pg_temp.acting);
  }
  for (auto m : ms) {
    if (m) {
      monc->send_mon_message(m);
    }
  }
  _sent_pg_temp();
}
1273
// Tell the monitor that 'pgid' has been created, and remember it so it
// can be re-announced later (see the no-argument overload).  Only done
// on clusters at luminous or newer.
void OSDService::send_pg_created(pg_t pgid)
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  if (o->require_osd_release >= ceph_release_t::luminous) {
    pg_created.insert(pgid);
    monc->send_mon_message(new MOSDPGCreated(pgid));
  }
}
1284
// Re-announce every remembered pg-created notification to the monitor
// (presumably used after re-establishing the mon session — confirm at
// the call sites).
void OSDService::send_pg_created()
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  if (o->require_osd_release >= ceph_release_t::luminous) {
    for (auto pgid : pg_created) {
      monc->send_mon_message(new MOSDPGCreated(pgid));
    }
  }
}
1296
// Forget pg-created notifications whose pool is gone or no longer in the
// CREATING state, so we stop re-announcing them.
void OSDService::prune_pg_created()
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  auto i = pg_created.begin();
  while (i != pg_created.end()) {
    auto p = o->get_pg_pool(i->pool());
    if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
      dout(20) << __func__ << " pruning " << *i << dendl;
      // erase returns the next valid iterator
      i = pg_created.erase(i);
    } else {
      dout(20) << __func__ << " keeping " << *i << dendl;
      ++i;
    }
  }
}
1314
1315
1316 // --------------------------------------
1317 // dispatch
1318
1319 void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
1320 epoch_t *_bind_epoch) const
1321 {
1322 std::lock_guard l(epoch_lock);
1323 if (_boot_epoch)
1324 *_boot_epoch = boot_epoch;
1325 if (_up_epoch)
1326 *_up_epoch = up_epoch;
1327 if (_bind_epoch)
1328 *_bind_epoch = bind_epoch;
1329 }
1330
1331 void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
1332 const epoch_t *_bind_epoch)
1333 {
1334 std::lock_guard l(epoch_lock);
1335 if (_boot_epoch) {
1336 ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
1337 boot_epoch = *_boot_epoch;
1338 }
1339 if (_up_epoch) {
1340 ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
1341 up_epoch = *_up_epoch;
1342 }
1343 if (_bind_epoch) {
1344 ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
1345 bind_epoch = *_bind_epoch;
1346 }
1347 }
1348
// Begin an orderly shutdown.  If we are up in the map, ask the mon to
// mark us down-and-dead and wait (bounded by osd_mon_shutdown_timeout)
// for got_stop_ack() to flip the state to STOPPING; then proceed either
// way.  Returns false if a stop is already in progress.
bool OSDService::prepare_to_stop()
{
  std::unique_lock l(is_stopping_lock);
  if (get_state() != NOT_STOPPING)
    return false;  // someone else already started stopping

  OSDMapRef osdmap = get_osdmap();
  if (osdmap && osdmap->is_up(whoami)) {
    dout(0) << __func__ << " telling mon we are shutting down and dead " << dendl;
    set_state(PREPARING_TO_STOP);
    monc->send_mon_message(
      new MOSDMarkMeDown(
        monc->get_fsid(),
        whoami,
        osdmap->get_addrs(whoami),
        osdmap->get_epoch(),
        true, // request ack
        true // mark as down and dead
        ));
    // wait for the ack (or time out) before continuing the shutdown
    const auto timeout = ceph::make_timespan(cct->_conf->osd_mon_shutdown_timeout);
    is_stopping_cond.wait_for(l, timeout,
      [this] { return get_state() == STOPPING; });
  }

  dout(0) << __func__ << " starting shutdown" << dendl;
  set_state(STOPPING);
  return true;
}
1377
1378 void OSDService::got_stop_ack()
1379 {
1380 std::scoped_lock l(is_stopping_lock);
1381 if (get_state() == PREPARING_TO_STOP) {
1382 dout(0) << __func__ << " starting shutdown" << dendl;
1383 set_state(STOPPING);
1384 is_stopping_cond.notify_all();
1385 } else {
1386 dout(10) << __func__ << " ignoring msg" << dendl;
1387 }
1388 }
1389
// Build an MOSDMap message carrying the maps in (since, to].  Prefers
// incremental maps, substitutes a full map when an incremental is
// missing, and stops once osd_map_message_max / osd_map_message_max_bytes
// are exhausted — the limits are checked after each add, so at least one
// map is always included.  If a required map cannot be loaded we fall to
// the 'panic' path: send whatever was collected so far, or at minimum the
// newest map so the peer can still make progress.
MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
                                               OSDSuperblock& sblock)
{
  MOSDMap *m = new MOSDMap(monc->get_fsid(),
                           osdmap->get_encoding_features());
  m->cluster_osdmap_trim_lower_bound = sblock.cluster_osdmap_trim_lower_bound;
  m->newest_map = sblock.newest_map;

  int max = cct->_conf->osd_map_message_max;
  ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;

  if (since < m->cluster_osdmap_trim_lower_bound) {
    // we don't have the next map the target wants, so start with a
    // full map.
    bufferlist bl;
    dout(10) << __func__ << " cluster osdmap lower bound "
             << sblock.cluster_osdmap_trim_lower_bound
             << " > since " << since << ", starting with full map"
             << dendl;
    since = m->cluster_osdmap_trim_lower_bound;
    if (!get_map_bl(since, bl)) {
      derr << __func__ << " missing full map " << since << dendl;
      goto panic;
    }
    // account the budget before bl is moved away
    max--;
    max_bytes -= bl.length();
    m->maps[since] = std::move(bl);
  }
  for (epoch_t e = since + 1; e <= to; ++e) {
    bufferlist bl;
    if (get_inc_map_bl(e, bl)) {
      m->incremental_maps[e] = std::move(bl);
    } else {
      // incremental missing; fall back to the full map for this epoch
      dout(10) << __func__ << " missing incremental map " << e << dendl;
      if (!get_map_bl(e, bl)) {
        derr << __func__ << " also missing full map " << e << dendl;
        goto panic;
      }
      m->maps[e] = std::move(bl);
    }
    // note: bl was moved from, but its length was already folded in below
    max--;
    max_bytes -= bl.length();
    if (max <= 0 || max_bytes <= 0) {
      break;
    }
  }
  return m;

 panic:
  if (!m->maps.empty() ||
      !m->incremental_maps.empty()) {
    // send what we have so far
    return m;
  }
  // send something
  bufferlist bl;
  if (get_inc_map_bl(m->newest_map, bl)) {
    m->incremental_maps[m->newest_map] = std::move(bl);
  } else {
    derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
    if (!get_map_bl(m->newest_map, bl)) {
      derr << __func__ << " unable to load latest full map " << m->newest_map
           << dendl;
      ceph_abort();
    }
    m->maps[m->newest_map] = std::move(bl);
  }
  return m;
}
1459
// Deliver an already-built MOSDMap on the given connection.
void OSDService::send_map(MOSDMap *m, Connection *con)
{
  con->send_message(m);
}
1464
1465 void OSDService::send_incremental_map(epoch_t since, Connection *con,
1466 const OSDMapRef& osdmap)
1467 {
1468 epoch_t to = osdmap->get_epoch();
1469 dout(10) << "send_incremental_map " << since << " -> " << to
1470 << " to " << con << " " << con->get_peer_addr() << dendl;
1471
1472 MOSDMap *m = NULL;
1473 while (!m) {
1474 OSDSuperblock sblock(get_superblock());
1475 if (since < sblock.oldest_map) {
1476 // just send latest full map
1477 MOSDMap *m = new MOSDMap(monc->get_fsid(),
1478 osdmap->get_encoding_features());
1479 m->cluster_osdmap_trim_lower_bound = sblock.cluster_osdmap_trim_lower_bound;
1480 m->newest_map = sblock.newest_map;
1481 get_map_bl(to, m->maps[to]);
1482 send_map(m, con);
1483 return;
1484 }
1485
1486 if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
1487 dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
1488 << ", only sending most recent" << dendl;
1489 since = to - cct->_conf->osd_map_share_max_epochs;
1490 }
1491
1492 m = build_incremental_map_msg(since, to, sblock);
1493 }
1494 send_map(m, con);
1495 }
1496
// Load the encoded full OSDMap for epoch 'e' into 'bl', consulting the
// in-memory cache first and the object store on a miss (populating the
// cache on success).  NOTE(review): unlike get_inc_map_bl() this takes no
// lock — the leading underscore suggests the caller must hold
// map_cache_lock (try_get_map() does); confirm before adding new callers.
bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
{
  bool found = map_bl_cache.lookup(e, &bl);
  if (found) {
    logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
                      OSD::get_osdmap_pobject_name(e), 0, 0, bl,
                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_bl(e, bl);
  }
  return found;
}
1513
1514 bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
1515 {
1516 std::lock_guard l(map_cache_lock);
1517 bool found = map_bl_inc_cache.lookup(e, &bl);
1518 if (found) {
1519 logger->inc(l_osd_map_bl_cache_hit);
1520 return true;
1521 }
1522 logger->inc(l_osd_map_bl_cache_miss);
1523 found = store->read(meta_ch,
1524 OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
1525 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1526 if (found) {
1527 _add_map_inc_bl(e, bl);
1528 }
1529 return found;
1530 }
1531
// Insert an encoded full map into the bufferlist cache, first rebuilding
// it into one contiguous buffer and moving it to the osd_mapbl mempool.
void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_cache.add(e, bl);
}
1542
// Same as _add_map_bl(), but for the incremental-map bufferlist cache.
void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_inc_cache.add(e, bl);
}
1553
1554 OSDMapRef OSDService::_add_map(OSDMap *o)
1555 {
1556 epoch_t e = o->get_epoch();
1557
1558 if (cct->_conf->osd_map_dedup) {
1559 // Dedup against an existing map at a nearby epoch
1560 OSDMapRef for_dedup = map_cache.lower_bound(e);
1561 if (for_dedup) {
1562 OSDMap::dedup(for_dedup.get(), o);
1563 }
1564 }
1565 bool existed;
1566 OSDMapRef l = map_cache.add(e, o, &existed);
1567 if (existed) {
1568 delete o;
1569 }
1570 return l;
1571 }
1572
// Return the OSDMap for 'epoch', consulting the in-memory map cache first
// and decoding from the store on a miss.  Returns a null ref if the map
// cannot be loaded.  epoch 0 yields a fresh, empty OSDMap.  Ownership of
// the raw OSDMap is transferred to the cache via _add_map().
OSDMapRef OSDService::try_get_map(epoch_t epoch)
{
  std::lock_guard l(map_cache_lock);
  OSDMapRef retval = map_cache.lookup(epoch);
  if (retval) {
    dout(30) << "get_map " << epoch << " -cached" << dendl;
    logger->inc(l_osd_map_cache_hit);
    return retval;
  }
  {
    logger->inc(l_osd_map_cache_miss);
    // track how far below the cached window the miss landed
    epoch_t lb = map_cache.cached_key_lower_bound();
    if (epoch < lb) {
      dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
      logger->inc(l_osd_map_cache_miss_low);
      logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
    }
  }

  OSDMap *map = new OSDMap;
  if (epoch > 0) {
    dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
    bufferlist bl;
    if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
      derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
      delete map;  // not handed to the cache; free it here
      return OSDMapRef();
    }
    map->decode(bl);
  } else {
    dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
  }
  return _add_map(map);
}
1607
1608 // ops
1609
1610
// Convenience overload: error reply with default versions and no
// per-op return values.
void OSDService::reply_op_error(OpRequestRef op, int err)
{
  reply_op_error(op, err, eversion_t(), 0, {});
}
1615
1616 void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
1617 version_t uv,
1618 vector<pg_log_op_return_item_t> op_returns)
1619 {
1620 auto m = op->get_req<MOSDOp>();
1621 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
1622 int flags;
1623 flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
1624
1625 MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
1626 !m->has_flag(CEPH_OSD_FLAG_RETURNVEC));
1627 reply->set_reply_versions(v, uv);
1628 reply->set_op_returns(op_returns);
1629 m->get_connection()->send_message(reply);
1630 }
1631
// Debug aid (gated on osd_debug_misdirected_ops): log and clog-warn about
// an op that arrived at a PG we are not primary for.  For EC pools the op
// may only *look* misdirected because of the map race documented below,
// in which case it is silently dropped.
void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
{
  if (!cct->_conf->osd_debug_misdirected_ops) {
    return;
  }

  auto m = op->get_req<MOSDOp>();
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);

  ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);

  if (pg->is_ec_pg()) {
    /**
     * OSD recomputes op target based on current OSDMap. With an EC pg, we
     * can get this result:
     * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
     *    [CRUSH_ITEM_NONE, 2, 3]/3
     * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
     *    [3, 2, 3]/3
     * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
     *    -- misdirected op
     * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
     *    it and fulfils it
     *
     * We can't compute the op target based on the sending map epoch due to
     * splitting.  The simplest thing is to detect such cases here and drop
     * them without an error (the client will resend anyway).
     */
    ceph_assert(m->get_map_epoch() <= superblock.newest_map);
    OSDMapRef opmap = try_get_map(m->get_map_epoch());
    if (!opmap) {
      dout(7) << __func__ << ": " << *pg << " no longer have map for "
              << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
    pg_t _pgid = m->get_raw_pg();
    spg_t pgid;
    if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
      _pgid = opmap->raw_pg_to_pg(_pgid);
    if (opmap->get_primary_shard(_pgid, &pgid) &&
        pgid.shard != pg->pg_id.shard) {
      // the shard moved between the client's epoch and ours: benign race
      dout(7) << __func__ << ": " << *pg << " primary changed since "
              << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
  }

  dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
  clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
               << " pg " << m->get_raw_pg()
               << " to osd." << whoami
               << " not " << pg->get_acting()
               << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
}
1686
// Push a work item onto the back of the sharded op queue.
void OSDService::enqueue_back(OpSchedulerItem&& qi)
{
  osd->op_shardedwq.queue(std::move(qi));
}

// Push a work item onto the front of the sharded op queue (requeues).
void OSDService::enqueue_front(OpSchedulerItem&& qi)
{
  osd->op_shardedwq.queue_front(std::move(qi));
}
1696
// Queue a recovery continuation 'c' for 'pg' through the op scheduler.
// The queue cost depends on the active scheduler: mClock gets the
// caller-supplied per-item cost, the legacy queue keeps the historical
// flat osd_recovery_cost (see comment below).
void OSDService::queue_recovery_context(
  PG *pg,
  GenContext<ThreadPool::TPHandle&> *c,
  uint64_t cost,
  int priority)
{
  epoch_t e = get_osdmap_epoch();

  uint64_t cost_for_queue = [this, cost] {
    if (cct->_conf->osd_op_queue == "mclock_scheduler") {
      return cost;
    } else {
      /* We retain this legacy behavior for WeightedPriorityQueue. It seems to
       * require very large costs for several messages in order to do any
       * meaningful amount of throttling. This branch should be removed after
       * Reef.
       */
      return cct->_conf->osd_recovery_cost;
    }
  }();

  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
        new PGRecoveryContext(pg->get_pgid(), c, e, priority)),
      cost_for_queue,
      cct->_conf->osd_recovery_priority,
      ceph_clock_now(),
      0,
      e));
}
1728
// Queue a snap-trim work item for 'pg' with the configured snap-trim
// cost and priority.
void OSDService::queue_for_snap_trim(PG *pg)
{
  dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
        new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
      cct->_conf->osd_snap_trim_cost,
      cct->_conf->osd_snap_trim_priority,
      ceph_clock_now(),
      0,
      pg->get_osdmap_epoch()));
}
1742
// Wrap a scrub state-machine event message of type MSG_TYPE (carrying an
// activation token) in an OpSchedulerItem and queue it for this PG at the
// requested queue priority.
template <class MSG_TYPE>
void OSDService::queue_scrub_event_msg(PG* pg,
                                       Scrub::scrub_prio_t with_priority,
                                       unsigned int qu_priority,
                                       Scrub::act_token_t act_token)
{
  const auto epoch = pg->get_osdmap_epoch();
  auto msg = new MSG_TYPE(pg->get_pgid(), epoch, act_token);
  dout(15) << "queue a scrub event (" << *msg << ") for " << *pg
           << ". Epoch: " << epoch << " token: " << act_token << dendl;
  enqueue_back(OpSchedulerItem(
    unique_ptr<OpSchedulerItem::OpQueueable>(msg), get_scrub_cost(),
    pg->scrub_requeue_priority(with_priority, qu_priority), ceph_clock_now(), 0, epoch));
}
1757
// Token-less variant: wrap a scrub event message of type MSG_TYPE in an
// OpSchedulerItem and queue it for this PG.
template <class MSG_TYPE>
void OSDService::queue_scrub_event_msg(PG* pg,
                                       Scrub::scrub_prio_t with_priority)
{
  const auto epoch = pg->get_osdmap_epoch();
  auto msg = new MSG_TYPE(pg->get_pgid(), epoch);
  dout(15) << "queue a scrub event (" << *msg << ") for " << *pg << ". Epoch: " << epoch << dendl;
  enqueue_back(OpSchedulerItem(
    unique_ptr<OpSchedulerItem::OpQueueable>(msg), get_scrub_cost(),
    pg->scrub_requeue_priority(with_priority), ceph_clock_now(), 0, epoch));
}
1769
1770 int64_t OSDService::get_scrub_cost()
1771 {
1772
1773 int64_t cost_for_queue = cct->_conf->osd_scrub_cost;
1774 if (cct->_conf->osd_op_queue == "mclock_scheduler") {
1775 cost_for_queue = cct->_conf->osd_scrub_event_cost *
1776 cct->_conf->osd_shallow_scrub_chunk_max;
1777 }
1778 return cost_for_queue;
1779 }
1780
// --- thin wrappers: each queues one specific scrub state-machine event
//     message for the PG via queue_scrub_event_msg<>() ---

// Queue a PGScrub event message for this PG.
void OSDService::queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority)
{
  queue_scrub_event_msg<PGScrub>(pg, with_priority);
}

// Queue a PGScrubAfterRepair event message.
void OSDService::queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority)
{
  queue_scrub_event_msg<PGScrubAfterRepair>(pg, with_priority);
}

// Queue a replica-side scrub request (PGRepScrub), carrying an
// activation token.
void OSDService::queue_for_rep_scrub(PG* pg,
                                     Scrub::scrub_prio_t with_priority,
                                     unsigned int qu_priority,
                                     Scrub::act_token_t act_token)
{
  queue_scrub_event_msg<PGRepScrub>(pg, with_priority, qu_priority, act_token);
}

void OSDService::queue_for_rep_scrub_resched(PG* pg,
                                             Scrub::scrub_prio_t with_priority,
                                             unsigned int qu_priority,
                                             Scrub::act_token_t act_token)
{
  // Resulting scrub event: 'SchedReplica'
  queue_scrub_event_msg<PGRepScrubResched>(pg, with_priority, qu_priority,
                                           act_token);
}

void OSDService::queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'RemotesReserved'
  queue_scrub_event_msg<PGScrubResourcesOK>(pg, with_priority);
}

void OSDService::queue_for_scrub_denied(PG* pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'ReservationFailure'
  queue_scrub_event_msg<PGScrubDenied>(pg, with_priority);
}

void OSDService::queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'InternalSchedScrub'
  queue_scrub_event_msg<PGScrubResched>(pg, with_priority);
}

void OSDService::queue_scrub_pushes_update(PG* pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'ActivePushesUpd'
  queue_scrub_event_msg<PGScrubPushesUpdate>(pg, with_priority);
}

void OSDService::queue_scrub_chunk_free(PG* pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'SelectedChunkFree'
  queue_scrub_event_msg<PGScrubChunkIsFree>(pg, with_priority);
}

void OSDService::queue_scrub_chunk_busy(PG* pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'ChunkIsBusy'
  queue_scrub_event_msg<PGScrubChunkIsBusy>(pg, with_priority);
}

// Queue a PGScrubAppliedUpdate event message.
void OSDService::queue_scrub_applied_update(PG* pg, Scrub::scrub_prio_t with_priority)
{
  queue_scrub_event_msg<PGScrubAppliedUpdate>(pg, with_priority);
}

void OSDService::queue_scrub_unblocking(PG* pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'Unblocked'
  queue_scrub_event_msg<PGScrubUnblocked>(pg, with_priority);
}

void OSDService::queue_scrub_digest_update(PG* pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'DigestUpdate'
  queue_scrub_event_msg<PGScrubDigestUpdate>(pg, with_priority);
}

void OSDService::queue_scrub_got_local_map(PG* pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'IntLocalMapDone'
  queue_scrub_event_msg<PGScrubGotLocalMap>(pg, with_priority);
}

void OSDService::queue_scrub_got_repl_maps(PG* pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'GotReplicas'
  queue_scrub_event_msg<PGScrubGotReplMaps>(pg, with_priority);
}

void OSDService::queue_scrub_replica_pushes(PG *pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'ReplicaPushesUpd'
  queue_scrub_event_msg<PGScrubReplicaPushes>(pg, with_priority);
}

// Always queued at high priority: the scrub is wrapping up.
void OSDService::queue_scrub_is_finished(PG *pg)
{
  // Resulting scrub event: 'ScrubFinished'
  queue_scrub_event_msg<PGScrubScrubFinished>(pg, Scrub::scrub_prio_t::high_priority);
}

void OSDService::queue_scrub_next_chunk(PG *pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'NextChunk'
  queue_scrub_event_msg<PGScrubGetNextChunk>(pg, with_priority);
}
1891
// Queue a PG-deletion work item for 'pgid' at epoch 'e' with the
// configured delete cost and priority.
void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
{
  dout(10) << __func__ << " on " << pgid << " e " << e << dendl;
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
        new PGDelete(pgid, e)),
      cct->_conf->osd_pg_delete_cost,
      cct->_conf->osd_pg_delete_priority,
      ceph_clock_now(),
      0,
      e));
}
1905
// Delegate PG-deletion finalization to the owning OSD instance.
bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
{
  return osd->try_finish_pg_delete(pg, old_pg_num);
}
1910
1911 // ---
1912
1913 void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
1914 {
1915 std::lock_guard l(merge_lock);
1916 dout(10) << __func__ << " " << pg->pg_id << dendl;
1917 ready_to_merge_source[pg->pg_id.pgid] = version;
1918 assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
1919 _send_ready_to_merge();
1920 }
1921
1922 void OSDService::set_ready_to_merge_target(PG *pg,
1923 eversion_t version,
1924 epoch_t last_epoch_started,
1925 epoch_t last_epoch_clean)
1926 {
1927 std::lock_guard l(merge_lock);
1928 dout(10) << __func__ << " " << pg->pg_id << dendl;
1929 ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
1930 make_tuple(version,
1931 last_epoch_started,
1932 last_epoch_clean)));
1933 assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
1934 _send_ready_to_merge();
1935 }
1936
1937 void OSDService::set_not_ready_to_merge_source(pg_t source)
1938 {
1939 std::lock_guard l(merge_lock);
1940 dout(10) << __func__ << " " << source << dendl;
1941 not_ready_to_merge_source.insert(source);
1942 assert(ready_to_merge_source.count(source) == 0);
1943 _send_ready_to_merge();
1944 }
1945
1946 void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
1947 {
1948 std::lock_guard l(merge_lock);
1949 dout(10) << __func__ << " " << target << " source " << source << dendl;
1950 not_ready_to_merge_target[target] = source;
1951 assert(ready_to_merge_target.count(target) == 0);
1952 _send_ready_to_merge();
1953 }
1954
// Public entry point: take the merge lock and flush any pending
// ready/not-ready merge notifications to the monitor.
void OSDService::send_ready_to_merge()
{
  std::lock_guard l(merge_lock);
  _send_ready_to_merge();
}
1960
1961 void OSDService::_send_ready_to_merge()
1962 {
1963 dout(20) << __func__
1964 << " ready_to_merge_source " << ready_to_merge_source
1965 << " not_ready_to_merge_source " << not_ready_to_merge_source
1966 << " ready_to_merge_target " << ready_to_merge_target
1967 << " not_ready_to_merge_target " << not_ready_to_merge_target
1968 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1969 << dendl;
1970 for (auto src : not_ready_to_merge_source) {
1971 if (sent_ready_to_merge_source.count(src) == 0) {
1972 monc->send_mon_message(new MOSDPGReadyToMerge(
1973 src,
1974 {}, {}, 0, 0,
1975 false,
1976 osdmap->get_epoch()));
1977 sent_ready_to_merge_source.insert(src);
1978 }
1979 }
1980 for (auto p : not_ready_to_merge_target) {
1981 if (sent_ready_to_merge_source.count(p.second) == 0) {
1982 monc->send_mon_message(new MOSDPGReadyToMerge(
1983 p.second,
1984 {}, {}, 0, 0,
1985 false,
1986 osdmap->get_epoch()));
1987 sent_ready_to_merge_source.insert(p.second);
1988 }
1989 }
1990 for (auto src : ready_to_merge_source) {
1991 if (not_ready_to_merge_source.count(src.first) ||
1992 not_ready_to_merge_target.count(src.first.get_parent())) {
1993 continue;
1994 }
1995 auto p = ready_to_merge_target.find(src.first.get_parent());
1996 if (p != ready_to_merge_target.end() &&
1997 sent_ready_to_merge_source.count(src.first) == 0) {
1998 monc->send_mon_message(new MOSDPGReadyToMerge(
1999 src.first, // source pgid
2000 src.second, // src version
2001 std::get<0>(p->second), // target version
2002 std::get<1>(p->second), // PG's last_epoch_started
2003 std::get<2>(p->second), // PG's last_epoch_clean
2004 true,
2005 osdmap->get_epoch()));
2006 sent_ready_to_merge_source.insert(src.first);
2007 }
2008 }
2009 }
2010
2011 void OSDService::clear_ready_to_merge(PG *pg)
2012 {
2013 std::lock_guard l(merge_lock);
2014 dout(10) << __func__ << " " << pg->pg_id << dendl;
2015 ready_to_merge_source.erase(pg->pg_id.pgid);
2016 ready_to_merge_target.erase(pg->pg_id.pgid);
2017 not_ready_to_merge_source.erase(pg->pg_id.pgid);
2018 not_ready_to_merge_target.erase(pg->pg_id.pgid);
2019 sent_ready_to_merge_source.erase(pg->pg_id.pgid);
2020 }
2021
2022 void OSDService::clear_sent_ready_to_merge()
2023 {
2024 std::lock_guard l(merge_lock);
2025 sent_ready_to_merge_source.clear();
2026 }
2027
2028 void OSDService::prune_sent_ready_to_merge(const OSDMapRef& osdmap)
2029 {
2030 std::lock_guard l(merge_lock);
2031 auto i = sent_ready_to_merge_source.begin();
2032 while (i != sent_ready_to_merge_source.end()) {
2033 if (!osdmap->pg_exists(*i)) {
2034 dout(10) << __func__ << " " << *i << dendl;
2035 i = sent_ready_to_merge_source.erase(i);
2036 } else {
2037 dout(20) << __func__ << " exist " << *i << dendl;
2038 ++i;
2039 }
2040 }
2041 }
2042
2043 // ---
2044
2045 void OSDService::_queue_for_recovery(
2046 pg_awaiting_throttle_t p,
2047 uint64_t reserved_pushes)
2048 {
2049 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
2050
2051 uint64_t cost_for_queue = [this, &reserved_pushes, &p] {
2052 if (cct->_conf->osd_op_queue == "mclock_scheduler") {
2053 return p.cost_per_object * reserved_pushes;
2054 } else {
2055 /* We retain this legacy behavior for WeightedPriorityQueue. It seems to
2056 * require very large costs for several messages in order to do any
2057 * meaningful amount of throttling. This branch should be removed after
2058 * Reef.
2059 */
2060 return cct->_conf->osd_recovery_cost;
2061 }
2062 }();
2063
2064 enqueue_back(
2065 OpSchedulerItem(
2066 unique_ptr<OpSchedulerItem::OpQueueable>(
2067 new PGRecovery(
2068 p.pg->get_pgid(),
2069 p.epoch_queued,
2070 reserved_pushes,
2071 p.priority)),
2072 cost_for_queue,
2073 cct->_conf->osd_recovery_priority,
2074 ceph_clock_now(),
2075 0,
2076 p.epoch_queued));
2077 }
2078
2079 // ====================================================================
2080 // OSD
2081
2082 #undef dout_prefix
2083 #define dout_prefix *_dout
2084
// Commands shared between OSD's console and admin console:
namespace ceph::osd_cmds {

// Handler for the tcmalloc 'heap' admin command family; implemented
// elsewhere and forward-declared here so asok_command() can dispatch
// to it.  Returns 0 on success or a negative errno; human-readable
// output goes to 'outos', error text to 'erros'.
int heap(CephContext& cct,
         const cmdmap_t& cmdmap,
         std::ostream& outos,
         std::ostream& erros);

} // namespace ceph::osd_cmds
2094
2095 int OSD::mkfs(CephContext *cct,
2096 std::unique_ptr<ObjectStore> store,
2097 uuid_d fsid,
2098 int whoami,
2099 string osdspec_affinity)
2100 {
2101 int ret;
2102
2103 OSDSuperblock sb;
2104 bufferlist sbbl;
2105 // if we are fed a uuid for this osd, use it.
2106 store->set_fsid(cct->_conf->osd_uuid);
2107
2108 ret = store->mkfs();
2109 if (ret) {
2110 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
2111 << cpp_strerror(ret) << dendl;
2112 return ret;
2113 }
2114
2115 store->set_cache_shards(1); // doesn't matter for mkfs!
2116
2117 ret = store->mount();
2118 if (ret) {
2119 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
2120 << cpp_strerror(ret) << dendl;
2121 return ret;
2122 }
2123
2124 auto umount_store = make_scope_guard([&] {
2125 store->umount();
2126 });
2127
2128 ObjectStore::CollectionHandle ch =
2129 store->open_collection(coll_t::meta());
2130 if (ch) {
2131 ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
2132 if (ret < 0) {
2133 derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
2134 return ret;
2135 }
2136 /* if we already have superblock, check content of superblock */
2137 dout(0) << " have superblock" << dendl;
2138 auto p = sbbl.cbegin();
2139 decode(sb, p);
2140 if (whoami != sb.whoami) {
2141 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
2142 << dendl;
2143 return -EINVAL;
2144 }
2145 if (fsid != sb.cluster_fsid) {
2146 derr << "provided cluster fsid " << fsid
2147 << " != superblock's " << sb.cluster_fsid << dendl;
2148 return -EINVAL;
2149 }
2150 } else {
2151 // create superblock
2152 sb.cluster_fsid = fsid;
2153 sb.osd_fsid = store->get_fsid();
2154 sb.whoami = whoami;
2155 sb.compat_features = get_osd_initial_compat_set();
2156
2157 bufferlist bl;
2158 encode(sb, bl);
2159
2160 ObjectStore::CollectionHandle ch = store->create_new_collection(
2161 coll_t::meta());
2162 ObjectStore::Transaction t;
2163 t.create_collection(coll_t::meta(), 0);
2164 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
2165 ret = store->queue_transaction(ch, std::move(t));
2166 if (ret) {
2167 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
2168 << "queue_transaction returned " << cpp_strerror(ret) << dendl;
2169 return ret;
2170 }
2171 ch->flush();
2172 }
2173
2174 ret = write_meta(cct, store.get(), sb.cluster_fsid, sb.osd_fsid, whoami, osdspec_affinity);
2175 if (ret) {
2176 derr << "OSD::mkfs: failed to write fsid file: error "
2177 << cpp_strerror(ret) << dendl;
2178 }
2179 return ret;
2180 }
2181
2182 int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami, string& osdspec_affinity)
2183 {
2184 char val[80];
2185 int r;
2186
2187 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
2188 r = store->write_meta("magic", val);
2189 if (r < 0)
2190 return r;
2191
2192 snprintf(val, sizeof(val), "%d", whoami);
2193 r = store->write_meta("whoami", val);
2194 if (r < 0)
2195 return r;
2196
2197 cluster_fsid.print(val);
2198 r = store->write_meta("ceph_fsid", val);
2199 if (r < 0)
2200 return r;
2201
2202 string key = cct->_conf.get_val<string>("key");
2203 if (key.size()) {
2204 r = store->write_meta("osd_key", key);
2205 if (r < 0)
2206 return r;
2207 } else {
2208 string keyfile = cct->_conf.get_val<string>("keyfile");
2209 if (!keyfile.empty()) {
2210 bufferlist keybl;
2211 string err;
2212 r = keybl.read_file(keyfile.c_str(), &err);
2213 if (r < 0) {
2214 derr << __func__ << " failed to read keyfile " << keyfile << ": "
2215 << err << ": " << cpp_strerror(r) << dendl;
2216 return r;
2217 }
2218 r = store->write_meta("osd_key", keybl.to_str());
2219 if (r < 0)
2220 return r;
2221 }
2222 }
2223 if (!osdspec_affinity.empty()) {
2224 r = store->write_meta("osdspec_affinity", osdspec_affinity.c_str());
2225 if (r < 0)
2226 return r;
2227 }
2228
2229 r = store->write_meta("ceph_version_when_created", pretty_version_to_str());
2230 if (r < 0)
2231 return r;
2232
2233 ostringstream created_at;
2234 utime_t now = ceph_clock_now();
2235 now.gmtime(created_at);
2236 r = store->write_meta("created_at", created_at.str());
2237 if (r < 0)
2238 return r;
2239
2240 r = store->write_meta("ready", "ready");
2241 if (r < 0)
2242 return r;
2243
2244 return 0;
2245 }
2246
2247 int OSD::peek_meta(ObjectStore *store,
2248 std::string *magic,
2249 uuid_d *cluster_fsid,
2250 uuid_d *osd_fsid,
2251 int *whoami,
2252 ceph_release_t *require_osd_release)
2253 {
2254 string val;
2255
2256 int r = store->read_meta("magic", &val);
2257 if (r < 0)
2258 return r;
2259 *magic = val;
2260
2261 r = store->read_meta("whoami", &val);
2262 if (r < 0)
2263 return r;
2264 *whoami = atoi(val.c_str());
2265
2266 r = store->read_meta("ceph_fsid", &val);
2267 if (r < 0)
2268 return r;
2269 r = cluster_fsid->parse(val.c_str());
2270 if (!r)
2271 return -EINVAL;
2272
2273 r = store->read_meta("fsid", &val);
2274 if (r < 0) {
2275 *osd_fsid = uuid_d();
2276 } else {
2277 r = osd_fsid->parse(val.c_str());
2278 if (!r)
2279 return -EINVAL;
2280 }
2281
2282 r = store->read_meta("require_osd_release", &val);
2283 if (r >= 0) {
2284 *require_osd_release = ceph_release_from_name(val);
2285 }
2286
2287 return 0;
2288 }
2289
2290
2291 #undef dout_prefix
2292 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2293
2294 // cons/des
2295
// OSD constructor: wires together the messengers, mon/mgr clients,
// perf counters, op tracker, work queues and per-shard state.  Heavy
// startup work (mounting the store, booting) happens later in init();
// this only establishes object relationships and configuration.
// NOTE: member initializer order below must match the declaration
// order in OSD.h.
OSD::OSD(CephContext *cct_,
	 std::unique_ptr<ObjectStore> store_,
	 int id,
	 Messenger *internal_messenger,
	 Messenger *external_messenger,
	 Messenger *hb_client_front,
	 Messenger *hb_client_back,
	 Messenger *hb_front_serverm,
	 Messenger *hb_back_serverm,
	 Messenger *osdc_messenger,
	 MonClient *mc,
	 const std::string &dev, const std::string &jdev,
	 ceph::async::io_context_pool& poolctx) :
  Dispatcher(cct_),
  tick_timer(cct, osd_lock),
  tick_timer_without_osd_lock(cct, tick_timer_lock),
  gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
  cluster_messenger(internal_messenger),
  client_messenger(external_messenger),
  objecter_messenger(osdc_messenger),
  monc(mc),
  mgrc(cct_, client_messenger, &mc->monmap),
  logger(create_logger()),
  recoverystate_perf(create_recoverystate_perf()),
  store(std::move(store_)),
  log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
  clog(log_client.create_channel()),
  whoami(id),
  dev_path(dev), journal_path(jdev),
  store_is_rotational(store->is_rotational()),
  trace_endpoint("0.0.0.0", 0, "osd"),
  asok_hook(NULL),
  m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
				  "osd_pg_epoch_max_lag_factor")),
  osd_compat(get_osd_compat_set()),
  osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
	    get_num_op_threads()),
  heartbeat_stop(false),
  heartbeat_need_update(true),
  hb_front_client_messenger(hb_client_front),
  hb_back_client_messenger(hb_client_back),
  hb_front_server_messenger(hb_front_serverm),
  hb_back_server_messenger(hb_back_serverm),
  daily_loadavg(0.0),
  heartbeat_thread(this),
  heartbeat_dispatcher(this),
  op_tracker(cct, cct->_conf->osd_enable_op_tracker,
	     cct->_conf->osd_num_op_tracker_shard),
  test_ops_hook(NULL),
  op_shardedwq(
    this,
    ceph::make_timespan(cct->_conf->osd_op_thread_timeout),
    ceph::make_timespan(cct->_conf->osd_op_thread_suicide_timeout),
    &osd_op_tp),
  last_pg_create_epoch(0),
  boot_finisher(cct),
  up_thru_wanted(0),
  requested_full_first(0),
  requested_full_last(0),
  service(this, poolctx)
{

  if (!gss_ktfile_client.empty()) {
    // Assert we can export environment variable
    /*
     The default client keytab is used, if it is present and readable,
     to automatically obtain initial credentials for GSSAPI client
     applications. The principal name of the first entry in the client
     keytab is used by default when obtaining initial credentials.
     1. The KRB5_CLIENT_KTNAME environment variable.
     2. The default_client_keytab_name profile variable in [libdefaults].
     3. The hardcoded default, DEFCKTNAME.
    */
    const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
                                    gss_ktfile_client.c_str(), 1));
    ceph_assert(set_result == 0);
  }

  monc->set_messenger(client_messenger);
  // configure op tracking thresholds/history from current conf values
  op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
                                         cct->_conf->osd_op_log_threshold);
  op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
                                           cct->_conf->osd_op_history_duration);
  op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
                                                    cct->_conf->osd_op_history_slow_op_threshold);
  ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
#ifdef WITH_BLKIN
  // name the blkin trace endpoint after this OSD's id
  std::stringstream ss;
  ss << "osd." << whoami;
  trace_endpoint.copy_name(ss.str());
#endif

  // initialize shards
  num_shards = get_num_op_shards();
  for (uint32_t i = 0; i < num_shards; i++) {
    OSDShard *one_shard = new OSDShard(
      i,
      cct,
      this);
    shards.push_back(one_shard);
  }
}
2398
2399 OSD::~OSD()
2400 {
2401 while (!shards.empty()) {
2402 delete shards.back();
2403 shards.pop_back();
2404 }
2405 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2406 cct->get_perfcounters_collection()->remove(logger);
2407 delete recoverystate_perf;
2408 delete logger;
2409 }
2410
2411 double OSD::get_tick_interval() const
2412 {
2413 // vary +/- 5% to avoid scrub scheduling livelocks
2414 constexpr auto delta = 0.05;
2415 return (OSD_TICK_INTERVAL *
2416 ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
2417 }
2418
// SIGINT/SIGTERM handler: log the signal and begin an orderly shutdown.
// Only these two signals are expected to be routed here.
void OSD::handle_signal(int signum)
{
  ceph_assert(signum == SIGINT || signum == SIGTERM);
  derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
  shutdown();
}
2425
2426 int OSD::pre_init()
2427 {
2428 std::lock_guard lock(osd_lock);
2429 if (is_stopping())
2430 return 0;
2431
2432 if (store->test_mount_in_use()) {
2433 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2434 << "currently in use. (Is ceph-osd already running?)" << dendl;
2435 return -EBUSY;
2436 }
2437
2438 cct->_conf.add_observer(this);
2439 return 0;
2440 }
2441
// Determine this OSD's NUMA node and, when storage plus both network
// interfaces agree (and osd_numa_auto_affinity is set) or osd_numa_node
// is configured explicitly, pin all threads to that node's CPUs.
// Always returns 0; failures merely leave numa_node at -1 (no pinning).
int OSD::set_numa_affinity()
{
  // storage numa node
  int store_node = -1;
  store->get_numa_node(&store_node, nullptr, nullptr);
  if (store_node >= 0) {
    dout(1) << __func__ << " storage numa node " << store_node << dendl;
  }

  // check network numa node(s)
  int front_node = -1, back_node = -1;
  string front_iface = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  string back_iface = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
  int r = get_iface_numa_node(front_iface, &front_node);
  if (r >= 0 && front_node >= 0) {
    dout(1) << __func__ << " public network " << front_iface << " numa node "
            << front_node << dendl;
    r = get_iface_numa_node(back_iface, &back_node);
    if (r >= 0 && back_node >= 0) {
      dout(1) << __func__ << " cluster network " << back_iface << " numa node "
	      << back_node << dendl;
      // auto-affinity only when storage and both networks agree
      if (front_node == back_node &&
	  front_node == store_node) {
	dout(1) << " objectstore and network numa nodes all match" << dendl;
	if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
	  numa_node = front_node;
	}
      } else if (front_node != back_node) {
	dout(1) << __func__ << " public and cluster network numa nodes do not match"
		<< dendl;
      } else {
	dout(1) << __func__ << " objectstore and network numa nodes do not match"
		<< dendl;
      }
    } else if (back_node == -2) {
      // -2 appears to mean the interface's ports span multiple numa
      // nodes (see get_iface_numa_node) -- TODO confirm
      dout(1) << __func__ << " cluster network " << back_iface
	      << " ports numa nodes do not match" << dendl;
    } else {
      derr << __func__ << " unable to identify cluster interface '" << back_iface
           << "' numa node: " << cpp_strerror(r) << dendl;
    }
  } else if (front_node == -2) {
    dout(1) << __func__ << " public network " << front_iface
	    << " ports numa nodes do not match" << dendl;
  } else {
    derr << __func__ << " unable to identify public interface '" << front_iface
	 << "' numa node: " << cpp_strerror(r) << dendl;
  }
  if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
    // this takes precedence over the automagic logic above
    numa_node = node;
  }
  if (numa_node >= 0) {
    int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
    if (r < 0) {
      dout(1) << __func__ << " unable to determine numa node " << numa_node
	      << " CPUs" << dendl;
      numa_node = -1;
    } else {
      dout(1) << __func__ << " setting numa affinity to node " << numa_node
	      << " cpus "
	      << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
	      << dendl;
      r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set);
      if (r < 0) {
	// NOTE(review): this overwrites the callee's return value with
	// -errno, which may be stale if the callee already returned a
	// negative errno -- confirm set_cpu_affinity_all_threads' contract
	r = -errno;
	derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
	     << dendl;
	numa_node = -1;
      }
    }
  } else {
    dout(1) << __func__ << " not setting numa affinity" << dendl;
  }
  return 0;
}
2522
2523 // asok
2524
// Admin-socket ("asok") hook: bridges commands received on the admin
// socket to OSD::asok_command().  Only the asynchronous entry point is
// supported; the synchronous one aborts by design.
class OSDSocketHook : public AdminSocketHook {
  OSD *osd;  // back-pointer to the owning OSD; not owned by this hook
public:
  explicit OSDSocketHook(OSD *o) : osd(o) {}
  // Synchronous path is intentionally unreachable; all OSD admin
  // commands must go through call_async() below.
  int call(std::string_view prefix, const cmdmap_t& cmdmap,
	   const bufferlist& inbl,
	   Formatter *f,
	   std::ostream& ss,
	   bufferlist& out) override {
    ceph_abort("should use async hook");
  }
  // Forward the command to the OSD; a malformed command map
  // (bad_cmd_get) is reported through on_finish as -EINVAL rather
  // than propagated as an exception.
  void call_async(
    std::string_view prefix,
    const cmdmap_t& cmdmap,
    Formatter *f,
    const bufferlist& inbl,
    std::function<void(int,const std::string&,bufferlist&)> on_finish) override {
    try {
      osd->asok_command(prefix, cmdmap, f, inbl, on_finish);
    } catch (const TOPNSPC::common::bad_cmd_get& e) {
      bufferlist empty;
      on_finish(-EINVAL, e.what(), empty);
    }
  }
};
2550
2551 std::set<int64_t> OSD::get_mapped_pools()
2552 {
2553 std::set<int64_t> pools;
2554 std::vector<spg_t> pgids;
2555 _get_pgids(&pgids);
2556 for (const auto &pgid : pgids) {
2557 pools.insert(pgid.pool());
2558 }
2559 return pools;
2560 }
2561
2562 OSD::PGRefOrError OSD::locate_asok_target(const cmdmap_t& cmdmap,
2563 stringstream& ss,
2564 bool only_primary)
2565 {
2566 string pgidstr;
2567 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
2568 ss << "no pgid specified";
2569 return OSD::PGRefOrError{std::nullopt, -EINVAL};
2570 }
2571
2572 pg_t pgid;
2573 if (!pgid.parse(pgidstr.c_str())) {
2574 ss << "couldn't parse pgid '" << pgidstr << "'";
2575 return OSD::PGRefOrError{std::nullopt, -EINVAL};
2576 }
2577
2578 spg_t pcand;
2579 PGRef pg;
2580 if (get_osdmap()->get_primary_shard(pgid, &pcand) && (pg = _lookup_lock_pg(pcand))) {
2581 if (pg->is_primary() || !only_primary) {
2582 return OSD::PGRefOrError{pg, 0};
2583 }
2584
2585 ss << "not primary for pgid " << pgid;
2586 pg->unlock();
2587 return OSD::PGRefOrError{std::nullopt, -EAGAIN};
2588 } else {
2589 ss << "i don't have pgid " << pgid;
2590 return OSD::PGRefOrError{std::nullopt, -ENOENT};
2591 }
2592 }
2593
2594 // note that the cmdmap is explicitly copied into asok_route_to_pg()
2595 int OSD::asok_route_to_pg(
2596 bool only_primary,
2597 std::string_view prefix,
2598 cmdmap_t cmdmap,
2599 Formatter* f,
2600 stringstream& ss,
2601 const bufferlist& inbl,
2602 bufferlist& outbl,
2603 std::function<void(int, const std::string&, bufferlist&)> on_finish)
2604 {
2605 auto [target_pg, ret] = locate_asok_target(cmdmap, ss, only_primary);
2606
2607 if (!target_pg.has_value()) {
2608 // 'ss' and 'ret' already contain the error information
2609 on_finish(ret, ss.str(), outbl);
2610 return ret;
2611 }
2612
2613 // the PG was locked by locate_asok_target()
2614 try {
2615 (*target_pg)->do_command(prefix, cmdmap, inbl, on_finish);
2616 (*target_pg)->unlock();
2617 return 0; // the pg handler calls on_finish directly
2618 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2619 (*target_pg)->unlock();
2620 ss << e.what();
2621 on_finish(ret, ss.str(), outbl);
2622 return -EINVAL;
2623 }
2624 }
2625
2626 void OSD::asok_command(
2627 std::string_view prefix, const cmdmap_t& cmdmap,
2628 Formatter *f,
2629 const bufferlist& inbl,
2630 std::function<void(int,const std::string&,bufferlist&)> on_finish)
2631 {
2632 int ret = 0;
2633 stringstream ss; // stderr error message stream
2634 bufferlist outbl; // if empty at end, we'll dump formatter as output
2635
2636 // --- PG commands are routed here to PG::do_command ---
2637 if (prefix == "pg" ||
2638 prefix == "query" ||
2639 prefix == "log" ||
2640 prefix == "mark_unfound_lost" ||
2641 prefix == "list_unfound" ||
2642 prefix == "scrub" ||
2643 prefix == "deep_scrub"
2644 ) {
2645 string pgidstr;
2646 pg_t pgid;
2647 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
2648 ss << "no pgid specified";
2649 ret = -EINVAL;
2650 goto out;
2651 }
2652 if (!pgid.parse(pgidstr.c_str())) {
2653 ss << "couldn't parse pgid '" << pgidstr << "'";
2654 ret = -EINVAL;
2655 goto out;
2656 }
2657 spg_t pcand;
2658 PGRef pg;
2659 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
2660 (pg = _lookup_lock_pg(pcand))) {
2661 if (pg->is_primary()) {
2662 cmdmap_t new_cmdmap = cmdmap;
2663 try {
2664 pg->do_command(prefix, new_cmdmap, inbl, on_finish);
2665 pg->unlock();
2666 return; // the pg handler calls on_finish directly
2667 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2668 pg->unlock();
2669 ss << e.what();
2670 ret = -EINVAL;
2671 goto out;
2672 }
2673 } else {
2674 ss << "not primary for pgid " << pgid;
2675 // do not reply; they will get newer maps and realize they
2676 // need to resend.
2677 pg->unlock();
2678 ret = -EAGAIN;
2679 goto out;
2680 }
2681 } else {
2682 ss << "i don't have pgid " << pgid;
2683 ret = -ENOENT;
2684 }
2685 }
2686
2687 // --- PG commands that will be answered even if !primary ---
2688
2689 else if (prefix == "scrubdebug") {
2690 asok_route_to_pg(false, prefix, cmdmap, f, ss, inbl, outbl, on_finish);
2691 return;
2692 }
2693
2694 // --- OSD commands follow ---
2695
2696 else if (prefix == "status") {
2697 lock_guard l(osd_lock);
2698 f->open_object_section("status");
2699 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2700 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2701 f->dump_unsigned("whoami", superblock.whoami);
2702 f->dump_string("state", get_state_name(get_state()));
2703 f->dump_unsigned("oldest_map", superblock.oldest_map);
2704 f->dump_unsigned("cluster_osdmap_trim_lower_bound",
2705 superblock.cluster_osdmap_trim_lower_bound);
2706 f->dump_unsigned("newest_map", superblock.newest_map);
2707 f->dump_unsigned("num_pgs", num_pgs);
2708 f->close_section();
2709 } else if (prefix == "flush_journal") {
2710 store->flush_journal();
2711 } else if (prefix == "dump_ops_in_flight" ||
2712 prefix == "ops" ||
2713 prefix == "dump_blocked_ops" ||
2714 prefix == "dump_blocked_ops_count" ||
2715 prefix == "dump_historic_ops" ||
2716 prefix == "dump_historic_ops_by_duration" ||
2717 prefix == "dump_historic_slow_ops") {
2718
2719 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2720 even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2721 will start to track new ops received afterwards.";
2722
2723 set<string> filters;
2724 vector<string> filter_str;
2725 if (cmd_getval(cmdmap, "filterstr", filter_str)) {
2726 copy(filter_str.begin(), filter_str.end(),
2727 inserter(filters, filters.end()));
2728 }
2729
2730 if (prefix == "dump_ops_in_flight" ||
2731 prefix == "ops") {
2732 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2733 ss << error_str;
2734 ret = -EINVAL;
2735 goto out;
2736 }
2737 }
2738 if (prefix == "dump_blocked_ops") {
2739 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2740 ss << error_str;
2741 ret = -EINVAL;
2742 goto out;
2743 }
2744 }
2745 if (prefix == "dump_blocked_ops_count") {
2746 if (!op_tracker.dump_ops_in_flight(f, true, filters, true)) {
2747 ss << error_str;
2748 ret = -EINVAL;
2749 goto out;
2750 }
2751 }
2752 if (prefix == "dump_historic_ops") {
2753 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2754 ss << error_str;
2755 ret = -EINVAL;
2756 goto out;
2757 }
2758 }
2759 if (prefix == "dump_historic_ops_by_duration") {
2760 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2761 ss << error_str;
2762 ret = -EINVAL;
2763 goto out;
2764 }
2765 }
2766 if (prefix == "dump_historic_slow_ops") {
2767 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2768 ss << error_str;
2769 ret = -EINVAL;
2770 goto out;
2771 }
2772 }
2773 } else if (prefix == "dump_op_pq_state") {
2774 f->open_object_section("pq");
2775 op_shardedwq.dump(f);
2776 f->close_section();
2777 } else if (prefix == "dump_blocklist") {
2778 list<pair<entity_addr_t,utime_t> > bl;
2779 list<pair<entity_addr_t,utime_t> > rbl;
2780 OSDMapRef curmap = service.get_osdmap();
2781 curmap->get_blocklist(&bl, &rbl);
2782
2783 f->open_array_section("blocklist");
2784 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2785 it != bl.end(); ++it) {
2786 f->open_object_section("entry");
2787 f->open_object_section("entity_addr_t");
2788 it->first.dump(f);
2789 f->close_section(); //entity_addr_t
2790 it->second.localtime(f->dump_stream("expire_time"));
2791 f->close_section(); //entry
2792 }
2793 f->close_section(); //blocklist
2794 f->open_array_section("range_blocklist");
2795 for (list<pair<entity_addr_t,utime_t> >::iterator it = rbl.begin();
2796 it != rbl.end(); ++it) {
2797 f->open_object_section("entry");
2798 f->open_object_section("entity_addr_t");
2799 it->first.dump(f);
2800 f->close_section(); //entity_addr_t
2801 it->second.localtime(f->dump_stream("expire_time"));
2802 f->close_section(); //entry
2803 }
2804 f->close_section(); //blocklist
2805 } else if (prefix == "dump_watchers") {
2806 list<obj_watch_item_t> watchers;
2807 // scan pg's
2808 vector<PGRef> pgs;
2809 _get_pgs(&pgs);
2810 for (auto& pg : pgs) {
2811 list<obj_watch_item_t> pg_watchers;
2812 pg->get_watchers(&pg_watchers);
2813 watchers.splice(watchers.end(), pg_watchers);
2814 }
2815
2816 f->open_array_section("watchers");
2817 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2818 it != watchers.end(); ++it) {
2819
2820 f->open_object_section("watch");
2821
2822 f->dump_string("namespace", it->obj.nspace);
2823 f->dump_string("object", it->obj.oid.name);
2824
2825 f->open_object_section("entity_name");
2826 it->wi.name.dump(f);
2827 f->close_section(); //entity_name_t
2828
2829 f->dump_unsigned("cookie", it->wi.cookie);
2830 f->dump_unsigned("timeout", it->wi.timeout_seconds);
2831
2832 f->open_object_section("entity_addr_t");
2833 it->wi.addr.dump(f);
2834 f->close_section(); //entity_addr_t
2835
2836 f->close_section(); //watch
2837 }
2838
2839 f->close_section(); //watchers
2840 } else if (prefix == "dump_recovery_reservations") {
2841 f->open_object_section("reservations");
2842 f->open_object_section("local_reservations");
2843 service.local_reserver.dump(f);
2844 f->close_section();
2845 f->open_object_section("remote_reservations");
2846 service.remote_reserver.dump(f);
2847 f->close_section();
2848 f->close_section();
2849 } else if (prefix == "dump_scrub_reservations") {
2850 f->open_object_section("scrub_reservations");
2851 service.get_scrub_services().dump_scrub_reservations(f);
2852 f->close_section();
2853 } else if (prefix == "get_latest_osdmap") {
2854 get_latest_osdmap();
2855 } else if (prefix == "set_heap_property") {
2856 string property;
2857 int64_t value = 0;
2858 string error;
2859 bool success = false;
2860 if (!cmd_getval(cmdmap, "property", property)) {
2861 error = "unable to get property";
2862 success = false;
2863 } else if (!cmd_getval(cmdmap, "value", value)) {
2864 error = "unable to get value";
2865 success = false;
2866 } else if (value < 0) {
2867 error = "negative value not allowed";
2868 success = false;
2869 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2870 error = "invalid property";
2871 success = false;
2872 } else {
2873 success = true;
2874 }
2875 f->open_object_section("result");
2876 f->dump_string("error", error);
2877 f->dump_bool("success", success);
2878 f->close_section();
2879 } else if (prefix == "get_heap_property") {
2880 string property;
2881 size_t value = 0;
2882 string error;
2883 bool success = false;
2884 if (!cmd_getval(cmdmap, "property", property)) {
2885 error = "unable to get property";
2886 success = false;
2887 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2888 error = "invalid property";
2889 success = false;
2890 } else {
2891 success = true;
2892 }
2893 f->open_object_section("result");
2894 f->dump_string("error", error);
2895 f->dump_bool("success", success);
2896 f->dump_int("value", value);
2897 f->close_section();
2898 } else if (prefix == "dump_objectstore_kv_stats") {
2899 store->get_db_statistics(f);
2900 } else if (prefix == "dump_scrubs") {
2901 service.get_scrub_services().dump_scrubs(f);
2902 } else if (prefix == "calc_objectstore_db_histogram") {
2903 store->generate_db_histogram(f);
2904 } else if (prefix == "flush_store_cache") {
2905 store->flush_cache(&ss);
2906 } else if (prefix == "rotate-stored-key") {
2907 store->write_meta("osd_key", inbl.to_str());
2908 } else if (prefix == "dump_pgstate_history") {
2909 f->open_object_section("pgstate_history");
2910 f->open_array_section("pgs");
2911 vector<PGRef> pgs;
2912 _get_pgs(&pgs);
2913 for (auto& pg : pgs) {
2914 f->open_object_section("pg");
2915 f->dump_stream("pg") << pg->pg_id;
2916 f->dump_string("currently", pg->get_current_state());
2917 pg->dump_pgstate_history(f);
2918 f->close_section();
2919 }
2920 f->close_section();
2921 f->close_section();
2922 } else if (prefix == "compact") {
2923 dout(1) << "triggering manual compaction" << dendl;
2924 auto start = ceph::coarse_mono_clock::now();
2925 store->compact();
2926 auto end = ceph::coarse_mono_clock::now();
2927 double duration = std::chrono::duration<double>(end-start).count();
2928 dout(1) << "finished manual compaction in "
2929 << duration
2930 << " seconds" << dendl;
2931 f->open_object_section("compact_result");
2932 f->dump_float("elapsed_time", duration);
2933 f->close_section();
2934 } else if (prefix == "get_mapped_pools") {
2935 f->open_array_section("mapped_pools");
2936 set<int64_t> poollist = get_mapped_pools();
2937 for (auto pool : poollist) {
2938 f->dump_int("pool_id", pool);
2939 }
2940 f->close_section();
2941 } else if (prefix == "smart") {
2942 string devid;
2943 cmd_getval(cmdmap, "devid", devid);
2944 ostringstream out;
2945 probe_smart(devid, out);
2946 outbl.append(out.str());
2947 } else if (prefix == "list_devices") {
2948 set<string> devnames;
2949 store->get_devices(&devnames);
2950 f->open_array_section("list_devices");
2951 for (auto dev : devnames) {
2952 if (dev.find("dm-") == 0) {
2953 continue;
2954 }
2955 string err;
2956 f->open_object_section("device");
2957 f->dump_string("device", "/dev/" + dev);
2958 f->dump_string("device_id", get_device_id(dev, &err));
2959 f->close_section();
2960 }
2961 f->close_section();
2962 } else if (prefix == "send_beacon") {
2963 lock_guard l(osd_lock);
2964 if (is_active()) {
2965 send_beacon(ceph::coarse_mono_clock::now());
2966 }
2967 }
2968
2969 else if (prefix == "cluster_log") {
2970 vector<string> msg;
2971 cmd_getval(cmdmap, "message", msg);
2972 if (msg.empty()) {
2973 ret = -EINVAL;
2974 ss << "ignoring empty log message";
2975 goto out;
2976 }
2977 string message = msg.front();
2978 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
2979 message += " " + *a;
2980 string lvl;
2981 cmd_getval(cmdmap, "level", lvl);
2982 clog_type level = string_to_clog_type(lvl);
2983 if (level < 0) {
2984 ret = -EINVAL;
2985 ss << "unknown level '" << lvl << "'";
2986 goto out;
2987 }
2988 clog->do_log(level, message);
2989 }
2990
2991 else if (prefix == "bench") {
2992 // default count 1G, size 4MB
2993 int64_t count = cmd_getval_or<int64_t>(cmdmap, "count", 1LL << 30);
2994 int64_t bsize = cmd_getval_or<int64_t>(cmdmap, "size", 4LL << 20);
2995 int64_t osize = cmd_getval_or<int64_t>(cmdmap, "object_size", 0);
2996 int64_t onum = cmd_getval_or<int64_t>(cmdmap, "object_num", 0);
2997 double elapsed = 0.0;
2998
2999 ret = run_osd_bench_test(count, bsize, osize, onum, &elapsed, ss);
3000 if (ret != 0) {
3001 goto out;
3002 }
3003
3004 double rate = count / elapsed;
3005 double iops = rate / bsize;
3006 f->open_object_section("osd_bench_results");
3007 f->dump_int("bytes_written", count);
3008 f->dump_int("blocksize", bsize);
3009 f->dump_float("elapsed_sec", elapsed);
3010 f->dump_float("bytes_per_sec", rate);
3011 f->dump_float("iops", iops);
3012 f->close_section();
3013 }
3014
3015 else if (prefix == "flush_pg_stats") {
3016 mgrc.send_pgstats();
3017 f->dump_unsigned("stat_seq", service.get_osd_stat_seq());
3018 }
3019
3020 else if (prefix == "heap") {
3021 std::stringstream outss;
3022 ret = ceph::osd_cmds::heap(*cct, cmdmap, outss, ss);
3023 outbl.append(outss);
3024 }
3025
3026 else if (prefix == "debug dump_missing") {
3027 f->open_array_section("pgs");
3028 vector<PGRef> pgs;
3029 _get_pgs(&pgs);
3030 for (auto& pg : pgs) {
3031 string s = stringify(pg->pg_id);
3032 f->open_array_section(s.c_str());
3033 pg->lock();
3034 pg->dump_missing(f);
3035 pg->unlock();
3036 f->close_section();
3037 }
3038 f->close_section();
3039 }
3040
3041 else if (prefix == "debug kick_recovery_wq") {
3042 int64_t delay;
3043 cmd_getval(cmdmap, "delay", delay);
3044 ostringstream oss;
3045 oss << delay;
3046 ret = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
3047 if (ret != 0) {
3048 ss << "kick_recovery_wq: error setting "
3049 << "osd_recovery_delay_start to '" << delay << "': error "
3050 << ret;
3051 goto out;
3052 }
3053 cct->_conf.apply_changes(nullptr);
3054 ss << "kicking recovery queue. set osd_recovery_delay_start "
3055 << "to " << cct->_conf->osd_recovery_delay_start;
3056 }
3057
3058 else if (prefix == "cpu_profiler") {
3059 ostringstream ds;
3060 string arg;
3061 cmd_getval(cmdmap, "arg", arg);
3062 vector<string> argvec;
3063 get_str_vec(arg, argvec);
3064 cpu_profiler_handle_command(argvec, ds);
3065 outbl.append(ds.str());
3066 }
3067
3068 else if (prefix == "dump_pg_recovery_stats") {
3069 lock_guard l(osd_lock);
3070 pg_recovery_stats.dump_formatted(f);
3071 }
3072
3073 else if (prefix == "reset_pg_recovery_stats") {
3074 lock_guard l(osd_lock);
3075 pg_recovery_stats.reset();
3076 }
3077
3078 else if (prefix == "perf histogram dump") {
3079 std::string logger;
3080 std::string counter;
3081 cmd_getval(cmdmap, "logger", logger);
3082 cmd_getval(cmdmap, "counter", counter);
3083 cct->get_perfcounters_collection()->dump_formatted_histograms(
3084 f, false, logger, counter);
3085 }
3086
3087 else if (prefix == "cache drop") {
3088 lock_guard l(osd_lock);
3089 dout(20) << "clearing all caches" << dendl;
3090 // Clear the objectstore's cache - onode and buffer for Bluestore,
3091 // system's pagecache for Filestore
3092 ret = store->flush_cache(&ss);
3093 if (ret < 0) {
3094 ss << "Error flushing objectstore cache: " << cpp_strerror(ret);
3095 goto out;
3096 }
3097 // Clear the objectcontext cache (per PG)
3098 vector<PGRef> pgs;
3099 _get_pgs(&pgs);
3100 for (auto& pg: pgs) {
3101 pg->clear_cache();
3102 }
3103 }
3104
3105 else if (prefix == "cache status") {
3106 lock_guard l(osd_lock);
3107 int obj_ctx_count = 0;
3108 vector<PGRef> pgs;
3109 _get_pgs(&pgs);
3110 for (auto& pg: pgs) {
3111 obj_ctx_count += pg->get_cache_obj_count();
3112 }
3113 f->open_object_section("cache_status");
3114 f->dump_int("object_ctx", obj_ctx_count);
3115 store->dump_cache_stats(f);
3116 f->close_section();
3117 }
3118
3119 else if (prefix == "scrub_purged_snaps") {
3120 lock_guard l(osd_lock);
3121 scrub_purged_snaps();
3122 }
3123
3124 else if (prefix == "dump_osd_network") {
3125 lock_guard l(osd_lock);
3126 int64_t value = 0;
3127 if (!(cmd_getval(cmdmap, "value", value))) {
3128 // Convert milliseconds to microseconds
3129 value = static_cast<double>(g_conf().get_val<double>(
3130 "mon_warn_on_slow_ping_time")) * 1000;
3131 if (value == 0) {
3132 double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
3133 value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
3134 value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
3135 }
3136 } else {
3137 // Convert user input to microseconds
3138 value *= 1000;
3139 }
3140 if (value < 0) value = 0;
3141
3142 struct osd_ping_time_t {
3143 uint32_t pingtime;
3144 int to;
3145 bool back;
3146 std::array<uint32_t,3> times;
3147 std::array<uint32_t,3> min;
3148 std::array<uint32_t,3> max;
3149 uint32_t last;
3150 uint32_t last_update;
3151
3152 bool operator<(const osd_ping_time_t& rhs) const {
3153 if (pingtime < rhs.pingtime)
3154 return true;
3155 if (pingtime > rhs.pingtime)
3156 return false;
3157 if (to < rhs.to)
3158 return true;
3159 if (to > rhs.to)
3160 return false;
3161 return back;
3162 }
3163 };
3164
3165 set<osd_ping_time_t> sorted;
3166 // Get pingtimes under lock and not on the stack
3167 map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
3168 service.get_hb_pingtime(pingtimes);
3169 for (auto j : *pingtimes) {
3170 if (j.second.last_update == 0)
3171 continue;
3172 osd_ping_time_t item;
3173 item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
3174 item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
3175 if (item.pingtime >= value) {
3176 item.to = j.first;
3177 item.times[0] = j.second.back_pingtime[0];
3178 item.times[1] = j.second.back_pingtime[1];
3179 item.times[2] = j.second.back_pingtime[2];
3180 item.min[0] = j.second.back_min[0];
3181 item.min[1] = j.second.back_min[1];
3182 item.min[2] = j.second.back_min[2];
3183 item.max[0] = j.second.back_max[0];
3184 item.max[1] = j.second.back_max[1];
3185 item.max[2] = j.second.back_max[2];
3186 item.last = j.second.back_last;
3187 item.back = true;
3188 item.last_update = j.second.last_update;
3189 sorted.emplace(item);
3190 }
3191 if (j.second.front_last == 0)
3192 continue;
3193 item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
3194 item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
3195 if (item.pingtime >= value) {
3196 item.to = j.first;
3197 item.times[0] = j.second.front_pingtime[0];
3198 item.times[1] = j.second.front_pingtime[1];
3199 item.times[2] = j.second.front_pingtime[2];
3200 item.min[0] = j.second.front_min[0];
3201 item.min[1] = j.second.front_min[1];
3202 item.min[2] = j.second.front_min[2];
3203 item.max[0] = j.second.front_max[0];
3204 item.max[1] = j.second.front_max[1];
3205 item.max[2] = j.second.front_max[2];
3206 item.last = j.second.front_last;
3207 item.last_update = j.second.last_update;
3208 item.back = false;
3209 sorted.emplace(item);
3210 }
3211 }
3212 delete pingtimes;
3213 //
3214 // Network ping times (1min 5min 15min)
3215 f->open_object_section("network_ping_times");
3216 f->dump_int("threshold", value / 1000);
3217 f->open_array_section("entries");
3218 for (auto &sitem : boost::adaptors::reverse(sorted)) {
3219 ceph_assert(sitem.pingtime >= value);
3220 f->open_object_section("entry");
3221
3222 const time_t lu(sitem.last_update);
3223 char buffer[26];
3224 string lustr(ctime_r(&lu, buffer));
3225 lustr.pop_back(); // Remove trailing \n
3226 auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3227 f->dump_string("last update", lustr);
3228 f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale);
3229 f->dump_int("from osd", whoami);
3230 f->dump_int("to osd", sitem.to);
3231 f->dump_string("interface", (sitem.back ? "back" : "front"));
3232 f->open_object_section("average");
3233 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
3234 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
3235 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
3236 f->close_section(); // average
3237 f->open_object_section("min");
3238 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3239 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3240 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3241 f->close_section(); // min
3242 f->open_object_section("max");
3243 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3244 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3245 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3246 f->close_section(); // max
3247 f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
3248 f->close_section(); // entry
3249 }
3250 f->close_section(); // entries
3251 f->close_section(); // network_ping_times
3252 } else if (prefix == "dump_pool_statfs") {
3253 lock_guard l(osd_lock);
3254
3255 int64_t p = 0;
3256 if (!(cmd_getval(cmdmap, "poolid", p))) {
3257 ss << "Error dumping pool statfs: no poolid provided";
3258 ret = -EINVAL;
3259 goto out;
3260 }
3261
3262 store_statfs_t st;
3263 bool per_pool_omap_stats = false;
3264
3265 ret = store->pool_statfs(p, &st, &per_pool_omap_stats);
3266 if (ret < 0) {
3267 ss << "Error dumping pool statfs: " << cpp_strerror(ret);
3268 goto out;
3269 } else {
3270 ss << "dumping pool statfs...";
3271 f->open_object_section("pool_statfs");
3272 f->dump_int("poolid", p);
3273 st.dump(f);
3274 f->close_section();
3275 }
3276 } else {
3277 ceph_abort_msg("broken asok registration");
3278 }
3279
3280 out:
3281 on_finish(ret, ss.str(), outbl);
3282 }
3283
// Run a synthetic write benchmark against the local object store
// (backend for "ceph tell osd.N bench").
//
// count   - total number of bytes to write.
// bsize   - per-write block size in bytes.
// osize   - if non-zero (with onum), size of each pre-created target object;
//           writes then go at random offsets into those objects.
// onum    - if non-zero (with osize), number of pre-created target objects.
// elapsed - out: wall-clock seconds the timed write phase took.
// ss      - stream for human-readable error text on rejection.
//
// Returns 0 on success, -EINVAL if the requested sizes/counts exceed the
// configured safety caps (osd_bench_*).
int OSD::run_osd_bench_test(
  int64_t count,
  int64_t bsize,
  int64_t osize,
  int64_t onum,
  double *elapsed,
  ostream &ss)
{
  int ret = 0;
  // Seed for the random payload bytes / object+offset selection below.
  srand(time(NULL) % (unsigned long) -1);
  uint32_t duration = cct->_conf->osd_bench_duration;

  if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
    // let us limit the block size because the next checks rely on it
    // having a sane value. If we allow any block size to be set things
    // can still go sideways.
    ss << "block 'size' values are capped at "
       << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
       << " a higher value, please adjust 'osd_bench_max_block_size'";
    ret = -EINVAL;
    return ret;
  } else if (bsize < (int64_t) (1 << 20)) {
    // entering the realm of small block sizes.
    // limit the count to a sane value, assuming a configurable amount of
    // IOPS and duration, so that the OSD doesn't get hung up on this,
    // preventing timeouts from going off
    int64_t max_count =
      bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
    if (count > max_count) {
      ss << "'count' values greater than " << max_count
         << " for a block size of " << byte_u_t(bsize) << ", assuming "
         << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
         << " for " << duration << " seconds,"
         << " can cause ill effects on osd. "
         << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
         << " value if you wish to use a higher 'count'.";
      ret = -EINVAL;
      return ret;
    }
  } else {
    // 1MB block sizes are big enough so that we get more stuff done.
    // However, to avoid the osd from getting hung on this and having
    // timers being triggered, we are going to limit the count assuming
    // a configurable throughput and duration.
    // NOTE: max_count is the total amount of bytes that we believe we
    //       will be able to write during 'duration' for the given
    //       throughput.  The block size hardly impacts this unless it's
    //       way too big.  Given we already check how big the block size
    //       is, it's safe to assume everything will check out.
    int64_t max_count =
      cct->_conf->osd_bench_large_size_max_throughput * duration;
    if (count > max_count) {
      ss << "'count' values greater than " << max_count
         << " for a block size of " << byte_u_t(bsize) << ", assuming "
         << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
         << " for " << duration << " seconds,"
         << " can cause ill effects on osd. "
         << " Please adjust 'osd_bench_large_size_max_throughput'"
         << " with a higher value if you wish to use a higher 'count'.";
      ret = -EINVAL;
      return ret;
    }
  }

  // Clamp the block size to the object size so the random-offset math
  // below (osize / bsize) is always >= 1, avoiding a divide-by-zero.
  if (osize && bsize > osize) {
    bsize = osize;
  }

  dout(1) << " bench count " << count
          << " bsize " << byte_u_t(bsize) << dendl;

  // Accumulates removals of everything we create; queued once at the end.
  ObjectStore::Transaction cleanupt;

  if (osize && onum) {
    // Pre-create 'onum' objects of 'osize' bytes; the timed loop then
    // overwrites random offsets within them instead of creating objects.
    bufferlist bl;
    bufferptr bp(osize);
    memset(bp.c_str(), 'a', bp.length());
    bl.push_back(std::move(bp));
    bl.rebuild_page_aligned();
    for (int i=0; i<onum; ++i) {
      char nm[30];
      snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
      object_t oid(nm);
      hobject_t soid(sobject_t(oid, 0));
      ObjectStore::Transaction t;
      // NOTE(review): coll_t() here vs coll_t::meta() in the timed loop —
      // these appear to address the same meta collection; confirm.
      t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
      store->queue_transaction(service.meta_ch, std::move(t), nullptr);
      cleanupt.remove(coll_t(), ghobject_t(soid));
    }
  }

  // Barrier: make sure the prefill writes are committed before timing starts.
  {
    C_SaferCond waiter;
    if (!service.meta_ch->flush_commit(&waiter)) {
      waiter.wait();
    }
  }

  bufferlist bl;
  utime_t start = ceph_clock_now();
  for (int64_t pos = 0; pos < count; pos += bsize) {
    char nm[34];
    unsigned offset = 0;
    bufferptr bp(bsize);
    memset(bp.c_str(), rand() & 0xff, bp.length());
    bl.push_back(std::move(bp));
    bl.rebuild_page_aligned();
    if (onum && osize) {
      // Overwrite a random bsize-aligned offset of a random prefilled object.
      snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
      offset = rand() % (osize / bsize) * bsize;
    } else {
      // One fresh object per write; cleaned up below.
      snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
    }
    object_t oid(nm);
    hobject_t soid(sobject_t(oid, 0));
    ObjectStore::Transaction t;
    t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
    store->queue_transaction(service.meta_ch, std::move(t), nullptr);
    if (!onum || !osize) {
      cleanupt.remove(coll_t::meta(), ghobject_t(soid));
    }
    bl.clear();
  }

  // Barrier: wait for all timed writes to commit so 'elapsed' covers them.
  {
    C_SaferCond waiter;
    if (!service.meta_ch->flush_commit(&waiter)) {
      waiter.wait();
    }
  }
  utime_t end = ceph_clock_now();
  *elapsed = end - start;

  // clean up
  store->queue_transaction(service.meta_ch, std::move(cleanupt), nullptr);
  {
    C_SaferCond waiter;
    if (!service.meta_ch->flush_commit(&waiter)) {
      waiter.wait();
    }
  }

  return ret;
}
3428
3429 class TestOpsSocketHook : public AdminSocketHook {
3430 OSDService *service;
3431 ObjectStore *store;
3432 public:
3433 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
3434 int call(std::string_view command, const cmdmap_t& cmdmap,
3435 const bufferlist&,
3436 Formatter *f,
3437 std::ostream& errss,
3438 bufferlist& out) override {
3439 int r = 0;
3440 stringstream outss;
3441 try {
3442 test_ops(service, store, command, cmdmap, outss);
3443 out.append(outss);
3444 } catch (const TOPNSPC::common::bad_cmd_get& e) {
3445 errss << e.what();
3446 r = -EINVAL;
3447 }
3448 return r;
3449 }
3450 void test_ops(OSDService *service, ObjectStore *store,
3451 std::string_view command, const cmdmap_t& cmdmap, ostream &ss);
3452
3453 };
3454
3455 class OSD::C_Tick : public Context {
3456 OSD *osd;
3457 public:
3458 explicit C_Tick(OSD *o) : osd(o) {}
3459 void finish(int r) override {
3460 osd->tick();
3461 }
3462 };
3463
3464 class OSD::C_Tick_WithoutOSDLock : public Context {
3465 OSD *osd;
3466 public:
3467 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
3468 void finish(int r) override {
3469 osd->tick_without_osd_lock();
3470 }
3471 };
3472
3473 int OSD::enable_disable_fuse(bool stop)
3474 {
3475 #ifdef HAVE_LIBFUSE
3476 int r;
3477 string mntpath = cct->_conf->osd_data + "/fuse";
3478 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
3479 dout(1) << __func__ << " disabling" << dendl;
3480 fuse_store->stop();
3481 delete fuse_store;
3482 fuse_store = NULL;
3483 r = ::rmdir(mntpath.c_str());
3484 if (r < 0) {
3485 r = -errno;
3486 derr << __func__ << " failed to rmdir " << mntpath << ": "
3487 << cpp_strerror(r) << dendl;
3488 return r;
3489 }
3490 return 0;
3491 }
3492 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
3493 dout(1) << __func__ << " enabling" << dendl;
3494 r = ::mkdir(mntpath.c_str(), 0700);
3495 if (r < 0)
3496 r = -errno;
3497 if (r < 0 && r != -EEXIST) {
3498 derr << __func__ << " unable to create " << mntpath << ": "
3499 << cpp_strerror(r) << dendl;
3500 return r;
3501 }
3502 fuse_store = new FuseStore(store.get(), mntpath);
3503 r = fuse_store->start();
3504 if (r < 0) {
3505 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
3506 delete fuse_store;
3507 fuse_store = NULL;
3508 return r;
3509 }
3510 }
3511 #endif // HAVE_LIBFUSE
3512 return 0;
3513 }
3514
3515 size_t OSD::get_num_cache_shards()
3516 {
3517 return cct->_conf.get_val<Option::size_t>("osd_num_cache_shards");
3518 }
3519
3520 int OSD::get_num_op_shards()
3521 {
3522 if (cct->_conf->osd_op_num_shards)
3523 return cct->_conf->osd_op_num_shards;
3524 if (store_is_rotational)
3525 return cct->_conf->osd_op_num_shards_hdd;
3526 else
3527 return cct->_conf->osd_op_num_shards_ssd;
3528 }
3529
3530 int OSD::get_num_op_threads()
3531 {
3532 if (cct->_conf->osd_op_num_threads_per_shard)
3533 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
3534 if (store_is_rotational)
3535 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
3536 else
3537 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
3538 }
3539
3540 float OSD::get_osd_recovery_sleep()
3541 {
3542 if (cct->_conf->osd_recovery_sleep)
3543 return cct->_conf->osd_recovery_sleep;
3544 if (!store_is_rotational && !journal_is_rotational)
3545 return cct->_conf->osd_recovery_sleep_ssd;
3546 else if (store_is_rotational && !journal_is_rotational)
3547 return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
3548 else
3549 return cct->_conf->osd_recovery_sleep_hdd;
3550 }
3551
3552 float OSD::get_osd_delete_sleep()
3553 {
3554 float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
3555 if (osd_delete_sleep > 0)
3556 return osd_delete_sleep;
3557 if (!store_is_rotational && !journal_is_rotational)
3558 return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
3559 if (store_is_rotational && !journal_is_rotational)
3560 return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
3561 return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
3562 }
3563
3564 int OSD::get_recovery_max_active()
3565 {
3566 if (cct->_conf->osd_recovery_max_active)
3567 return cct->_conf->osd_recovery_max_active;
3568 if (store_is_rotational)
3569 return cct->_conf->osd_recovery_max_active_hdd;
3570 else
3571 return cct->_conf->osd_recovery_max_active_ssd;
3572 }
3573
3574 float OSD::get_osd_snap_trim_sleep()
3575 {
3576 float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep");
3577 if (osd_snap_trim_sleep > 0)
3578 return osd_snap_trim_sleep;
3579 if (!store_is_rotational && !journal_is_rotational)
3580 return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd");
3581 if (store_is_rotational && !journal_is_rotational)
3582 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid");
3583 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd");
3584 }
3585
// One-time startup initialization of the OSD daemon.
//
// Mounts the objectstore, validates the superblock and compat features,
// loads the current OSDMap and PGs, wires up the messengers and mon/mgr
// clients, starts timers and worker threads, authenticates with the
// monitors, and finally kicks off the boot process.
//
// Returns 0 on success (including an early return if stopping); on error
// returns a negative errno after unwinding via the 'out' label (which
// disables fuse and unmounts/releases the store).  On authentication or
// crush-update failure the process exits directly.
int OSD::init()
{
  // Declared up front because the error path below uses goto, which may
  // not jump over initializations.
  OSDMapRef osdmap;
  CompatSet initial, diff;
  std::lock_guard lock(osd_lock);
  if (is_stopping())
    return 0;
  tracing::osd::tracer.init("osd");
  tick_timer.init();
  tick_timer_without_osd_lock.init();
  service.recovery_request_timer.init();
  service.sleep_timer.init();

  boot_finisher.start();

  // Remember which release this store last required, from store metadata.
  {
    string val;
    store->read_meta("require_osd_release", &val);
    last_require_osd_release = ceph_release_from_name(val);
  }

  // mount.
  dout(2) << "init " << dev_path
	  << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
	  << dendl;
  dout(2) << "journal " << journal_path << dendl;
  ceph_assert(store);  // call pre_init() first!

  store->set_cache_shards(get_num_cache_shards());

  int rotating_auth_attempts = 0;
  auto rotating_auth_timeout =
    g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");

  int r = store->mount();
  if (r < 0) {
    derr << "OSD:init: unable to mount object store" << dendl;
    return r;
  }
  journal_is_rotational = store->is_journal_rotational();
  dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
          << dendl;

  enable_disable_fuse(false);

  dout(2) << "boot" << dendl;

  service.meta_ch = store->open_collection(coll_t::meta());
  if (!service.meta_ch) {
    derr << "OSD:init: unable to open meta collection"
         << dendl;
    r = -ENOENT;
    goto out;
  }
  // initialize the daily loadavg with current 15min loadavg
  double loadavgs[3];
  if (getloadavg(loadavgs, 3) == 3) {
    daily_loadavg = loadavgs[2];
  } else {
    derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
    daily_loadavg = 1.0;
  }

  // sanity check long object name handling
  {
    hobject_t l;
    l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
    l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
    l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
    r = store->validate_hobject_key(l);
    if (r < 0) {
      derr << "backend (" << store->get_type() << ") is unable to support max "
	   << "object name[space] len" << dendl;
      derr << "   osd max object name len = "
	   << cct->_conf->osd_max_object_name_len << dendl;
      derr << "   osd max object namespace len = "
	   << cct->_conf->osd_max_object_namespace_len << dendl;
      derr << cpp_strerror(r) << dendl;
      // Only fatal when the operator asked for the startup check.
      if (cct->_conf->osd_check_max_object_name_len_on_startup) {
	goto out;
      }
      derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
	   << dendl;
    } else {
      dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
    }
  }

  // read superblock
  r = read_superblock();
  if (r < 0) {
    derr << "OSD::init() : unable to read osd superblock" << dendl;
    r = -EINVAL;
    goto out;
  }

  // Refuse to run against an on-disk format newer than this binary supports.
  if (osd_compat.compare(superblock.compat_features) < 0) {
    derr << "The disk uses features unsupported by the executable." << dendl;
    derr << " ondisk features " << superblock.compat_features << dendl;
    derr << " daemon features " << osd_compat << dendl;

    if (osd_compat.writeable(superblock.compat_features)) {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "it is still writeable, though. Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
    else {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "Cannot write to disk! Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
  }

  assert_warn(whoami == superblock.whoami);
  if (whoami != superblock.whoami) {
    derr << "OSD::init: superblock says osd"
	 << superblock.whoami << " but I am osd." << whoami << dendl;
    r = -EINVAL;
    goto out;
  }

  startup_time = ceph::mono_clock::now();

  // load up "current" osdmap
  assert_warn(!get_osdmap());
  if (get_osdmap()) {
    derr << "OSD::init: unable to read current osdmap" << dendl;
    r = -EINVAL;
    goto out;
  }
  osdmap = get_map(superblock.current_epoch);
  set_osdmap(osdmap);

  // make sure we don't have legacy pgs deleting
  {
    vector<coll_t> ls;
    int r = store->list_collections(ls);
    ceph_assert(r >= 0);
    for (auto c : ls) {
      spg_t pgid;
      // A PG collection for a pool the map no longer knows must have a
      // preserved pg_pool_t record; otherwise it is a pre-luminous
      // deletion that cannot be completed here.
      if (c.is_pg(&pgid) &&
	  !osdmap->have_pg_pool(pgid.pool())) {
	ghobject_t oid = make_final_pool_info_oid(pgid.pool());
	if (!store->exists(service.meta_ch, oid)) {
	  derr << __func__ << " missing pg_pool_t for deleted pool "
	       << pgid.pool() << " for pg " << pgid
	       << "; please downgrade to luminous and allow "
	       << "pg deletion to complete before upgrading" << dendl;
	  ceph_abort();
	}
      }
    }
  }

  initial = get_osd_initial_compat_set();
  diff = superblock.compat_features.unsupported(initial);
  if (superblock.compat_features.merge(initial)) {
    // Are we adding SNAPMAPPER2?
    if (diff.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2)) {
      dout(1) << __func__ << " upgrade snap_mapper (first start as octopus)"
	      << dendl;
      auto ch = service.meta_ch;
      auto hoid = make_snapmapper_oid();
      unsigned max = cct->_conf->osd_target_transaction_size;
      r = SnapMapper::convert_legacy(cct, store.get(), ch, hoid, max);
      if (r < 0)
	goto out;
    }
    // We need to persist the new compat_set before we
    // do anything else
    dout(5) << "Upgrading superblock adding: " << diff << dendl;

    if (!superblock.cluster_osdmap_trim_lower_bound) {
      superblock.cluster_osdmap_trim_lower_bound = superblock.oldest_map;
    }

    ObjectStore::Transaction t;
    write_superblock(t);
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }

  // make sure snap mapper object exists
  if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
    dout(10) << "init creating/touching snapmapper object" << dendl;
    ObjectStore::Transaction t;
    t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }
  if (!store->exists(service.meta_ch, OSD::make_purged_snaps_oid())) {
    dout(10) << "init creating/touching purged_snaps object" << dendl;
    ObjectStore::Transaction t;
    t.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }

  if (cct->_conf->osd_open_classes_on_start) {
    int r = ClassHandler::get_instance().open_all_classes();
    if (r)
      dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
  }

  check_osdmap_features();

  {
    epoch_t bind_epoch = osdmap->get_epoch();
    service.set_epochs(NULL, NULL, &bind_epoch);
  }

  clear_temp_objects();

  // initialize osdmap references in sharded wq
  for (auto& shard : shards) {
    std::lock_guard l(shard->osdmap_lock);
    shard->shard_osdmap = osdmap;
  }

  // load up pgs (as they previously existed)
  load_pgs();

  dout(2) << "superblock: I am osd." << superblock.whoami << dendl;

  if (cct->_conf.get_val<bool>("osd_compact_on_start")) {
    dout(2) << "compacting object store's omap" << dendl;
    store->compact();
  }

  // prime osd stats
  {
    struct store_statfs_t stbuf;
    osd_alert_list_t alerts;
    int r = store->statfs(&stbuf, &alerts);
    ceph_assert(r == 0);
    service.set_statfs(stbuf, alerts);
  }

  // client_messenger's auth_client will be set up by monc->init() later.
  for (auto m : { cluster_messenger,
	objecter_messenger,
	hb_front_client_messenger,
	hb_back_client_messenger,
	hb_front_server_messenger,
	hb_back_server_messenger } ) {
    m->set_auth_client(monc);
  }
  for (auto m : { client_messenger,
	cluster_messenger,
	hb_front_server_messenger,
	hb_back_server_messenger }) {
    m->set_auth_server(monc);
  }
  monc->set_handle_authentication_dispatcher(this);

  monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
                      | CEPH_ENTITY_TYPE_MGR);
  r = monc->init();
  if (r < 0)
    goto out;

  mgrc.set_pgstats_cb([this]() { return collect_pg_stats(); });
  mgrc.set_perf_metric_query_cb(
    [this](const ConfigPayload &config_payload) {
        set_perf_queries(config_payload);
      },
      [this] {
        return get_perf_reports();
      });
  mgrc.init();

  // tell monc about log_client so it will know about mon session resets
  monc->set_log_client(&log_client);
  update_log_config();

  // i'm ready!
  client_messenger->add_dispatcher_tail(&mgrc);
  client_messenger->add_dispatcher_tail(this);
  cluster_messenger->add_dispatcher_head(this);

  hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);

  objecter_messenger->add_dispatcher_head(service.objecter.get());

  service.init();
  service.publish_map(osdmap);
  service.publish_superblock(superblock);

  // Prime pending split/merge state for any PGs whose maps lag the
  // current osdmap.
  for (auto& shard : shards) {
    // put PGs in a temporary set because we may modify pg_slots
    // unordered_map below.
    set<PGRef> pgs;
    for (auto& i : shard->pg_slots) {
      PGRef pg = i.second->pg;
      if (!pg) {
	continue;
      }
      pgs.insert(pg);
    }
    for (auto pg : pgs) {
      std::scoped_lock l{*pg};
      set<pair<spg_t,epoch_t>> new_children;
      set<pair<spg_t,epoch_t>> merge_pgs;
      service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
					 &new_children, &merge_pgs);
      if (!new_children.empty()) {
	for (auto shard : shards) {
	  shard->prime_splits(osdmap, &new_children);
	}
	assert(new_children.empty());
      }
      if (!merge_pgs.empty()) {
	for (auto shard : shards) {
	  shard->prime_merges(osdmap, &merge_pgs);
	}
	assert(merge_pgs.empty());
      }
    }
  }

  osd_op_tp.start();

  // start the heartbeat
  heartbeat_thread.create("osd_srv_heartbt");

  // tick
  tick_timer.add_event_after(get_tick_interval(),
			     new C_Tick(this));
  {
    std::lock_guard l(tick_timer_lock);
    tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
						new C_Tick_WithoutOSDLock(this));
  }

  // Drop osd_lock while talking to the monitors; re-acquired below.
  osd_lock.unlock();

  r = monc->authenticate();
  if (r < 0) {
    derr << __func__ << " authentication failed: " << cpp_strerror(r)
         << dendl;
    exit(1);
  }

  while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
    derr << "unable to obtain rotating service keys; retrying" << dendl;
    ++rotating_auth_attempts;
    if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
      derr << __func__ << " wait_auth_rotating timed out"
	   <<" -- maybe I have a clock skew against the monitors?" << dendl;
      exit(1);
    }
  }

  r = update_crush_device_class();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_device_class: "
	 << cpp_strerror(r) << dendl;
    exit(1);
  }

  r = update_crush_location();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_location: "
         << cpp_strerror(r) << dendl;
    exit(1);
  }

  osd_lock.lock();
  if (is_stopping())
    return 0;

  // start objecter *after* we have authenticated, so that we don't ignore
  // the OSDMaps it requests.
  service.final_init();

  check_config();

  dout(10) << "ensuring pgs have consumed prior maps" << dendl;
  consume_map();

  dout(0) << "done with init, starting boot process" << dendl;

  // subscribe to any pg creations
  monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);

  // MgrClient needs this (it doesn't have MonClient reference itself)
  monc->sub_want("mgrmap", 0, 0);

  // we don't need to ask for an osdmap here; objecter will
  //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);

  monc->renew_subs();

  start_boot();

  // Override a few options if mclock scheduler is enabled.
  maybe_override_sleep_options_for_qos();
  maybe_override_cost_for_qos();
  maybe_override_options_for_qos();
  maybe_override_max_osd_capacity_for_qos();

  return 0;

out:
  // Error path: tear down anything mounted so far and release the store.
  enable_disable_fuse(true);
  store->umount();
  store.reset();
  return r;
}
4003
4004 void OSD::final_init()
4005 {
4006 AdminSocket *admin_socket = cct->get_admin_socket();
4007 asok_hook = new OSDSocketHook(this);
4008 int r = admin_socket->register_command("status", asok_hook,
4009 "high-level status of OSD");
4010 ceph_assert(r == 0);
4011 r = admin_socket->register_command("flush_journal",
4012 asok_hook,
4013 "flush the journal to permanent store");
4014 ceph_assert(r == 0);
4015 r = admin_socket->register_command("dump_ops_in_flight " \
4016 "name=filterstr,type=CephString,n=N,req=false",
4017 asok_hook,
4018 "show the ops currently in flight");
4019 ceph_assert(r == 0);
4020 r = admin_socket->register_command("ops " \
4021 "name=filterstr,type=CephString,n=N,req=false",
4022 asok_hook,
4023 "show the ops currently in flight");
4024 ceph_assert(r == 0);
4025 r = admin_socket->register_command("dump_blocked_ops " \
4026 "name=filterstr,type=CephString,n=N,req=false",
4027 asok_hook,
4028 "show the blocked ops currently in flight");
4029 ceph_assert(r == 0);
4030 r = admin_socket->register_command("dump_blocked_ops_count " \
4031 "name=filterstr,type=CephString,n=N,req=false",
4032 asok_hook,
4033 "show the count of blocked ops currently in flight");
4034 ceph_assert(r == 0);
4035 r = admin_socket->register_command("dump_historic_ops " \
4036 "name=filterstr,type=CephString,n=N,req=false",
4037 asok_hook,
4038 "show recent ops");
4039 ceph_assert(r == 0);
4040 r = admin_socket->register_command("dump_historic_slow_ops " \
4041 "name=filterstr,type=CephString,n=N,req=false",
4042 asok_hook,
4043 "show slowest recent ops");
4044 ceph_assert(r == 0);
4045 r = admin_socket->register_command("dump_historic_ops_by_duration " \
4046 "name=filterstr,type=CephString,n=N,req=false",
4047 asok_hook,
4048 "show slowest recent ops, sorted by duration");
4049 ceph_assert(r == 0);
4050 r = admin_socket->register_command("dump_op_pq_state",
4051 asok_hook,
4052 "dump op queue state");
4053 ceph_assert(r == 0);
4054 r = admin_socket->register_command("dump_blocklist",
4055 asok_hook,
4056 "dump blocklisted clients and times");
4057 ceph_assert(r == 0);
4058 r = admin_socket->register_command("dump_watchers",
4059 asok_hook,
4060 "show clients which have active watches,"
4061 " and on which objects");
4062 ceph_assert(r == 0);
4063 r = admin_socket->register_command("dump_recovery_reservations",
4064 asok_hook,
4065 "show recovery reservations");
4066 ceph_assert(r == 0);
4067 r = admin_socket->register_command("dump_scrub_reservations",
4068 asok_hook,
4069 "show scrub reservations");
4070 ceph_assert(r == 0);
4071 r = admin_socket->register_command("get_latest_osdmap",
4072 asok_hook,
4073 "force osd to update the latest map from "
4074 "the mon");
4075 ceph_assert(r == 0);
4076
4077 r = admin_socket->register_command("set_heap_property " \
4078 "name=property,type=CephString " \
4079 "name=value,type=CephInt",
4080 asok_hook,
4081 "update malloc extension heap property");
4082 ceph_assert(r == 0);
4083
4084 r = admin_socket->register_command("get_heap_property " \
4085 "name=property,type=CephString",
4086 asok_hook,
4087 "get malloc extension heap property");
4088 ceph_assert(r == 0);
4089
4090 r = admin_socket->register_command("dump_objectstore_kv_stats",
4091 asok_hook,
4092 "print statistics of kvdb which used by bluestore");
4093 ceph_assert(r == 0);
4094
4095 r = admin_socket->register_command("dump_scrubs",
4096 asok_hook,
4097 "print scheduled scrubs");
4098 ceph_assert(r == 0);
4099
4100 r = admin_socket->register_command("calc_objectstore_db_histogram",
4101 asok_hook,
4102 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
4103 ceph_assert(r == 0);
4104
4105 r = admin_socket->register_command("flush_store_cache",
4106 asok_hook,
4107 "Flush bluestore internal cache");
4108 ceph_assert(r == 0);
4109 r = admin_socket->register_command("rotate-stored-key",
4110 asok_hook,
4111 "Update the stored osd_key");
4112 ceph_assert(r == 0);
4113 r = admin_socket->register_command("dump_pgstate_history",
4114 asok_hook,
4115 "show recent state history");
4116 ceph_assert(r == 0);
4117
4118 r = admin_socket->register_command("compact",
4119 asok_hook,
4120 "Commpact object store's omap."
4121 " WARNING: Compaction probably slows your requests");
4122 ceph_assert(r == 0);
4123
4124 r = admin_socket->register_command("get_mapped_pools",
4125 asok_hook,
4126 "dump pools whose PG(s) are mapped to this OSD.");
4127
4128 ceph_assert(r == 0);
4129
4130 r = admin_socket->register_command("smart name=devid,type=CephString,req=false",
4131 asok_hook,
4132 "probe OSD devices for SMART data.");
4133
4134 ceph_assert(r == 0);
4135
4136 r = admin_socket->register_command("list_devices",
4137 asok_hook,
4138 "list OSD devices.");
4139 r = admin_socket->register_command("send_beacon",
4140 asok_hook,
4141 "send OSD beacon to mon immediately");
4142
4143 r = admin_socket->register_command(
4144 "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
4145 "Dump osd heartbeat network ping times");
4146 ceph_assert(r == 0);
4147
4148 r = admin_socket->register_command(
4149 "dump_pool_statfs name=poolid,type=CephInt,req=true", asok_hook,
4150 "Dump store's statistics for the given pool");
4151 ceph_assert(r == 0);
4152
4153 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store.get());
4154 // Note: pools are CephString instead of CephPoolname because
4155 // these commands traditionally support both pool names and numbers
4156 r = admin_socket->register_command(
4157 "setomapval " \
4158 "name=pool,type=CephString " \
4159 "name=objname,type=CephObjectname " \
4160 "name=key,type=CephString "\
4161 "name=val,type=CephString",
4162 test_ops_hook,
4163 "set omap key");
4164 ceph_assert(r == 0);
4165 r = admin_socket->register_command(
4166 "rmomapkey " \
4167 "name=pool,type=CephString " \
4168 "name=objname,type=CephObjectname " \
4169 "name=key,type=CephString",
4170 test_ops_hook,
4171 "remove omap key");
4172 ceph_assert(r == 0);
4173 r = admin_socket->register_command(
4174 "setomapheader " \
4175 "name=pool,type=CephString " \
4176 "name=objname,type=CephObjectname " \
4177 "name=header,type=CephString",
4178 test_ops_hook,
4179 "set omap header");
4180 ceph_assert(r == 0);
4181
4182 r = admin_socket->register_command(
4183 "getomap " \
4184 "name=pool,type=CephString " \
4185 "name=objname,type=CephObjectname",
4186 test_ops_hook,
4187 "output entire object map");
4188 ceph_assert(r == 0);
4189
4190 r = admin_socket->register_command(
4191 "truncobj " \
4192 "name=pool,type=CephString " \
4193 "name=objname,type=CephObjectname " \
4194 "name=len,type=CephInt",
4195 test_ops_hook,
4196 "truncate object to length");
4197 ceph_assert(r == 0);
4198
4199 r = admin_socket->register_command(
4200 "injectdataerr " \
4201 "name=pool,type=CephString " \
4202 "name=objname,type=CephObjectname " \
4203 "name=shardid,type=CephInt,req=false,range=0|255",
4204 test_ops_hook,
4205 "inject data error to an object");
4206 ceph_assert(r == 0);
4207
4208 r = admin_socket->register_command(
4209 "injectmdataerr " \
4210 "name=pool,type=CephString " \
4211 "name=objname,type=CephObjectname " \
4212 "name=shardid,type=CephInt,req=false,range=0|255",
4213 test_ops_hook,
4214 "inject metadata error to an object");
4215 ceph_assert(r == 0);
4216 r = admin_socket->register_command(
4217 "set_recovery_delay " \
4218 "name=utime,type=CephInt,req=false",
4219 test_ops_hook,
4220 "Delay osd recovery by specified seconds");
4221 ceph_assert(r == 0);
4222 r = admin_socket->register_command(
4223 "injectfull " \
4224 "name=type,type=CephString,req=false " \
4225 "name=count,type=CephInt,req=false ",
4226 test_ops_hook,
4227 "Inject a full disk (optional count times)");
4228 ceph_assert(r == 0);
4229 r = admin_socket->register_command(
4230 "bench " \
4231 "name=count,type=CephInt,req=false " \
4232 "name=size,type=CephInt,req=false " \
4233 "name=object_size,type=CephInt,req=false " \
4234 "name=object_num,type=CephInt,req=false ",
4235 asok_hook,
4236 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
4237 "(default count=1G default size=4MB). Results in log.");
4238 ceph_assert(r == 0);
4239 r = admin_socket->register_command(
4240 "cluster_log " \
4241 "name=level,type=CephChoices,strings=error,warning,info,debug " \
4242 "name=message,type=CephString,n=N",
4243 asok_hook,
4244 "log a message to the cluster log");
4245 ceph_assert(r == 0);
4246 r = admin_socket->register_command(
4247 "flush_pg_stats",
4248 asok_hook,
4249 "flush pg stats");
4250 ceph_assert(r == 0);
4251 r = admin_socket->register_command(
4252 "heap " \
4253 "name=heapcmd,type=CephChoices,strings=" \
4254 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
4255 "name=value,type=CephString,req=false",
4256 asok_hook,
4257 "show heap usage info (available only if compiled with tcmalloc)");
4258 ceph_assert(r == 0);
4259 r = admin_socket->register_command(
4260 "debug dump_missing " \
4261 "name=filename,type=CephFilepath",
4262 asok_hook,
4263 "dump missing objects to a named file");
4264 ceph_assert(r == 0);
4265 r = admin_socket->register_command(
4266 "debug kick_recovery_wq " \
4267 "name=delay,type=CephInt,range=0",
4268 asok_hook,
4269 "set osd_recovery_delay_start to <val>");
4270 ceph_assert(r == 0);
4271 r = admin_socket->register_command(
4272 "cpu_profiler " \
4273 "name=arg,type=CephChoices,strings=status|flush",
4274 asok_hook,
4275 "run cpu profiling on daemon");
4276 ceph_assert(r == 0);
4277 r = admin_socket->register_command(
4278 "dump_pg_recovery_stats",
4279 asok_hook,
4280 "dump pg recovery statistics");
4281 ceph_assert(r == 0);
4282 r = admin_socket->register_command(
4283 "reset_pg_recovery_stats",
4284 asok_hook,
4285 "reset pg recovery statistics");
4286 ceph_assert(r == 0);
4287 r = admin_socket->register_command(
4288 "cache drop",
4289 asok_hook,
4290 "Drop all OSD caches");
4291 ceph_assert(r == 0);
4292 r = admin_socket->register_command(
4293 "cache status",
4294 asok_hook,
4295 "Get OSD caches statistics");
4296 ceph_assert(r == 0);
4297 r = admin_socket->register_command(
4298 "scrub_purged_snaps",
4299 asok_hook,
4300 "Scrub purged_snaps vs snapmapper index");
4301 ceph_assert(r == 0);
4302 r = admin_socket->register_command(
4303 "scrubdebug " \
4304 "name=pgid,type=CephPgid " \
4305 "name=cmd,type=CephChoices,strings=block|unblock|set|unset " \
4306 "name=value,type=CephString,req=false",
4307 asok_hook,
4308 "debug the scrubber");
4309 ceph_assert(r == 0);
4310
4311 // -- pg commands --
4312 // old form: ceph pg <pgid> command ...
4313 r = admin_socket->register_command(
4314 "pg " \
4315 "name=pgid,type=CephPgid " \
4316 "name=cmd,type=CephChoices,strings=query",
4317 asok_hook,
4318 "");
4319 ceph_assert(r == 0);
4320 r = admin_socket->register_command(
4321 "pg " \
4322 "name=pgid,type=CephPgid " \
4323 "name=cmd,type=CephChoices,strings=log",
4324 asok_hook,
4325 "");
4326 ceph_assert(r == 0);
4327 r = admin_socket->register_command(
4328 "pg " \
4329 "name=pgid,type=CephPgid " \
4330 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
4331 "name=mulcmd,type=CephChoices,strings=revert|delete",
4332 asok_hook,
4333 "");
4334 ceph_assert(r == 0);
4335 r = admin_socket->register_command(
4336 "pg " \
4337 "name=pgid,type=CephPgid " \
4338 "name=cmd,type=CephChoices,strings=list_unfound " \
4339 "name=offset,type=CephString,req=false",
4340 asok_hook,
4341 "");
4342 ceph_assert(r == 0);
4343 r = admin_socket->register_command(
4344 "pg " \
4345 "name=pgid,type=CephPgid " \
4346 "name=cmd,type=CephChoices,strings=scrub " \
4347 "name=time,type=CephInt,req=false",
4348 asok_hook,
4349 "");
4350 ceph_assert(r == 0);
4351 r = admin_socket->register_command(
4352 "pg " \
4353 "name=pgid,type=CephPgid " \
4354 "name=cmd,type=CephChoices,strings=deep_scrub " \
4355 "name=time,type=CephInt,req=false",
4356 asok_hook,
4357 "");
4358 ceph_assert(r == 0);
4359 // new form: tell <pgid> <cmd> for both cli and rest
4360 r = admin_socket->register_command(
4361 "query",
4362 asok_hook,
4363 "show details of a specific pg");
4364 ceph_assert(r == 0);
4365 r = admin_socket->register_command(
4366 "log",
4367 asok_hook,
4368 "dump pg_log of a specific pg");
4369 ceph_assert(r == 0);
4370 r = admin_socket->register_command(
4371 "mark_unfound_lost " \
4372 "name=pgid,type=CephPgid,req=false " \
4373 "name=mulcmd,type=CephChoices,strings=revert|delete",
4374 asok_hook,
4375 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
4376 ceph_assert(r == 0);
4377 r = admin_socket->register_command(
4378 "list_unfound " \
4379 "name=pgid,type=CephPgid,req=false " \
4380 "name=offset,type=CephString,req=false",
4381 asok_hook,
4382 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4383 ceph_assert(r == 0);
4384 r = admin_socket->register_command(
4385 "scrub " \
4386 "name=pgid,type=CephPgid,req=false " \
4387 "name=time,type=CephInt,req=false",
4388 asok_hook,
4389 "Trigger a scheduled scrub ");
4390 ceph_assert(r == 0);
4391 r = admin_socket->register_command(
4392 "deep_scrub " \
4393 "name=pgid,type=CephPgid,req=false " \
4394 "name=time,type=CephInt,req=false",
4395 asok_hook,
4396 "Trigger a scheduled deep scrub ");
4397 ceph_assert(r == 0);
4398 }
4399
4400 PerfCounters* OSD::create_logger()
4401 {
4402 PerfCounters* logger = build_osd_logger(cct);
4403 cct->get_perfcounters_collection()->add(logger);
4404 return logger;
4405 }
4406
4407 PerfCounters* OSD::create_recoverystate_perf()
4408 {
4409 PerfCounters* recoverystate_perf = build_recoverystate_perf(cct);
4410 cct->get_perfcounters_collection()->add(recoverystate_perf);
4411 return recoverystate_perf;
4412 }
4413
// Shut the OSD down.
//
// Two paths:
//  * fast shutdown (osd_fast_shutdown=true): stop the op queue, drain the
//    op threadpool, unmount the store, then _exit(0) without further
//    cleanup — the process never returns from this function on this path
//    (unless the store has a null freelist manager, in which case we
//    _exit(0) even earlier, before unmounting).
//  * graceful shutdown: tear everything down in dependency order
//    (PGs, hooks, heartbeats, timers, superblock, monc, store, messengers).
//
// NOTE(review): the teardown order below is deliberate and fragile;
// see e.g. https://tracker.ceph.com/issues/56101 for why mgrc is stopped
// before the store is unmounted.
int OSD::shutdown()
{
  // vstart overwrites osd_fast_shutdown value in the conf file -> force the value here!
  //cct->_conf->osd_fast_shutdown = true;

  dout(0) << "Fast Shutdown: - cct->_conf->osd_fast_shutdown = "
	  << cct->_conf->osd_fast_shutdown
	  << ", null-fm = " << store->has_null_manager() << dendl;

  utime_t start_time_func = ceph_clock_now();

  if (cct->_conf->osd_fast_shutdown) {
    derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
    if (cct->_conf->osd_fast_shutdown_notify_mon)
      service.prepare_to_stop();

    // There is no state we need to keep when running in NULL-FM mode:
    // just flush the log and exit immediately, skipping even the umount.
    if (!store->has_null_manager()) {
      cct->_log->flush();
      _exit(0);
    }
  } else if (!service.prepare_to_stop()) {
    return 0; // already shutting down
  }

  osd_lock.lock();
  if (is_stopping()) {
    osd_lock.unlock();
    return 0;
  }

  if (!cct->_conf->osd_fast_shutdown) {
    dout(0) << "shutdown" << dendl;
  }

  // don't accept new task for this OSD
  set_state(STATE_STOPPING);

  // Crank up debug levels for the rest of the (slow) shutdown, if
  // requested; debugging stays disabled during fast-shutdown.
  if (!cct->_conf->osd_fast_shutdown && cct->_conf.get_val<bool>("osd_debug_shutdown")) {
    cct->_conf.set_val("debug_osd", "100");
    cct->_conf.set_val("debug_journal", "100");
    cct->_conf.set_val("debug_filestore", "100");
    cct->_conf.set_val("debug_bluestore", "100");
    cct->_conf.set_val("debug_ms", "100");
    cct->_conf.apply_changes(nullptr);
  }

  // stop MgrClient earlier as it's more like an internal consumer of OSD
  //
  // should occur before unmounting the database in fast-shutdown to avoid
  // a race condition (see https://tracker.ceph.com/issues/56101)
  mgrc.shutdown();

  if (cct->_conf->osd_fast_shutdown) {
    // first, stop new task from being taken from op_shardedwq
    // and clear all pending tasks
    op_shardedwq.stop_for_fast_shutdown();

    utime_t start_time_timer = ceph_clock_now();
    tick_timer.shutdown();
    {
      std::lock_guard l(tick_timer_lock);
      tick_timer_without_osd_lock.shutdown();
    }

    osd_lock.unlock();
    utime_t start_time_osd_drain = ceph_clock_now();

    // then, wait on osd_op_tp to drain (TBD: should probably add a timeout)
    osd_op_tp.drain();
    osd_op_tp.stop();

    utime_t start_time_umount = ceph_clock_now();
    store->prepare_for_fast_shutdown();
    std::lock_guard lock(osd_lock);
    // TBD: assert in allocator that nothing is being add
    store->umount();

    utime_t end_time = ceph_clock_now();
    // enforce the configured upper bound on fast-shutdown duration
    if (cct->_conf->osd_fast_shutdown_timeout) {
      ceph_assert(end_time - start_time_func < cct->_conf->osd_fast_shutdown_timeout);
    }
    dout(0) <<"Fast Shutdown duration total     :" << end_time              - start_time_func       << " seconds" << dendl;
    dout(0) <<"Fast Shutdown duration osd_drain :" << start_time_umount     - start_time_osd_drain  << " seconds" << dendl;
    dout(0) <<"Fast Shutdown duration umount    :" << end_time              - start_time_umount     << " seconds" << dendl;
    dout(0) <<"Fast Shutdown duration timer     :" << start_time_osd_drain  - start_time_timer      << " seconds" << dendl;
    cct->_log->flush();

    // now it is safe to exit
    _exit(0);
  }

  // ---- graceful (slow) shutdown path from here on ----

  service.start_shutdown();

  // stop sending work to pgs.  this just prevents any new work in _process
  // from racing with on_shutdown and potentially entering the pg after.
  op_shardedwq.drain();

  // Shutdown PGs
  {
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto pg : pgs) {
      pg->shutdown();
    }
  }

  // drain op queue again (in case PGs requeued something)
  op_shardedwq.drain();

  // unregister commands
  cct->get_admin_socket()->unregister_commands(asok_hook);
  delete asok_hook;
  asok_hook = NULL;

  cct->get_admin_socket()->unregister_commands(test_ops_hook);
  delete test_ops_hook;
  test_ops_hook = NULL;

  osd_lock.unlock();

  // stop the heartbeat thread, then sever all heartbeat connections
  {
    std::lock_guard l{heartbeat_lock};
    heartbeat_stop = true;
    heartbeat_cond.notify_all();
    heartbeat_peers.clear();
  }
  heartbeat_thread.join();

  hb_back_server_messenger->mark_down_all();
  hb_front_server_messenger->mark_down_all();
  hb_front_client_messenger->mark_down_all();
  hb_back_client_messenger->mark_down_all();

  osd_op_tp.drain();
  osd_op_tp.stop();
  dout(10) << "op sharded tp stopped" << dendl;

  dout(10) << "stopping agent" << dendl;
  service.agent_stop();

  boot_finisher.wait_for_empty();

  osd_lock.lock();

  boot_finisher.stop();
  reset_heartbeat_peers(true);

  tick_timer.shutdown();

  {
    std::lock_guard l(tick_timer_lock);
    tick_timer_without_osd_lock.shutdown();
  }

  // note unmount epoch
  dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl;
  superblock.mounted = service.get_boot_epoch();
  superblock.clean_thru = get_osdmap_epoch();
  ObjectStore::Transaction t;
  write_superblock(t);
  // NOTE: r is returned from this function below; a superblock write
  // failure is logged but does not abort the rest of the teardown.
  int r = store->queue_transaction(service.meta_ch, std::move(t));
  if (r) {
    derr << "OSD::shutdown: error writing superblock: "
	 << cpp_strerror(r) << dendl;
  }


  service.shutdown_reserver();

  // Remove PGs
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif
  while (true) {
    vector<PGRef> pgs;
    _get_pgs(&pgs, true);
    if (pgs.empty()) {
      break;
    }
    for (auto& pg : pgs) {
      if (pg->is_deleted()) {
	continue;
      }
      dout(20) << " kicking pg " << pg << dendl;
      pg->lock();
      // at this point we expect to hold the last reference to each PG
      if (pg->get_num_ref() != 1) {
	derr << "pgid " << pg->get_pgid() << " has ref count of "
	     << pg->get_num_ref() << dendl;
#ifdef PG_DEBUG_REFS
	pg->dump_live_ids();
#endif
	if (cct->_conf->osd_shutdown_pgref_assert) {
	  ceph_abort();
	}
      }
      pg->ch.reset();
      pg->unlock();
    }
  }
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif

  // remove_observer may block on config callbacks; drop osd_lock around it
  osd_lock.unlock();
  cct->_conf.remove_observer(this);
  osd_lock.lock();

  service.meta_ch.reset();

  dout(10) << "syncing store" << dendl;
  enable_disable_fuse(true);

  if (cct->_conf->osd_journal_flush_on_shutdown) {
    dout(10) << "flushing journal" << dendl;
    store->flush_journal();
  }

  monc->shutdown();
  osd_lock.unlock();
  // release all OSDMap references before the store goes away
  {
    std::unique_lock l{map_lock};
    set_osdmap(OSDMapRef());
  }
  for (auto s : shards) {
    std::lock_guard l(s->osdmap_lock);
    s->shard_osdmap = OSDMapRef();
  }
  service.shutdown();

  std::lock_guard lock(osd_lock);
  store->umount();
  store.reset();
  dout(10) << "Store synced" << dendl;

  op_tracker.on_shutdown();

  ClassHandler::get_instance().shutdown();
  client_messenger->shutdown();
  cluster_messenger->shutdown();
  hb_front_client_messenger->shutdown();
  hb_back_client_messenger->shutdown();
  objecter_messenger->shutdown();
  hb_front_server_messenger->shutdown();
  hb_back_server_messenger->shutdown();

  utime_t duration = ceph_clock_now() - start_time_func;
  dout(0) <<"Slow Shutdown duration:" << duration << " seconds" << dendl;


  // propagate the superblock-write result (0 on success)
  return r;
}
4667
4668 int OSD::mon_cmd_maybe_osd_create(string &cmd)
4669 {
4670 bool created = false;
4671 while (true) {
4672 dout(10) << __func__ << " cmd: " << cmd << dendl;
4673 vector<string> vcmd{cmd};
4674 bufferlist inbl;
4675 C_SaferCond w;
4676 string outs;
4677 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
4678 int r = w.wait();
4679 if (r < 0) {
4680 if (r == -ENOENT && !created) {
4681 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
4682 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
4683 vector<string> vnewcmd{newcmd};
4684 bufferlist inbl;
4685 C_SaferCond w;
4686 string outs;
4687 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
4688 int r = w.wait();
4689 if (r < 0) {
4690 derr << __func__ << " fail: osd does not exist and created failed: "
4691 << cpp_strerror(r) << dendl;
4692 return r;
4693 }
4694 created = true;
4695 continue;
4696 }
4697 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
4698 return r;
4699 }
4700 break;
4701 }
4702
4703 return 0;
4704 }
4705
4706 int OSD::update_crush_location()
4707 {
4708 if (!cct->_conf->osd_crush_update_on_start) {
4709 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
4710 return 0;
4711 }
4712
4713 char weight[32];
4714 if (cct->_conf->osd_crush_initial_weight >= 0) {
4715 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
4716 } else {
4717 struct store_statfs_t st;
4718 osd_alert_list_t alerts;
4719 int r = store->statfs(&st, &alerts);
4720 if (r < 0) {
4721 derr << "statfs: " << cpp_strerror(r) << dendl;
4722 return r;
4723 }
4724 snprintf(weight, sizeof(weight), "%.4lf",
4725 std::max(.00001,
4726 double(st.total) /
4727 double(1ull << 40 /* TB */)));
4728 }
4729
4730 dout(10) << __func__ << " crush location is " << cct->crush_location << dendl;
4731
4732 string cmd =
4733 string("{\"prefix\": \"osd crush create-or-move\", ") +
4734 string("\"id\": ") + stringify(whoami) + ", " +
4735 string("\"weight\":") + weight + ", " +
4736 string("\"args\": [") + stringify(cct->crush_location) + "]}";
4737 return mon_cmd_maybe_osd_create(cmd);
4738 }
4739
4740 int OSD::update_crush_device_class()
4741 {
4742 if (!cct->_conf->osd_class_update_on_start) {
4743 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
4744 return 0;
4745 }
4746
4747 string device_class;
4748 int r = store->read_meta("crush_device_class", &device_class);
4749 if (r < 0 || device_class.empty()) {
4750 device_class = store->get_default_device_class();
4751 }
4752
4753 if (device_class.empty()) {
4754 dout(20) << __func__ << " no device class stored locally" << dendl;
4755 return 0;
4756 }
4757
4758 string cmd =
4759 string("{\"prefix\": \"osd crush set-device-class\", ") +
4760 string("\"class\": \"") + device_class + string("\", ") +
4761 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
4762
4763 r = mon_cmd_maybe_osd_create(cmd);
4764 if (r == -EBUSY) {
4765 // good, already bound to a device-class
4766 return 0;
4767 } else {
4768 return r;
4769 }
4770 }
4771
4772 void OSD::write_superblock(ObjectStore::Transaction& t)
4773 {
4774 dout(10) << "write_superblock " << superblock << dendl;
4775
4776 //hack: at minimum it's using the baseline feature set
4777 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
4778 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4779
4780 bufferlist bl;
4781 encode(superblock, bl);
4782 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
4783 }
4784
4785 int OSD::read_superblock()
4786 {
4787 bufferlist bl;
4788 int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
4789 if (r < 0)
4790 return r;
4791
4792 auto p = bl.cbegin();
4793 decode(superblock, p);
4794
4795 dout(10) << "read_superblock " << superblock << dendl;
4796
4797 return 0;
4798 }
4799
4800 void OSD::clear_temp_objects()
4801 {
4802 dout(10) << __func__ << dendl;
4803 vector<coll_t> ls;
4804 store->list_collections(ls);
4805 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
4806 spg_t pgid;
4807 if (!p->is_pg(&pgid))
4808 continue;
4809
4810 // list temp objects
4811 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
4812
4813 vector<ghobject_t> temps;
4814 ghobject_t next;
4815 while (1) {
4816 vector<ghobject_t> objects;
4817 auto ch = store->open_collection(*p);
4818 ceph_assert(ch);
4819 store->collection_list(ch, next, ghobject_t::get_max(),
4820 store->get_ideal_list_max(),
4821 &objects, &next);
4822 if (objects.empty())
4823 break;
4824 vector<ghobject_t>::iterator q;
4825 for (q = objects.begin(); q != objects.end(); ++q) {
4826 // Hammer set pool for temps to -1, so check for clean-up
4827 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
4828 temps.push_back(*q);
4829 } else {
4830 break;
4831 }
4832 }
4833 // If we saw a non-temp object and hit the break above we can
4834 // break out of the while loop too.
4835 if (q != objects.end())
4836 break;
4837 }
4838 if (!temps.empty()) {
4839 ObjectStore::Transaction t;
4840 int removed = 0;
4841 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
4842 dout(20) << " removing " << *p << " object " << *q << dendl;
4843 t.remove(*p, *q);
4844 if (++removed > cct->_conf->osd_target_transaction_size) {
4845 store->queue_transaction(service.meta_ch, std::move(t));
4846 t = ObjectStore::Transaction();
4847 removed = 0;
4848 }
4849 }
4850 if (removed) {
4851 store->queue_transaction(service.meta_ch, std::move(t));
4852 }
4853 }
4854 }
4855 }
4856
// Delete every object in collection tmp (belonging to pg pgid) and then the
// collection itself, also purging each object's entry from the snap mapper.
// Work is batched into transactions of osd_target_transaction_size objects;
// the function blocks until the final transaction has committed.
void OSD::recursive_remove_collection(CephContext* cct,
				      ObjectStore *store, spg_t pgid,
				      coll_t tmp)
{
  // driver targeting the global snapmapper object in the meta collection
  OSDriver driver(
    store,
    coll_t(),
    make_snapmapper_oid());

  ObjectStore::CollectionHandle ch = store->open_collection(tmp);
  ObjectStore::Transaction t;
  SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);

  ghobject_t next;
  int max = cct->_conf->osd_target_transaction_size;
  vector<ghobject_t> objects;
  objects.reserve(max);
  while (true) {
    objects.clear();
    // list the next batch of up to max objects, resuming at 'next'
    store->collection_list(ch, next, ghobject_t::get_max(),
			   max, &objects, &next);
    generic_dout(10) << __func__ << " " << objects << dendl;
    if (objects.empty())
      break;
    for (auto& p: objects) {
      // drop the snap-mapper entry first, then the object itself;
      // -ENOENT is fine (object had no snap mapping)
      OSDriver::OSTransaction _t(driver.get_transaction(&t));
      int r = mapper.remove_oid(p.hobj, &_t);
      if (r != 0 && r != -ENOENT)
	ceph_abort();
      t.remove(tmp, p);
    }
    int r = store->queue_transaction(ch, std::move(t));
    ceph_assert(r == 0);
    t = ObjectStore::Transaction();
  }
  // all objects gone; remove the now-empty collection
  t.remove_collection(tmp);
  int r = store->queue_transaction(ch, std::move(t));
  ceph_assert(r == 0);

  // wait for the final transaction to commit before returning
  C_SaferCond waiter;
  if (!ch->flush_commit(&waiter)) {
    waiter.wait();
  }
}
4901
4902
4903 // ======================================================
4904 // PG's
4905
// Construct an (unregistered) PG object for pgid using pool metadata from
// createmap, or — if the pool has since been deleted — from the "final pool
// info" tombstone object stored in the meta collection.  Returns nullptr if
// the tombstone is missing or predates the ec_profile field.
PG* OSD::_make_pg(
  OSDMapRef createmap,
  spg_t pgid)
{
  dout(10) << __func__ << " " << pgid << dendl;
  pg_pool_t pi;
  map<string,string> ec_profile;
  string name;
  if (createmap->have_pg_pool(pgid.pool())) {
    // pool still exists in this map: read its info directly
    pi = *createmap->get_pg_pool(pgid.pool());
    name = createmap->get_pool_name(pgid.pool());
    if (pi.is_erasure()) {
      ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
    }
  } else {
    // pool was deleted; grab final pg_pool_t off disk.
    ghobject_t oid = make_final_pool_info_oid(pgid.pool());
    bufferlist bl;
    int r = store->read(service.meta_ch, oid, 0, 0, bl);
    if (r < 0) {
      derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
	   << dendl;
      return nullptr;
    }
    ceph_assert(r >= 0);
    // tombstone layout: pg_pool_t, pool name, then (since v13.0.2) ec_profile
    auto p = bl.cbegin();
    decode(pi, p);
    decode(name, p);
    if (p.end()) { // dev release v13.0.2 did not include ec_profile
      derr << __func__ << " missing ec_profile from pool " << pgid.pool()
	   << " tombstone" << dendl;
      return nullptr;
    }
    decode(ec_profile, p);
  }
  PGPool pool(createmap, pgid.pool(), pi, name);
  PG *pg;
  // both replicated and EC pools are backed by PrimaryLogPG
  if (pi.type == pg_pool_t::TYPE_REPLICATED ||
      pi.type == pg_pool_t::TYPE_ERASURE)
    pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
  else
    ceph_abort();
  return pg;
}
4950
4951 void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
4952 {
4953 v->clear();
4954 v->reserve(get_num_pgs());
4955 for (auto& s : shards) {
4956 std::lock_guard l(s->shard_lock);
4957 for (auto& j : s->pg_slots) {
4958 if (j.second->pg &&
4959 !j.second->pg->is_deleted()) {
4960 v->push_back(j.second->pg);
4961 if (clear_too) {
4962 s->_detach_pg(j.second.get());
4963 }
4964 }
4965 }
4966 }
4967 }
4968
4969 void OSD::_get_pgids(vector<spg_t> *v)
4970 {
4971 v->clear();
4972 v->reserve(get_num_pgs());
4973 for (auto& s : shards) {
4974 std::lock_guard l(s->shard_lock);
4975 for (auto& j : s->pg_slots) {
4976 if (j.second->pg &&
4977 !j.second->pg->is_deleted()) {
4978 v->push_back(j.first);
4979 }
4980 }
4981 }
4982 }
4983
4984 void OSD::register_pg(PGRef pg)
4985 {
4986 spg_t pgid = pg->get_pgid();
4987 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4988 auto sdata = shards[shard_index];
4989 std::lock_guard l(sdata->shard_lock);
4990 auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
4991 ceph_assert(r.second);
4992 auto *slot = r.first->second.get();
4993 dout(20) << __func__ << " " << pgid << " " << pg << dendl;
4994 sdata->_attach_pg(slot, pg.get());
4995 }
4996
// Final step of PG deletion: detach the PG from its shard slot, unprime any
// pending split children, and decrement the matching pg-count perf counter.
// Returns false (deletion must be retried later) if the slot is already
// gone or the PG is waiting on a merge epoch.
bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
{
  auto sdata = pg->osd_shard;
  ceph_assert(sdata);
  {
    std::lock_guard l(sdata->shard_lock);
    auto p = sdata->pg_slots.find(pg->pg_id);
    if (p == sdata->pg_slots.end() ||
	!p->second->pg) {
      dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
      return false;
    }
    if (p->second->waiting_for_merge_epoch) {
      dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
      return false;
    }
    dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
    sdata->_detach_pg(p->second.get());
  }

  // make sure no shard still expects this PG to split
  for (auto shard : shards) {
    shard->unprime_split_children(pg->pg_id, old_pg_num);
  }

  // update pg count now since we might not get an osdmap any time soon.
  if (pg->is_primary())
    service.logger->dec(l_osd_pg_primary);
  else if (pg->is_nonprimary())
    service.logger->dec(l_osd_pg_replica); // counter name is a misnomer: covers all non-primary
  else
    service.logger->dec(l_osd_pg_stray);

  return true;
}
5031
5032 PGRef OSD::_lookup_pg(spg_t pgid)
5033 {
5034 uint32_t shard_index = pgid.hash_to_shard(num_shards);
5035 auto sdata = shards[shard_index];
5036 std::lock_guard l(sdata->shard_lock);
5037 auto p = sdata->pg_slots.find(pgid);
5038 if (p == sdata->pg_slots.end()) {
5039 return nullptr;
5040 }
5041 return p->second->pg;
5042 }
5043
5044 PGRef OSD::_lookup_lock_pg(spg_t pgid)
5045 {
5046 PGRef pg = _lookup_pg(pgid);
5047 if (!pg) {
5048 return nullptr;
5049 }
5050 pg->lock();
5051 if (!pg->is_deleted()) {
5052 return pg;
5053 }
5054 pg->unlock();
5055 return nullptr;
5056 }
5057
// Public wrapper around _lookup_lock_pg(): returns the PG with its lock
// held, or null if it does not exist or is deleted.
PGRef OSD::lookup_lock_pg(spg_t pgid)
{
  return _lookup_lock_pg(pgid);
}
5062
// Scan the ObjectStore at startup and register every surviving PG with
// its shard.  Caller must hold osd_lock.
//
// Temp collections and PGs flagged for removal are deleted outright;
// unrecognized collections are ignored; everything else is instantiated
// via _make_pg() against the map recorded in its metadata, has its state
// read from disk, and is registered (unless it turns out not to exist,
// in which case its collection is removed).
void OSD::load_pgs()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  dout(0) << "load_pgs" << dendl;

  {
    // recover the pg_num change history; needed to reason about splits
    // and merges that happened while this OSD was down
    auto pghist = make_pg_num_history_oid();
    bufferlist bl;
    int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
    if (r >= 0 && bl.length() > 0) {
      auto p = bl.cbegin();
      decode(pg_num_history, p);
    }
    dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
  }

  vector<coll_t> ls;
  int r = store->list_collections(ls);
  if (r < 0) {
    derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
  }

  int num = 0;  // count of PGs successfully opened
  for (vector<coll_t>::iterator it = ls.begin();
       it != ls.end();
       ++it) {
    spg_t pgid;
    if (it->is_temp(&pgid) ||
        (it->is_pg(&pgid) && PG::_has_removal_flag(store.get(), pgid))) {
      dout(10) << "load_pgs " << *it
               << " removing, legacy or flagged for removal pg" << dendl;
      recursive_remove_collection(cct, store.get(), pgid, *it);
      continue;
    }

    if (!it->is_pg(&pgid)) {
      dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
      continue;
    }

    dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
    epoch_t map_epoch = 0;
    int r = PG::peek_map_epoch(store.get(), pgid, &map_epoch);
    if (r < 0) {
      derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
           << dendl;
      continue;
    }

    PGRef pg;
    if (map_epoch > 0) {
      // instantiate against the map the PG was last written with
      OSDMapRef pgosdmap = service.try_get_map(map_epoch);
      if (!pgosdmap) {
        if (!get_osdmap()->have_pg_pool(pgid.pool())) {
          derr << __func__ << ": could not find map for epoch " << map_epoch
               << " on pg " << pgid << ", but the pool is not present in the "
               << "current map, so this is probably a result of bug 10617. "
               << "Skipping the pg for now, you can use ceph-objectstore-tool "
               << "to clean it up later." << dendl;
          continue;
        } else {
          derr << __func__ << ": have pgid " << pgid << " at epoch "
               << map_epoch << ", but missing map. Crashing."
               << dendl;
          ceph_abort_msg("Missing map in load_pgs");
        }
      }
      pg = _make_pg(pgosdmap, pgid);
    } else {
      pg = _make_pg(get_osdmap(), pgid);
    }
    if (!pg) {
      recursive_remove_collection(cct, store.get(), pgid, *it);
      continue;
    }

    // there can be no waiters here, so we don't call _wake_pg_slot

    pg->lock();
    pg->ch = store->open_collection(pg->coll);

    // read pg state, log
    pg->read_state(store.get());

    if (pg->dne()) {
      dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
      pg->ch = nullptr;
      pg->unlock();
      recursive_remove_collection(cct, store.get(), pgid, *it);
      continue;
    }
    {
      // route this collection's on-commit contexts to the owning shard
      uint32_t shard_index = pgid.hash_to_shard(shards.size());
      assert(NULL != shards[shard_index]);
      store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
    }

    dout(10) << __func__ << " loaded " << *pg << dendl;
    pg->unlock();

    register_pg(pg);
    ++num;
  }
  dout(0) << __func__ << " opened " << num << " pgs" << dendl;
}
5168
5169
5170 PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
5171 const PGCreateInfo *info)
5172 {
5173 spg_t pgid = info->pgid;
5174
5175 if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
5176 dout(10) << __func__ << " hit max pg, dropping" << dendl;
5177 return nullptr;
5178 }
5179
5180 OSDMapRef startmap = get_map(info->epoch);
5181
5182 if (info->by_mon) {
5183 int64_t pool_id = pgid.pgid.pool();
5184 const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
5185 if (!pool) {
5186 dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
5187 return nullptr;
5188 }
5189 if (osdmap->require_osd_release >= ceph_release_t::nautilus &&
5190 !pool->has_flag(pg_pool_t::FLAG_CREATING)) {
5191 // this ensures we do not process old creating messages after the
5192 // pool's initial pgs have been created (and pg are subsequently
5193 // allowed to split or merge).
5194 dout(20) << __func__ << " dropping " << pgid
5195 << "create, pool does not have CREATING flag set" << dendl;
5196 return nullptr;
5197 }
5198 }
5199
5200 int up_primary, acting_primary;
5201 vector<int> up, acting;
5202 startmap->pg_to_up_acting_osds(
5203 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
5204
5205 const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
5206 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
5207 store->get_type() != "bluestore") {
5208 clog->warn() << "pg " << pgid
5209 << " is at risk of silent data corruption: "
5210 << "the pool allows ec overwrites but is not stored in "
5211 << "bluestore, so deep scrubbing will not detect bitrot";
5212 }
5213 PeeringCtx rctx;
5214 create_pg_collection(
5215 rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
5216 init_pg_ondisk(rctx.transaction, pgid, pp);
5217
5218 int role = startmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
5219
5220 PGRef pg = _make_pg(startmap, pgid);
5221 pg->ch = store->create_new_collection(pg->coll);
5222
5223 {
5224 uint32_t shard_index = pgid.hash_to_shard(shards.size());
5225 assert(NULL != shards[shard_index]);
5226 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
5227 }
5228
5229 pg->lock(true);
5230
5231 // we are holding the shard lock
5232 ceph_assert(!pg->is_deleted());
5233
5234 pg->init(
5235 role,
5236 up,
5237 up_primary,
5238 acting,
5239 acting_primary,
5240 info->history,
5241 info->past_intervals,
5242 rctx.transaction);
5243
5244 pg->init_collection_pool_opts();
5245
5246 if (pg->is_primary()) {
5247 std::lock_guard locker{m_perf_queries_lock};
5248 pg->set_dynamic_perf_stats_queries(m_perf_queries);
5249 }
5250
5251 pg->handle_initialize(rctx);
5252 pg->handle_activate_map(rctx);
5253
5254 dispatch_context(rctx, pg.get(), osdmap, nullptr);
5255
5256 dout(10) << __func__ << " new pg " << *pg << dendl;
5257 return pg;
5258 }
5259
// Check whether instantiating pgid would push this OSD past its hard
// per-OSD PG limit (mon_max_pg_per_osd * osd_max_pg_per_osd_hard_ratio).
//
// Returns false if there is room.  Otherwise records the deferred
// create — bumping the count of pending mon-initiated creates, or
// remembering the pg (and whether we would be its primary) in
// pending_creates_from_osd — so resume_creating_pg() can retry later,
// and returns true.
bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
                                spg_t pgid,
                                bool is_mon_create)
{
  const auto max_pgs_per_osd =
    (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
     cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));

  if (num_pgs < max_pgs_per_osd) {
    return false;
  }

  std::lock_guard l(pending_creates_lock);
  if (is_mon_create) {
    pending_creates_from_mon++;
  } else {
    bool is_primary = osdmap->get_pg_acting_role(pgid, whoami) == 0;
    pending_creates_from_osd.emplace(pgid, is_primary);
  }
  dout(1) << __func__ << " withhold creation of pg " << pgid
          << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
  return true;
}
5283
// To re-trigger peering we must perturb the pg mapping a little; see
// PG::should_restart_peering().  OSDMap::pg_to_up_acting_osds() falls
// back to the up set when pg_temp is empty, so an empty pg_temp won't
// work: either shrink a multi-OSD acting set down to its first member,
// or pad a short one with a trailing -1.
static std::vector<int32_t> twiddle(const std::vector<int>& acting) {
  std::vector<int32_t> twiddled;
  if (acting.size() > 1) {
    twiddled.push_back(acting[0]);
  } else {
    twiddled.assign(acting.begin(), acting.end());
    twiddled.push_back(-1);
  }
  return twiddled;
}
5296
// Retry PG creations that maybe_wait_for_max_pg() withheld, now that
// PG count may have dropped below the hard limit.
//
// Spends any "spare" PG capacity first on mon-initiated creates (by
// re-soliciting pg-create messages from the monitor) and then on
// osd-initiated creates (by twiddling pg_temp to force re-peering).
// Adjusts osdmap subscriptions so we keep receiving maps while creates
// remain pending, and finally flushes queued pg_temp requests.
void OSD::resume_creating_pg()
{
  bool do_sub_pg_creates = false;
  bool have_pending_creates = false;
  {
    const auto max_pgs_per_osd =
      (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
       cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
    if (max_pgs_per_osd <= num_pgs) {
      // this could happen if admin decreases this setting before a PG is removed
      return;
    }
    unsigned spare_pgs = max_pgs_per_osd - num_pgs;
    std::lock_guard l(pending_creates_lock);
    if (pending_creates_from_mon > 0) {
      dout(20) << __func__ << " pending_creates_from_mon "
               << pending_creates_from_mon << dendl;
      do_sub_pg_creates = true;
      if (pending_creates_from_mon >= spare_pgs) {
        spare_pgs = pending_creates_from_mon = 0;
      } else {
        spare_pgs -= pending_creates_from_mon;
        pending_creates_from_mon = 0;
      }
    }
    // spend remaining capacity on osd-initiated creates: twiddle
    // pg_temp so the mapping change re-triggers peering for each pg
    auto pg = pending_creates_from_osd.cbegin();
    while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
      dout(20) << __func__ << " pg " << pg->first << dendl;
      vector<int> acting;
      get_osdmap()->pg_to_up_acting_osds(pg->first.pgid, nullptr, nullptr, &acting, nullptr);
      service.queue_want_pg_temp(pg->first.pgid, twiddle(acting), true);
      pg = pending_creates_from_osd.erase(pg);
      do_sub_pg_creates = true;
      spare_pgs--;
    }
    have_pending_creates = (pending_creates_from_mon > 0 ||
                            !pending_creates_from_osd.empty());
  }

  bool do_renew_subs = false;
  if (do_sub_pg_creates) {
    if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
      dout(4) << __func__ << ": resolicit pg creates from mon since "
              << last_pg_create_epoch << dendl;
      do_renew_subs = true;
    }
  }
  version_t start = get_osdmap_epoch() + 1;
  if (have_pending_creates) {
    // don't miss any new osdmap deleting PGs
    if (monc->sub_want("osdmap", start, 0)) {
      dout(4) << __func__ << ": resolicit osdmap from mon since "
              << start << dendl;
      do_renew_subs = true;
    }
  } else if (do_sub_pg_creates) {
    // no need to subscribe the osdmap continuously anymore
    // once the pgtemp and/or mon_subscribe(pg_creates) is sent
    if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
      dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
              << start << dendl;
      do_renew_subs = true;
    }
  }

  if (do_renew_subs) {
    monc->renew_subs();
  }

  service.send_pg_temp();
}
5368
// Ensure osd.p is in our heartbeat peer set: create back and front
// heartbeat connections and sessions if it is new, and refresh the
// entry's map epoch either way.
//
// No-op for ourselves, or when no heartbeat connection can be obtained
// from the current map.
void OSD::_add_heartbeat_peer(int p)
{
  if (p == whoami)
    return;
  HeartbeatInfo *hi;

  map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
  if (i == heartbeat_peers.end()) {
    // new peer: establish back and front heartbeat connections
    pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, get_osdmap_epoch());
    if (!cons.first)
      return;
    assert(cons.second);

    hi = &heartbeat_peers[p];
    hi->peer = p;

    // both sessions share the same clock-delta stamp tracker for p
    auto stamps = service.get_hb_stamps(p);

    auto sb = ceph::make_ref<Session>(cct, cons.first.get());
    sb->peer = p;
    sb->stamps = stamps;
    hi->hb_interval_start = ceph_clock_now();
    hi->con_back = cons.first.get();
    hi->con_back->set_priv(sb);

    auto sf = ceph::make_ref<Session>(cct, cons.second.get());
    sf->peer = p;
    sf->stamps = stamps;
    hi->con_front = cons.second.get();
    hi->con_front->set_priv(sf);

    dout(10) << "_add_heartbeat_peer: new peer osd." << p
             << " " << hi->con_back->get_peer_addr()
             << " " << hi->con_front->get_peer_addr()
             << dendl;
  } else {
    hi = &i->second;
  }
  // refresh epoch so maybe_update_heartbeat_peers() does not treat this
  // peer as a stale extra
  hi->epoch = get_osdmap_epoch();
}
5409
5410 void OSD::_remove_heartbeat_peer(int n)
5411 {
5412 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
5413 ceph_assert(q != heartbeat_peers.end());
5414 dout(20) << " removing heartbeat peer osd." << n
5415 << " " << q->second.con_back->get_peer_addr()
5416 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
5417 << dendl;
5418 q->second.clear_mark_down();
5419 heartbeat_peers.erase(q);
5420 }
5421
// Called when the desired heartbeat peer set may have changed; flags
// the set for a rebuild by maybe_update_heartbeat_peers().  Ignored
// while shutting down.
void OSD::need_heartbeat_peer_update()
{
  if (is_stopping())
    return;
  dout(20) << "need_heartbeat_peer_update" << dendl;
  heartbeat_set_peers_need_update();
}
5429
// Rebuild the heartbeat peer set when flagged (or force a periodic
// resample).  Caller must hold osd_lock.
//
// Peers come from three sources: OSDs sharing a PG with us, our
// immediate up-set neighbors, and enough random up OSDs from distinct
// failure-domain subtrees to satisfy mon_osd_min_down_reporters.  Down
// peers are removed, the set is padded/trimmed toward
// osd_heartbeat_min_peers, and failure reports for peers we no longer
// track are cancelled.
void OSD::maybe_update_heartbeat_peers()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));

  if (is_waiting_for_healthy() || is_active()) {
    utime_t now = ceph_clock_now();
    if (last_heartbeat_resample == utime_t()) {
      last_heartbeat_resample = now;
      heartbeat_set_peers_need_update();
    } else if (!heartbeat_peers_need_update()) {
      utime_t dur = now - last_heartbeat_resample;
      if (dur > cct->_conf->osd_heartbeat_grace) {
        dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
        heartbeat_set_peers_need_update();
        last_heartbeat_resample = now;
        // automatically clean up any stale heartbeat peers
        // if we are unhealthy, then clean all
        reset_heartbeat_peers(is_waiting_for_healthy());
      }
    }
  }

  if (!heartbeat_peers_need_update())
    return;
  heartbeat_clear_peers_need_update();

  std::lock_guard l(heartbeat_lock);

  dout(10) << "maybe_update_heartbeat_peers updating" << dendl;


  // build heartbeat from set
  if (is_active()) {
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg : pgs) {
      pg->with_heartbeat_peers([&](int peer) {
        if (get_osdmap()->is_up(peer)) {
          _add_heartbeat_peer(peer);
        }
      });
    }
  }

  // include next and previous up osds to ensure we have a fully-connected set
  set<int> want, extras;
  const int next = get_osdmap()->get_next_up_osd_after(whoami);
  if (next >= 0)
    want.insert(next);
  int prev = get_osdmap()->get_previous_up_osd_before(whoami);
  if (prev >= 0 && prev != next)
    want.insert(prev);

  // make sure we have at least **min_down** osds coming from different
  // subtree level (e.g., hosts) for fast failure detection.
  auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
  auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
  auto limit = std::max(min_down, (uint64_t)cct->_conf->osd_heartbeat_min_peers);
  get_osdmap()->get_random_up_osds_by_subtree(
    whoami, subtree, limit, want, &want);

  for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
    dout(10) << " adding neighbor peer osd." << *p << dendl;
    extras.insert(*p);
    _add_heartbeat_peer(*p);
  }

  // remove down peers; enumerate extras
  map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
  while (p != heartbeat_peers.end()) {
    if (!get_osdmap()->is_up(p->first)) {
      int o = p->first;
      ++p;
      _remove_heartbeat_peer(o);
      continue;
    }
    if (p->second.epoch < get_osdmap_epoch()) {
      // epoch not refreshed this round: candidate for trimming below
      extras.insert(p->first);
    }
    ++p;
  }

  // too few?
  for (int n = next; n >= 0; ) {
    if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
      break;
    if (!extras.count(n) && !want.count(n) && n != whoami) {
      dout(10) << " adding random peer osd." << n << dendl;
      extras.insert(n);
      _add_heartbeat_peer(n);
    }
    n = get_osdmap()->get_next_up_osd_after(n);
    if (n == next)
      break;  // came full circle; stop
  }

  // too many?
  for (set<int>::iterator p = extras.begin();
       (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
       ++p) {
    if (want.count(*p))
      continue;
    _remove_heartbeat_peer(*p);
  }

  dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;

  // clean up stale failure pending
  for (auto it = failure_pending.begin(); it != failure_pending.end();) {
    if (heartbeat_peers.count(it->first) == 0) {
      send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
      failure_pending.erase(it++);
    } else {
      it++;
    }
  }
}
5547
// Drop heartbeat peers — every peer when 'all' is set, otherwise only
// those gone stale (no traffic within osd_heartbeat_stale seconds) —
// and clear any queued/pending failure reports for the removed peers.
// Caller must hold osd_lock.
void OSD::reset_heartbeat_peers(bool all)
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  dout(10) << "reset_heartbeat_peers" << dendl;
  utime_t stale = ceph_clock_now();
  stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
  std::lock_guard l(heartbeat_lock);
  for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) {
    auto& [peer, hi] = *it;
    if (all || hi.is_stale(stale)) {
      hi.clear_mark_down();
      // stop sending failure_report to mon too
      failure_queue.erase(peer);
      failure_pending.erase(peer);
      it = heartbeat_peers.erase(it);
    } else {
      ++it;
    }
  }
}
5568
// Handle an incoming message on the heartbeat messengers.  Consumes m.
//
// PING: record the sender's clock-delta stamps and answer with a
//   PING_REPLY (or YOU_DIED if our map says the sender is down/gone).
// PING_REPLY: update per-peer rx timestamps and ping-time statistics,
//   trim acknowledged entries from ping_history, and cancel any failure
//   report queued/sent for a peer that now looks healthy.
// YOU_DIED: a peer says we are marked down; subscribe to a newer map.
//
// Drops the message early on fsid mismatch, while stopping, or when no
// osdmap/session is available.
void OSD::handle_osd_ping(MOSDPing *m)
{
  if (superblock.cluster_fsid != m->fsid) {
    dout(20) << "handle_osd_ping from " << m->get_source_inst()
             << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid
             << dendl;
    m->put();
    return;
  }

  int from = m->get_source().num();

  heartbeat_lock.lock();
  if (is_stopping()) {
    heartbeat_lock.unlock();
    m->put();
    return;
  }

  utime_t now = ceph_clock_now();
  auto mnow = service.get_mnow();
  ConnectionRef con(m->get_connection());
  OSDMapRef curmap = service.get_osdmap();
  if (!curmap) {
    heartbeat_lock.unlock();
    m->put();
    return;
  }

  auto sref = con->get_priv();
  Session *s = static_cast<Session*>(sref.get());
  if (!s) {
    heartbeat_lock.unlock();
    m->put();
    return;
  }
  if (!s->stamps) {
    // first traffic on this inbound connection; hook up stamp tracking
    s->peer = from;
    s->stamps = service.get_hb_stamps(from);
  }

  switch (m->op) {

  case MOSDPing::PING:
    {
      // debug hook: optionally drop a run of pings from this peer
      if (cct->_conf->osd_debug_drop_ping_probability > 0) {
        auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
        if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
          if (heartbeat_drop->second == 0) {
            debug_heartbeat_drops_remaining.erase(heartbeat_drop);
          } else {
            --heartbeat_drop->second;
            dout(5) << "Dropping heartbeat from " << from
                    << ", " << heartbeat_drop->second
                    << " remaining to drop" << dendl;
            break;
          }
        } else if (cct->_conf->osd_debug_drop_ping_probability >
                   ((((double)(rand()%100))/100.0))) {
          heartbeat_drop =
            debug_heartbeat_drops_remaining.insert(std::make_pair(from,
                             cct->_conf->osd_debug_drop_ping_duration)).first;
          dout(5) << "Dropping heartbeat from " << from
                  << ", " << heartbeat_drop->second
                  << " remaining to drop" << dendl;
          break;
        }
      }

      ceph::signedspan sender_delta_ub{};
      s->stamps->got_ping(
        m->up_from,
        mnow,
        m->mono_send_stamp,
        m->delta_ub,
        &sender_delta_ub);
      dout(20) << __func__ << " new stamps " << *s->stamps << dendl;

      if (!cct->get_heartbeat_map()->is_healthy()) {
        // don't vouch for ourselves while our own internal heartbeats fail
        dout(10) << "internal heartbeat not healthy, dropping ping request"
                 << dendl;
        break;
      }

      Message *r = new MOSDPing(monc->get_fsid(),
                                curmap->get_epoch(),
                                MOSDPing::PING_REPLY,
                                m->ping_stamp,
                                m->mono_ping_stamp,
                                mnow,
                                service.get_up_epoch(),
                                cct->_conf->osd_heartbeat_min_size,
                                sender_delta_ub);
      con->send_message(r);

      if (curmap->is_up(from)) {
        if (is_active()) {
          ConnectionRef cluster_con = service.get_con_osd_cluster(
            from, curmap->get_epoch());
          if (cluster_con) {
            service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
          }
        }
      } else if (!curmap->exists(from) ||
                 curmap->get_down_at(from) > m->map_epoch) {
        // tell them they have died
        Message *r = new MOSDPing(monc->get_fsid(),
                                  curmap->get_epoch(),
                                  MOSDPing::YOU_DIED,
                                  m->ping_stamp,
                                  m->mono_ping_stamp,
                                  mnow,
                                  service.get_up_epoch(),
                                  cct->_conf->osd_heartbeat_min_size);
        con->send_message(r);
      }
    }
    break;

  case MOSDPing::PING_REPLY:
    {
      map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
      if (i != heartbeat_peers.end()) {
        auto acked = i->second.ping_history.find(m->ping_stamp);
        if (acked != i->second.ping_history.end()) {
          // replies (front+back) still outstanding for this ping
          int &unacknowledged = acked->second.second;
          if (con == i->second.con_back) {
            dout(25) << "handle_osd_ping got reply from osd." << from
                     << " first_tx " << i->second.first_tx
                     << " last_tx " << i->second.last_tx
                     << " last_rx_back " << i->second.last_rx_back
                     << " -> " << now
                     << " last_rx_front " << i->second.last_rx_front
                     << dendl;
            i->second.last_rx_back = now;
            ceph_assert(unacknowledged > 0);
            --unacknowledged;
            // if there is no front con, set both stamps.
            if (i->second.con_front == NULL) {
              i->second.last_rx_front = now;
              ceph_assert(unacknowledged > 0);
              --unacknowledged;
            }
          } else if (con == i->second.con_front) {
            dout(25) << "handle_osd_ping got reply from osd." << from
                     << " first_tx " << i->second.first_tx
                     << " last_tx " << i->second.last_tx
                     << " last_rx_back " << i->second.last_rx_back
                     << " last_rx_front " << i->second.last_rx_front
                     << " -> " << now
                     << dendl;
            i->second.last_rx_front = now;
            ceph_assert(unacknowledged > 0);
            --unacknowledged;
          }

          if (unacknowledged == 0) {
            // succeeded in getting all replies
            dout(25) << "handle_osd_ping got all replies from osd." << from
                     << " , erase pending ping(sent at " << m->ping_stamp << ")"
                     << " and older pending ping(s)"
                     << dendl;

            // accumulate min/max/total ping times for the current
            // averaging interval
#define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
            ++i->second.hb_average_count;
            uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->ping_stamp);
            i->second.hb_total_back += back_pingtime;
            if (back_pingtime < i->second.hb_min_back)
              i->second.hb_min_back = back_pingtime;
            if (back_pingtime > i->second.hb_max_back)
              i->second.hb_max_back = back_pingtime;
            uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->ping_stamp);
            i->second.hb_total_front += front_pingtime;
            if (front_pingtime < i->second.hb_min_front)
              i->second.hb_min_front = front_pingtime;
            if (front_pingtime > i->second.hb_max_front)
              i->second.hb_max_front = front_pingtime;

            // NOTE(review): the 'if' below is dead whenever asserts are
            // compiled in — the assert guarantees its condition is false.
            ceph_assert(i->second.hb_interval_start != utime_t());
            if (i->second.hb_interval_start == utime_t())
              i->second.hb_interval_start = now;
            int64_t hb_avg_time_period = 60;
            if (cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span")) {
              hb_avg_time_period = cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span");
            }
            if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) {
              uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count;
              uint32_t back_min = i->second.hb_min_back;
              uint32_t back_max = i->second.hb_max_back;
              uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count;
              uint32_t front_min = i->second.hb_min_front;
              uint32_t front_max = i->second.hb_max_front;

              // Reset for new interval
              i->second.hb_average_count = 0;
              i->second.hb_interval_start = now;
              i->second.hb_total_back = i->second.hb_max_back = 0;
              i->second.hb_min_back = UINT_MAX;
              i->second.hb_total_front = i->second.hb_max_front = 0;
              i->second.hb_min_front = UINT_MAX;

              // Record per osd interface ping times
              // Based on osd_heartbeat_interval ignoring that it is
              // randomly shorter than this interval
              if (i->second.hb_back_pingtime.size() == 0) {
                // first completed interval: seed the ring buffers
                ceph_assert(i->second.hb_front_pingtime.size() == 0);
                for (unsigned k = 0 ; k < hb_vector_size; ++k) {
                  i->second.hb_back_pingtime.push_back(back_avg);
                  i->second.hb_back_min.push_back(back_min);
                  i->second.hb_back_max.push_back(back_max);
                  i->second.hb_front_pingtime.push_back(front_avg);
                  i->second.hb_front_min.push_back(front_min);
                  i->second.hb_front_max.push_back(front_max);
                  ++i->second.hb_index;
                }
              } else {
                int index = i->second.hb_index & (hb_vector_size - 1);
                i->second.hb_back_pingtime[index] = back_avg;
                i->second.hb_back_min[index] = back_min;
                i->second.hb_back_max[index] = back_max;
                i->second.hb_front_pingtime[index] = front_avg;
                i->second.hb_front_min[index] = front_min;
                i->second.hb_front_max[index] = front_max;
                ++i->second.hb_index;
              }

              {
                // publish 1/5/15-interval aggregates into osd_stat
                std::lock_guard l(service.stat_lock);
                service.osd_stat.hb_pingtime[from].last_update = now.sec();
                service.osd_stat.hb_pingtime[from].back_last = back_pingtime;

                uint32_t total = 0;
                uint32_t min = UINT_MAX;
                uint32_t max = 0;
                uint32_t count = 0;
                uint32_t which = 0;
                uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
                for (int32_t k = size - 1 ; k >= 0; --k) {
                  ++count;
                  int index = (i->second.hb_index + k) % size;
                  total += i->second.hb_back_pingtime[index];
                  if (i->second.hb_back_min[index] < min)
                    min = i->second.hb_back_min[index];
                  if (i->second.hb_back_max[index] > max)
                    max = i->second.hb_back_max[index];
                  if (count == 1 || count == 5 || count == 15) {
                    service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count;
                    service.osd_stat.hb_pingtime[from].back_min[which] = min;
                    service.osd_stat.hb_pingtime[from].back_max[which] = max;
                    which++;
                    if (count == 15)
                      break;
                  }
                }

                if (i->second.con_front != NULL) {
                  service.osd_stat.hb_pingtime[from].front_last = front_pingtime;

                  total = 0;
                  min = UINT_MAX;
                  max = 0;
                  count = 0;
                  which = 0;
                  for (int32_t k = size - 1 ; k >= 0; --k) {
                    ++count;
                    int index = (i->second.hb_index + k) % size;
                    total += i->second.hb_front_pingtime[index];
                    if (i->second.hb_front_min[index] < min)
                      min = i->second.hb_front_min[index];
                    if (i->second.hb_front_max[index] > max)
                      max = i->second.hb_front_max[index];
                    if (count == 1 || count == 5 || count == 15) {
                      service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count;
                      service.osd_stat.hb_pingtime[from].front_min[which] = min;
                      service.osd_stat.hb_pingtime[from].front_max[which] = max;
                      which++;
                      if (count == 15)
                        break;
                    }
                  }
                }
              }
            } else {
              // interval not elapsed: just publish the latest samples
              std::lock_guard l(service.stat_lock);
              service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
              if (i->second.con_front != NULL)
                service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
            }
            // this ping and every older one is now fully acknowledged
            i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
          }

          if (i->second.is_healthy(now)) {
            // Cancel false reports
            auto failure_queue_entry = failure_queue.find(from);
            if (failure_queue_entry != failure_queue.end()) {
              dout(10) << "handle_osd_ping canceling queued "
                       << "failure report for osd." << from << dendl;
              failure_queue.erase(failure_queue_entry);
            }

            auto failure_pending_entry = failure_pending.find(from);
            if (failure_pending_entry != failure_pending.end()) {
              dout(10) << "handle_osd_ping canceling in-flight "
                       << "failure report for osd." << from << dendl;
              send_still_alive(curmap->get_epoch(),
                               from,
                               failure_pending_entry->second.second);
              failure_pending.erase(failure_pending_entry);
            }
          }
        } else {
          // old replies, deprecated by newly sent pings.
          dout(10) << "handle_osd_ping no pending ping(sent at " << m->ping_stamp
                   << ") is found, treat as covered by newly sent pings "
                   << "and ignore"
                   << dendl;
        }
      }

      if (m->map_epoch &&
          curmap->is_up(from)) {
        if (is_active()) {
          ConnectionRef cluster_con = service.get_con_osd_cluster(
            from, curmap->get_epoch());
          if (cluster_con) {
            service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
          }
        }
      }

      s->stamps->got_ping_reply(
        mnow,
        m->mono_send_stamp,
        m->delta_ub);
      dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
    }
    break;

  case MOSDPing::YOU_DIED:
    dout(10) << "handle_osd_ping " << m->get_source_inst()
             << " says i am down in " << m->map_epoch << dendl;
    osdmap_subscribe(curmap->get_epoch()+1, false);
    break;
  }

  heartbeat_lock.unlock();
  m->put();
}
5916
// Body of the heartbeat thread: send a round of pings, then sleep for
// roughly osd_heartbeat_interval — randomized so OSDs do not synchronize,
// unless debug_disable_randomized_ping is set — until asked to stop.
void OSD::heartbeat_entry()
{
  std::unique_lock l(heartbeat_lock);
  if (is_stopping())
    return;
  while (!heartbeat_stop) {
    heartbeat();

    double wait;
    if (cct->_conf.get_val<bool>("debug_disable_randomized_ping")) {
      wait = (float)cct->_conf->osd_heartbeat_interval;
    } else {
      wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
    }
    auto w = ceph::make_timespan(wait);
    dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
    // releases heartbeat_lock while sleeping; wakes early on notify
    heartbeat_cond.wait_for(l, w);
    if (is_stopping())
      return;
    dout(30) << "heartbeat_entry woke up" << dendl;
  }
}
5939
// Scan heartbeat peers for overdue ping replies and queue a failure
// report for each unhealthy peer.  Caller must hold heartbeat_lock.
//
// The failure time recorded in failure_queue is the first ping time if
// we never heard from the peer at all, otherwise the older of its last
// back/front reply times.
void OSD::heartbeat_check()
{
  ceph_assert(ceph_mutex_is_locked(heartbeat_lock));
  utime_t now = ceph_clock_now();

  // check for incoming heartbeats (move me elsewhere?)
  for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
       p != heartbeat_peers.end();
       ++p) {

    if (p->second.first_tx == utime_t()) {
      dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
               << " yet, skipping" << dendl;
      continue;
    }

    dout(25) << "heartbeat_check osd." << p->first
             << " first_tx " << p->second.first_tx
             << " last_tx " << p->second.last_tx
             << " last_rx_back " << p->second.last_rx_back
             << " last_rx_front " << p->second.last_rx_front
             << dendl;
    if (p->second.is_unhealthy(now)) {
      // NOTE(review): the log lines below dereference con_front without
      // a null check; _add_heartbeat_peer() asserts a front con when
      // creating entries, so this looks safe — confirm no path leaves
      // con_front null while the peer remains tracked.
      utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
      if (p->second.last_rx_back == utime_t() ||
          p->second.last_rx_front == utime_t()) {
        derr << "heartbeat_check: no reply from "
             << p->second.con_front->get_peer_addr().get_sockaddr()
             << " osd." << p->first
             << " ever on either front or back, first ping sent "
             << p->second.first_tx
             << " (oldest deadline " << oldest_deadline << ")"
             << dendl;
        // fail
        failure_queue[p->first] = p->second.first_tx;
      } else {
        derr << "heartbeat_check: no reply from "
             << p->second.con_front->get_peer_addr().get_sockaddr()
             << " osd." << p->first << " since back " << p->second.last_rx_back
             << " front " << p->second.last_rx_front
             << " (oldest deadline " << oldest_deadline << ")"
             << dendl;
        // fail
        failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
      }
    }
  }
}
5988
// Send one round of pings to all heartbeat peers.  Caller must hold
// heartbeat_lock (normally invoked from heartbeat_entry()).
//
// Also refreshes the published osd_stat (peer list, fullness ratios),
// records each ping in ping_history with its ack deadline, and — when
// we have no peers at all — periodically asks the monitor for a newer
// map so an isolated OSD does not get stuck.
void OSD::heartbeat()
{
  ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock));
  dout(30) << "heartbeat" << dendl;

  auto load_for_logger = service.get_scrub_services().update_load_average();
  if (load_for_logger) {
    logger->set(l_osd_loadavg, load_for_logger.value());
  }
  dout(30) << "heartbeat checking stats" << dendl;

  // refresh peer list and osd stats
  vector<int> hb_peers;
  for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
       p != heartbeat_peers.end();
       ++p)
    hb_peers.push_back(p->first);

  auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
  dout(5) << __func__ << " " << new_stat << dendl;
  ceph_assert(new_stat.statfs.total);

  float pratio;
  float ratio = service.compute_adjusted_ratio(new_stat, &pratio);

  service.check_full_status(ratio, pratio);

  utime_t now = ceph_clock_now();
  auto mnow = service.get_mnow();
  utime_t deadline = now;
  deadline += cct->_conf->osd_heartbeat_grace;

  // send heartbeats
  for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
       i != heartbeat_peers.end();
       ++i) {
    int peer = i->first;
    Session *s = static_cast<Session*>(i->second.con_back->get_priv().get());
    if (!s) {
      dout(30) << "heartbeat osd." << peer << " has no open con" << dendl;
      continue;
    }
    dout(30) << "heartbeat sending ping to osd." << peer << dendl;

    i->second.last_tx = now;
    if (i->second.first_tx == utime_t())
      i->second.first_tx = now;
    // expect HEARTBEAT_MAX_CONN acks (front + back) before the deadline
    i->second.ping_history[now] = make_pair(deadline,
      HeartbeatInfo::HEARTBEAT_MAX_CONN);
    if (i->second.hb_interval_start == utime_t())
      i->second.hb_interval_start = now;

    std::optional<ceph::signedspan> delta_ub;
    s->stamps->sent_ping(&delta_ub);

    i->second.con_back->send_message(
      new MOSDPing(monc->get_fsid(),
                   service.get_osdmap_epoch(),
                   MOSDPing::PING,
                   now,
                   mnow,
                   mnow,
                   service.get_up_epoch(),
                   cct->_conf->osd_heartbeat_min_size,
                   delta_ub));

    if (i->second.con_front)
      i->second.con_front->send_message(
        new MOSDPing(monc->get_fsid(),
                     service.get_osdmap_epoch(),
                     MOSDPing::PING,
                     now,
                     mnow,
                     mnow,
                     service.get_up_epoch(),
                     cct->_conf->osd_heartbeat_min_size,
                     delta_ub));
  }

  logger->set(l_osd_hb_to, heartbeat_peers.size());

  // hmm.. am i all alone?
  dout(30) << "heartbeat lonely?" << dendl;
  if (heartbeat_peers.empty()) {
    if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
      last_mon_heartbeat = now;
      dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
      osdmap_subscribe(get_osdmap_epoch() + 1, false);
    }
  }

  dout(30) << "heartbeat done" << dendl;
}
6082
// Messenger callback: a heartbeat connection was reset.  If the
// connection still belongs to a known heartbeat peer, reopen fresh
// front/back connections for that peer (or drop the peer entirely if
// it has raced out of the osdmap); otherwise just discard the stale
// session reference.  Always returns true (event handled).
bool OSD::heartbeat_reset(Connection *con)
{
  std::lock_guard l(heartbeat_lock);
  auto s = con->get_priv();
  dout(20) << __func__ << " con " << con << " s " << s.get() << dendl;
  con->set_priv(nullptr);
  if (s) {
    if (is_stopping()) {
      // shutting down; no point reopening anything
      return true;
    }
    auto session = static_cast<Session*>(s.get());
    auto p = heartbeat_peers.find(session->peer);
    if (p != heartbeat_peers.end() &&
        (p->second.con_back == con ||
         p->second.con_front == con)) {
      dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
               << ", reopening" << dendl;
      p->second.clear_mark_down(con);
      // try to establish a new (back, front) connection pair
      pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
      if (newcon.first) {
        p->second.con_back = newcon.first.get();
        p->second.con_back->set_priv(s);
        if (newcon.second) {
          p->second.con_front = newcon.second.get();
          p->second.con_front->set_priv(s);
        }
        // pings outstanding on the old connections can never complete
        p->second.ping_history.clear();
      } else {
        // peer no longer reachable per the current osdmap; forget it
        dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
                 << ", raced with osdmap update, closing out peer" << dendl;
        heartbeat_peers.erase(p);
      }
    } else {
      dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
    }
  }
  return true;
}
6121
6122
6123
6124 // =========================================
6125
// Main periodic tick, run with osd_lock held: trims the markdown log,
// refreshes heartbeat peers, retries boot while waiting to become
// healthy, polls the mon for new maps when booting, and kicks off a
// purged_snaps scrub when one is due.  Re-arms itself via tick_timer.
void OSD::tick()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  dout(10) << "tick" << dendl;

  utime_t now = ceph_clock_now();
  // throw out any obsolete markdown log
  utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
  while (!osd_markdown_log.empty() &&
         osd_markdown_log.front() + grace < now)
    osd_markdown_log.pop_front();

  if (is_active() || is_waiting_for_healthy()) {
    maybe_update_heartbeat_peers();
  }

  if (is_waiting_for_healthy()) {
    // re-evaluate health and retry the boot sequence
    start_boot();
  }

  if (is_waiting_for_healthy() || is_booting()) {
    std::lock_guard l(heartbeat_lock);
    if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
      last_mon_heartbeat = now;
      dout(1) << __func__ << " checking mon for new map" << dendl;
      osdmap_subscribe(get_osdmap_epoch() + 1, false);
    }
  }

  // scrub purged_snaps every deep scrub interval
  {
    const utime_t last = superblock.last_purged_snaps_scrub;
    utime_t next = last;
    next += cct->_conf->osd_scrub_min_interval;
    std::mt19937 rng;
    // use a seed that is stable for each scrub interval, but varies
    // by OSD to avoid any herds.
    rng.seed(whoami + superblock.last_purged_snaps_scrub.sec());
    // random jitter in [0, 1) scaled by the randomize ratio
    double r = (rng() % 1024) / 1024.0;
    next +=
      cct->_conf->osd_scrub_min_interval *
      cct->_conf->osd_scrub_interval_randomize_ratio * r;
    if (next < ceph_clock_now()) {
      dout(20) << __func__ << " last_purged_snaps_scrub " << last
               << " next " << next << " ... now" << dendl;
      scrub_purged_snaps();
    } else {
      dout(20) << __func__ << " last_purged_snaps_scrub " << last
               << " next " << next << dendl;
    }
  }

  // re-arm for the next tick
  tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
}
6180
// Periodic tick that deliberately avoids osd_lock: updates CRC-cache
// perf counters, refreshes store statfs, runs heartbeat checks and
// mon reports, requests newer maps when shards are waiting on them,
// schedules scrubs, and sends the OSD beacon when due.  Re-arms
// itself via tick_timer_without_osd_lock.
void OSD::tick_without_osd_lock()
{
  ceph_assert(ceph_mutex_is_locked(tick_timer_lock));
  dout(10) << "tick_without_osd_lock" << dendl;

  logger->set(l_osd_cached_crc, ceph::buffer::get_cached_crc());
  logger->set(l_osd_cached_crc_adjusted, ceph::buffer::get_cached_crc_adjusted());
  logger->set(l_osd_missed_crc, ceph::buffer::get_missed_crc());

  // refresh osd stats
  struct store_statfs_t stbuf;
  osd_alert_list_t alerts;
  int r = store->statfs(&stbuf, &alerts);
  ceph_assert(r == 0);
  service.set_statfs(stbuf, alerts);

  // osd_lock is not being held, which means the OSD state
  // might change when doing the monitor report
  if (is_active() || is_waiting_for_healthy()) {
    {
      std::lock_guard l{heartbeat_lock};
      heartbeat_check();
    }
    map_lock.lock_shared();
    std::lock_guard l(mon_report_lock);

    // mon report?
    utime_t now = ceph_clock_now();
    if (service.need_fullness_update() ||
        now - last_mon_report > cct->_conf->osd_mon_report_interval) {
      last_mon_report = now;
      send_full_update();
      send_failures();
    }
    map_lock.unlock_shared();

    // if any shard is queued waiting for a map newer than what we
    // have, ask the mon for it
    epoch_t max_waiting_epoch = 0;
    for (auto s : shards) {
      max_waiting_epoch = std::max(max_waiting_epoch,
                                   s->get_max_waiting_epoch());
    }
    if (max_waiting_epoch > get_osdmap()->get_epoch()) {
      dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
               << ", requesting new map" << dendl;
      osdmap_subscribe(superblock.newest_map + 1, false);
    }
  }

  if (is_active()) {
    if (!scrub_random_backoff()) {
      sched_scrub();
    }
    service.promote_throttle_recalibrate();
    resume_creating_pg();
    bool need_send_beacon = false;
    const auto now = ceph::coarse_mono_clock::now();
    {
      // borrow lec lock to protect last_sent_beacon from changing
      std::lock_guard l{min_last_epoch_clean_lock};
      const auto elapsed = now - last_sent_beacon;
      if (std::chrono::duration_cast<std::chrono::seconds>(elapsed).count() >
        cct->_conf->osd_beacon_report_interval) {
        need_send_beacon = true;
      }
    }
    if (need_send_beacon) {
      send_beacon(now);
    }
  }

  mgrc.update_daemon_health(get_health_metrics());
  service.kick_recovery_queue();
  // re-arm for the next tick
  tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
                                              new C_Tick_WithoutOSDLock(this));
}
6256
6257 // Usage:
6258 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
6259 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
6260 // setomapheader <pool-id> [namespace/]<obj-name> <header>
6261 // getomap <pool> [namespace/]<obj-name>
6262 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
6263 // injectmdataerr [namespace/]<obj-name> [shardid]
6264 // injectdataerr [namespace/]<obj-name> [shardid]
6265 //
6266 // set_recovery_delay [utime]
6267 void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
6268 std::string_view command,
6269 const cmdmap_t& cmdmap, ostream &ss)
6270 {
6271 //Test support
6272 //Support changing the omap on a single osd by using the Admin Socket to
6273 //directly request the osd make a change.
6274 if (command == "setomapval" || command == "rmomapkey" ||
6275 command == "setomapheader" || command == "getomap" ||
6276 command == "truncobj" || command == "injectmdataerr" ||
6277 command == "injectdataerr"
6278 ) {
6279 pg_t rawpg;
6280 int64_t pool;
6281 OSDMapRef curmap = service->get_osdmap();
6282 int r = -1;
6283
6284 string poolstr;
6285
6286 cmd_getval(cmdmap, "pool", poolstr);
6287 pool = curmap->lookup_pg_pool_name(poolstr);
6288 //If we can't find it by name then maybe id specified
6289 if (pool < 0 && isdigit(poolstr[0]))
6290 pool = atoll(poolstr.c_str());
6291 if (pool < 0) {
6292 ss << "Invalid pool '" << poolstr << "''";
6293 return;
6294 }
6295
6296 string objname, nspace;
6297 cmd_getval(cmdmap, "objname", objname);
6298 std::size_t found = objname.find_first_of('/');
6299 if (found != string::npos) {
6300 nspace = objname.substr(0, found);
6301 objname = objname.substr(found+1);
6302 }
6303 object_locator_t oloc(pool, nspace);
6304 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
6305
6306 if (r < 0) {
6307 ss << "Invalid namespace/objname";
6308 return;
6309 }
6310
6311 int64_t shardid = cmd_getval_or<int64_t>(cmdmap, "shardid", shard_id_t::NO_SHARD);
6312 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
6313 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
6314 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
6315 if (curmap->pg_is_ec(rawpg)) {
6316 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
6317 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
6318 return;
6319 }
6320 }
6321
6322 ObjectStore::Transaction t;
6323
6324 if (command == "setomapval") {
6325 map<string, bufferlist> newattrs;
6326 bufferlist val;
6327 string key, valstr;
6328 cmd_getval(cmdmap, "key", key);
6329 cmd_getval(cmdmap, "val", valstr);
6330
6331 val.append(valstr);
6332 newattrs[key] = val;
6333 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
6334 r = store->queue_transaction(service->meta_ch, std::move(t));
6335 if (r < 0)
6336 ss << "error=" << r;
6337 else
6338 ss << "ok";
6339 } else if (command == "rmomapkey") {
6340 string key;
6341 cmd_getval(cmdmap, "key", key);
6342
6343 t.omap_rmkey(coll_t(pgid), ghobject_t(obj), key);
6344 r = store->queue_transaction(service->meta_ch, std::move(t));
6345 if (r < 0)
6346 ss << "error=" << r;
6347 else
6348 ss << "ok";
6349 } else if (command == "setomapheader") {
6350 bufferlist newheader;
6351 string headerstr;
6352
6353 cmd_getval(cmdmap, "header", headerstr);
6354 newheader.append(headerstr);
6355 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
6356 r = store->queue_transaction(service->meta_ch, std::move(t));
6357 if (r < 0)
6358 ss << "error=" << r;
6359 else
6360 ss << "ok";
6361 } else if (command == "getomap") {
6362 //Debug: Output entire omap
6363 bufferlist hdrbl;
6364 map<string, bufferlist> keyvals;
6365 auto ch = store->open_collection(coll_t(pgid));
6366 if (!ch) {
6367 ss << "unable to open collection for " << pgid;
6368 r = -ENOENT;
6369 } else {
6370 r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
6371 if (r >= 0) {
6372 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
6373 for (map<string, bufferlist>::iterator it = keyvals.begin();
6374 it != keyvals.end(); ++it)
6375 ss << " key=" << (*it).first << " val="
6376 << string((*it).second.c_str(), (*it).second.length());
6377 } else {
6378 ss << "error=" << r;
6379 }
6380 }
6381 } else if (command == "truncobj") {
6382 int64_t trunclen;
6383 cmd_getval(cmdmap, "len", trunclen);
6384 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
6385 r = store->queue_transaction(service->meta_ch, std::move(t));
6386 if (r < 0)
6387 ss << "error=" << r;
6388 else
6389 ss << "ok";
6390 } else if (command == "injectdataerr") {
6391 store->inject_data_error(gobj);
6392 ss << "ok";
6393 } else if (command == "injectmdataerr") {
6394 store->inject_mdata_error(gobj);
6395 ss << "ok";
6396 }
6397 return;
6398 }
6399 if (command == "set_recovery_delay") {
6400 int64_t delay = cmd_getval_or<int64_t>(cmdmap, "utime", 0);
6401 ostringstream oss;
6402 oss << delay;
6403 int r = service->cct->_conf.set_val("osd_recovery_delay_start",
6404 oss.str().c_str());
6405 if (r != 0) {
6406 ss << "set_recovery_delay: error setting "
6407 << "osd_recovery_delay_start to '" << delay << "': error "
6408 << r;
6409 return;
6410 }
6411 service->cct->_conf.apply_changes(nullptr);
6412 ss << "set_recovery_delay: set osd_recovery_delay_start "
6413 << "to " << service->cct->_conf->osd_recovery_delay_start;
6414 return;
6415 }
6416 if (command == "injectfull") {
6417 int64_t count = cmd_getval_or<int64_t>(cmdmap, "count", -1);
6418 string type = cmd_getval_or<string>(cmdmap, "type", "full");
6419 OSDService::s_names state;
6420
6421 if (type == "none" || count == 0) {
6422 type = "none";
6423 count = 0;
6424 }
6425 state = service->get_full_state(type);
6426 if (state == OSDService::s_names::INVALID) {
6427 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6428 return;
6429 }
6430 service->set_injectfull(state, count);
6431 return;
6432 }
6433 ss << "Internal error - command=" << command;
6434 }
6435
6436 // =========================================
6437
// Messenger callback: a connection was (re)established.  Only the mon
// connection matters here: depending on boot state we either continue
// the boot sequence or resend all mon-bound state, since a new mon
// session starts with a clean slate on the mon side.
void OSD::ms_handle_connect(Connection *con)
{
  dout(10) << __func__ << " con " << con << dendl;
  if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
    std::lock_guard l(osd_lock);
    if (is_stopping())
      return;
    dout(10) << __func__ << " on mon" << dendl;

    if (is_preboot()) {
      start_boot();
    } else if (is_booting()) {
      _send_boot(); // resend boot message
    } else {
      // active (or later) state: replay everything the mon may have
      // lost with the old session
      map_lock.lock_shared();
      std::lock_guard l2(mon_report_lock);

      utime_t now = ceph_clock_now();
      last_mon_report = now;

      // resend everything, it's a new session
      send_full_update();
      send_alive();
      service.requeue_pg_temp();
      service.clear_sent_ready_to_merge();
      service.send_pg_temp();
      service.send_ready_to_merge();
      service.send_pg_created();
      requeue_failures();
      send_failures();

      map_lock.unlock_shared();
      if (is_active()) {
        send_beacon(ceph::coarse_mono_clock::now());
      }
    }

    // full map requests may happen while active or pre-boot
    if (requested_full_first) {
      rerequest_full_maps();
    }
  }
}
6481
// Fast-dispatch callback for outgoing connections: attach a fresh
// Session to a new OSD-to-OSD connection if it does not have one.
// Mon/mgr connections are skipped; we never initiate connections to
// clients (asserted below).
void OSD::ms_handle_fast_connect(Connection *con)
{
  if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
      con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
    if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
      s = ceph::make_ref<Session>(cct, con);
      con->set_priv(s);
      dout(10) << " new session (outgoing) " << s << " con=" << s->con
               << " addr=" << s->con->get_peer_addr() << dendl;
      // we don't connect to clients
      ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
      s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
    }
  }
}
6497
// Fast-dispatch callback for incoming connections: attach a fresh
// Session if the connection does not already have one (it normally
// will, unless we raced with our own outgoing connect to the same
// peer).  Mon/mgr connections are skipped.
void OSD::ms_handle_fast_accept(Connection *con)
{
  if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
      con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
    if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
      s = ceph::make_ref<Session>(cct, con);
      con->set_priv(s);
      dout(10) << "new session (incoming)" << s << " con=" << con
               << " addr=" << con->get_peer_addr()
               << " must have raced with connect" << dendl;
      ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
      s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
    }
  }
}
6513
// Messenger callback: a connection was reset by the peer.  Tear down
// the associated Session (watch state, backoffs) if one exists.
// Returns true iff a session was found and cleaned up.
bool OSD::ms_handle_reset(Connection *con)
{
  auto session = ceph::ref_cast<Session>(con->get_priv());
  dout(2) << "ms_handle_reset con " << con << " session " << session.get() << dendl;
  if (!session)
    return false;
  session->wstate.reset(con);
  session->con->set_priv(nullptr);
  session->con.reset(); // break con <-> session ref cycle
  // note that we break session->con *before* the session_handle_reset
  // cleanup below. this avoids a race between us and
  // PG::add_backoff, Session::check_backoff, etc.
  session_handle_reset(session);
  return true;
}
6529
// Messenger callback: connection attempt was actively refused
// (ECONNREFUSED).  When osd_fast_fail_on_connection_refused is set,
// immediately report the refusing peer OSD as failed to the mon
// instead of waiting out the heartbeat grace period.
bool OSD::ms_handle_refused(Connection *con)
{
  if (!cct->_conf->osd_fast_fail_on_connection_refused)
    return false;

  auto session = ceph::ref_cast<Session>(con->get_priv());
  dout(2) << "ms_handle_refused con " << con << " session " << session.get() << dendl;
  if (!session)
    return false;
  int type = con->get_peer_type();
  // handle only OSD failures here
  if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
    OSDMapRef osdmap = get_osdmap();
    if (osdmap) {
      // match the refused address against every channel of every OSD
      int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
      if (id >= 0 && osdmap->is_up(id)) {
        // I'm cheating mon heartbeat grace logic, because we know it's not going
        // to respawn alone. +1 so we won't hit any boundary case.
        monc->send_mon_message(
          new MOSDFailure(
            monc->get_fsid(),
            id,
            osdmap->get_addrs(id),
            cct->_conf->osd_heartbeat_grace + 1,
            osdmap->get_epoch(),
            MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
            ));
      }
    }
  }
  return true;
}
6562
6563 struct CB_OSD_GetVersion {
6564 OSD *osd;
6565 explicit CB_OSD_GetVersion(OSD *o) : osd(o) {}
6566 void operator ()(boost::system::error_code ec, version_t newest,
6567 version_t oldest) {
6568 if (!ec)
6569 osd->_got_mon_epochs(oldest, newest);
6570 }
6571 };
6572
// Begin (or retry) the boot sequence.  If we do not currently look
// healthy, stay down and wait; otherwise enter PREBOOT and ask the
// mon for its osdmap epoch range (reply arrives asynchronously via
// CB_OSD_GetVersion -> _got_mon_epochs -> _preboot).
void OSD::start_boot()
{
  if (!_is_healthy()) {
    // if we are not healthy, do not mark ourselves up (yet)
    dout(1) << "not healthy; waiting to boot" << dendl;
    if (!is_waiting_for_healthy())
      start_waiting_for_healthy();
    // send pings sooner rather than later
    heartbeat_kick();
    return;
  }
  dout(1) << __func__ << dendl;
  set_state(STATE_PREBOOT);
  dout(10) << "start_boot - have maps " << superblock.oldest_map
           << ".." << superblock.newest_map << dendl;
  monc->get_version("osdmap", CB_OSD_GetVersion(this));
}
6590
6591 void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
6592 {
6593 std::lock_guard l(osd_lock);
6594 if (is_preboot()) {
6595 _preboot(oldest, newest);
6596 }
6597 }
6598
// Pre-boot gating: given the mon's oldest/newest osdmap epochs,
// decide whether we may send MOSDBoot yet.  Each branch below is a
// reason to keep waiting (or, for a destroyed OSD, to exit); if none
// applies and our map is recent enough, _send_boot() is queued on the
// boot finisher.  Otherwise we subscribe for newer maps and retry on
// a later tick.
void OSD::_preboot(epoch_t oldest, epoch_t newest)
{
  ceph_assert(is_preboot());
  dout(10) << __func__ << " _preboot mon has osdmaps "
           << oldest << ".." << newest << dendl;

  // ensure our local fullness awareness is accurate
  {
    std::lock_guard l(heartbeat_lock);
    heartbeat();
  }

  const auto& monmap = monc->monmap;
  const auto osdmap = get_osdmap();
  // if our map within recent history, try to add ourselves to the osdmap.
  if (osdmap->get_epoch() == 0) {
    derr << "waiting for initial osdmap" << dendl;
  } else if (osdmap->is_destroyed(whoami)) {
    derr << "osdmap says I am destroyed" << dendl;
    // provide a small margin so we don't livelock seeing if we
    // un-destroyed ourselves.
    if (osdmap->get_epoch() > newest - 1) {
      exit(0);
    }
  } else if (osdmap->is_noup(whoami)) {
    derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
  } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
    derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
         << dendl;
  } else if (service.need_fullness_update()) {
    derr << "osdmap fullness state needs update" << dendl;
    send_full_update();
  } else if (monmap.min_mon_release >= ceph_release_t::octopus &&
             superblock.purged_snaps_last < superblock.current_epoch) {
    // catch up on purged-snap records before booting (octopus+ mons)
    dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
             << " < newest_map " << superblock.current_epoch << dendl;
    _get_purged_snaps();
  } else if (osdmap->get_epoch() >= oldest - 1 &&
             osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {

    // wait for pgs to fully catch up in a different thread, since
    // this thread might be required for splitting and merging PGs to
    // make progress.
    boot_finisher.queue(
      new LambdaContext(
        [this](int r) {
          std::unique_lock l(osd_lock);
          if (is_preboot()) {
            dout(10) << __func__ << " waiting for peering work to drain"
                     << dendl;
            l.unlock();
            for (auto shard : shards) {
              shard->wait_min_pg_epoch(get_osdmap_epoch());
            }
            l.lock();
          }
          if (is_preboot()) {
            _send_boot();
          }
        }));
    return;
  }

  // get all the latest maps
  if (osdmap->get_epoch() + 1 >= oldest)
    osdmap_subscribe(osdmap->get_epoch() + 1, false);
  else
    osdmap_subscribe(oldest - 1, true);
}
6668
6669 void OSD::_get_purged_snaps()
6670 {
6671 // NOTE: this is a naive, stateless implementaiton. it may send multiple
6672 // overlapping requests to the mon, which will be somewhat inefficient, but
6673 // it should be reliable.
6674 dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
6675 << ", newest_map " << superblock.current_epoch << dendl;
6676 MMonGetPurgedSnaps *m = new MMonGetPurgedSnaps(
6677 superblock.purged_snaps_last + 1,
6678 superblock.current_epoch + 1);
6679 monc->send_mon_message(m);
6680 }
6681
// Handle the mon's reply to _get_purged_snaps(): persist the reported
// purged snaps through the SnapMapper, advance purged_snaps_last in
// the superblock, then either request the next epoch range or resume
// the boot sequence.  Stale replies are dropped.
void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m)
{
  dout(10) << __func__ << " " << *m << dendl;
  ObjectStore::Transaction t;
  if (!is_preboot() ||
      m->last < superblock.purged_snaps_last) {
    // stale reply: we left preboot, or already recorded past this point
    goto out;
  } else {
    OSDriver osdriver{store.get(), service.meta_ch, make_purged_snaps_oid()};
    SnapMapper::record_purged_snaps(
      cct,
      osdriver,
      osdriver.get_transaction(&t),
      m->purged_snaps);
  }
  superblock.purged_snaps_last = m->last;
  write_superblock(t);
  store->queue_transaction(
    service.meta_ch,
    std::move(t));
  service.publish_superblock(superblock);
  if (m->last < superblock.current_epoch) {
    // more epochs remain; ask for the next chunk
    _get_purged_snaps();
  } else {
    start_boot();
  }
 out:
  m->put();
}
6711
6712 void OSD::send_full_update()
6713 {
6714 if (!service.need_fullness_update())
6715 return;
6716 unsigned state = 0;
6717 if (service.is_full()) {
6718 state = CEPH_OSD_FULL;
6719 } else if (service.is_backfillfull()) {
6720 state = CEPH_OSD_BACKFILLFULL;
6721 } else if (service.is_nearfull()) {
6722 state = CEPH_OSD_NEARFULL;
6723 }
6724 set<string> s;
6725 OSDMap::calc_state_set(state, s);
6726 dout(10) << __func__ << " want state " << s << dendl;
6727 monc->send_mon_message(new MOSDFull(get_osdmap_epoch(), state));
6728 }
6729
// Enter WAITING_FOR_HEALTHY: we will not try to mark ourselves up
// until our heartbeat peers look sufficiently healthy again (see
// _is_healthy()).
void OSD::start_waiting_for_healthy()
{
  dout(1) << "start_waiting_for_healthy" << dendl;
  set_state(STATE_WAITING_FOR_HEALTHY);
  // force heartbeat peers to be re-sampled at the next opportunity
  last_heartbeat_resample = utime_t();

  // subscribe to osdmap updates, in case our peers really are known to be dead
  osdmap_subscribe(get_osdmap_epoch() + 1, false);
}
6739
6740 bool OSD::_is_healthy()
6741 {
6742 if (!cct->get_heartbeat_map()->is_healthy()) {
6743 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
6744 return false;
6745 }
6746
6747 if (is_waiting_for_healthy()) {
6748 utime_t now = ceph_clock_now();
6749 if (osd_markdown_log.empty()) {
6750 dout(5) << __func__ << " force returning true since last markdown"
6751 << " was " << cct->_conf->osd_max_markdown_period
6752 << "s ago" << dendl;
6753 return true;
6754 }
6755 std::lock_guard l(heartbeat_lock);
6756 int num = 0, up = 0;
6757 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
6758 p != heartbeat_peers.end();
6759 ++p) {
6760 if (p->second.is_healthy(now))
6761 ++up;
6762 ++num;
6763 }
6764 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
6765 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
6766 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
6767 return false;
6768 }
6769 }
6770
6771 return true;
6772 }
6773
// Compose and send MOSDBoot to the mon.  Resolves any still-unknown
// addresses on the cluster and heartbeat messengers (deriving them
// from the client/cluster addresses), ensures each loopback
// connection has a session, records NUMA affinity in the boot
// metadata, and finally transitions to BOOTING.
void OSD::_send_boot()
{
  dout(10) << "_send_boot" << dendl;
  Connection *local_connection =
    cluster_messenger->get_loopback_connection().get();
  entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
  entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
  entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();

  dout(20) << " initial client_addrs " << client_addrs
           << ", cluster_addrs " << cluster_addrs
           << ", hb_back_addrs " << hb_back_addrs
           << ", hb_front_addrs " << hb_front_addrs
           << dendl;
  // fill in unbound cluster addrs from the client-side addrs
  if (cluster_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming cluster_addrs match client_addrs "
             << client_addrs << dendl;
    cluster_addrs = cluster_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // same dance for the back heartbeat messenger (based on cluster addrs)
  local_connection = hb_back_server_messenger->get_loopback_connection().get();
  if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
    dout(10) << " assuming hb_back_addrs match cluster_addrs "
             << cluster_addrs << dendl;
    hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // ... and the front heartbeat messenger (based on client addrs)
  local_connection = hb_front_server_messenger->get_loopback_connection().get();
  if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming hb_front_addrs match client_addrs "
             << client_addrs << dendl;
    hb_front_addrs = hb_front_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // we now know what our front and back addrs will be, and we are
  // about to tell the mon what our metadata (including numa bindings)
  // are, so now is a good time!
  set_numa_affinity();

  MOSDBoot *mboot = new MOSDBoot(
    superblock, get_osdmap_epoch(), service.get_boot_epoch(),
    hb_back_addrs, hb_front_addrs, cluster_addrs,
    CEPH_FEATURES_ALL);
  dout(10) << " final client_addrs " << client_addrs
           << ", cluster_addrs " << cluster_addrs
           << ", hb_back_addrs " << hb_back_addrs
           << ", hb_front_addrs " << hb_front_addrs
           << dendl;
  _collect_metadata(&mboot->metadata);
  monc->send_mon_message(mboot);
  set_state(STATE_BOOTING);
}
6836
// Gather this OSD's metadata key/value map (sent to the mon in
// MOSDBoot): config paths, messenger addresses, objectstore and
// device properties, system info, and NUMA topology.
void OSD::_collect_metadata(map<string,string> *pm)
{
  // config info
  (*pm)["osd_data"] = dev_path;
  if (store->get_type() == "filestore") {
    // not applicable for bluestore
    (*pm)["osd_journal"] = journal_path;
  }
  (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
  (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
  (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
  (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());

  // backend
  (*pm)["osd_objectstore"] = store->get_type();
  (*pm)["rotational"] = store_is_rotational ? "1" : "0";
  (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
  (*pm)["default_device_class"] = store->get_default_device_class();
  // optional metadata recorded at OSD creation time; report empty
  // strings rather than omitting the keys on read failure
  string osdspec_affinity;
  int r = store->read_meta("osdspec_affinity", &osdspec_affinity);
  if (r < 0 || osdspec_affinity.empty()) {
    osdspec_affinity = "";
  }
  (*pm)["osdspec_affinity"] = osdspec_affinity;
  string ceph_version_when_created;
  r = store->read_meta("ceph_version_when_created", &ceph_version_when_created);
  if (r <0 || ceph_version_when_created.empty()) {
    ceph_version_when_created = "";
  }
  (*pm)["ceph_version_when_created"] = ceph_version_when_created;
  string created_at;
  r = store->read_meta("created_at", &created_at);
  if (r < 0 || created_at.empty()) {
    created_at = "";
  }
  (*pm)["created_at"] = created_at;
  store->collect_metadata(pm);

  collect_sys_info(pm, cct);

  // network interfaces for the public (front) and cluster (back) nets
  (*pm)["front_iface"] = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  (*pm)["back_iface"] = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());

  // network numa
  {
    int node = -1;
    set<int> nodes;
    set<string> unknown;
    for (auto nm : { "front_iface", "back_iface" }) {
      if (!(*pm)[nm].size()) {
        unknown.insert(nm);
        continue;
      }
      int n = -1;
      int r = get_iface_numa_node((*pm)[nm], &n);
      if (r < 0) {
        unknown.insert((*pm)[nm]);
        continue;
      }
      nodes.insert(n);
      if (node < 0) {
        node = n;
      }
    }
    if (unknown.size()) {
      (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
    }
    if (!nodes.empty()) {
      (*pm)["network_numa_nodes"] = stringify(nodes);
    }
    // only report a single authoritative node when both interfaces
    // are known and agree
    if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
      (*pm)["network_numa_node"] = stringify(node);
    }
  }

  if (numa_node >= 0) {
    (*pm)["numa_node"] = stringify(numa_node);
    (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
                                                  &numa_cpu_set);
  }

  // per-device metadata; failures are logged but non-fatal
  set<string> devnames;
  store->get_devices(&devnames);
  map<string,string> errs;
  get_device_metadata(devnames, pm, &errs);
  for (auto& i : errs) {
    dout(1) << __func__ << " " << i.first << ": " << i.second << dendl;
  }
  dout(10) << __func__ << " " << *pm << dendl;
}
6931
// Record that peering wants the mon to advance our up_thru to at
// least 'want', and nudge the mon (via send_alive) when this raises
// the previously wanted epoch.  Acquires map_lock (shared) and then
// mon_report_lock, in that order.
void OSD::queue_want_up_thru(epoch_t want)
{
  std::shared_lock map_locker{map_lock};
  epoch_t cur = get_osdmap()->get_up_thru(whoami);
  std::lock_guard report_locker(mon_report_lock);
  if (want > up_thru_wanted) {
    dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
             << ", currently " << cur
             << dendl;
    up_thru_wanted = want;
    send_alive();
  } else {
    // already asking for an epoch at least this new; nothing to do
    dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
             << ", currently " << cur
             << dendl;
  }
}
6949
6950 void OSD::send_alive()
6951 {
6952 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
6953 const auto osdmap = get_osdmap();
6954 if (!osdmap->exists(whoami))
6955 return;
6956 epoch_t up_thru = osdmap->get_up_thru(whoami);
6957 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6958 if (up_thru_wanted > up_thru) {
6959 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6960 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6961 }
6962 }
6963
// Ask the mon for full (non-incremental) osdmaps in [first, last],
// coalescing with any outstanding request:
//  - nothing outstanding: request the whole range
//  - range already covered: drop the duplicate
//  - range extends past what we asked for: request only the new tail
void OSD::request_full_map(epoch_t first, epoch_t last)
{
  dout(10) << __func__ << " " << first << ".." << last
           << ", previously requested "
           << requested_full_first << ".." << requested_full_last << dendl;
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  ceph_assert(first > 0 && last > 0);
  ceph_assert(first <= last);
  ceph_assert(first >= requested_full_first); // we shouldn't ever ask for older maps
  if (requested_full_first == 0) {
    // first request
    requested_full_first = first;
    requested_full_last = last;
  } else if (last <= requested_full_last) {
    // dup
    return;
  } else {
    // additional request
    first = requested_full_last + 1;
    requested_full_last = last;
  }
  MMonGetOSDMap *req = new MMonGetOSDMap;
  req->request_full(first, last);
  monc->send_mon_message(req);
}
6989
// Note receipt of full map epoch `e` and advance (or reset) the outstanding
// requested_full_{first,last} range accordingly. Caller must hold osd_lock.
void OSD::got_full_map(epoch_t e)
{
  ceph_assert(requested_full_first <= requested_full_last);
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  if (requested_full_first == 0) {
    // no request is outstanding; this full map arrived unsolicited
    dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
    return;
  }
  if (e < requested_full_first) {
    // stale: older than anything we are waiting for
    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	     << ".." << requested_full_last
	     << ", ignoring" << dendl;
    return;
  }
  if (e >= requested_full_last) {
    // the whole requested range is satisfied; clear it
    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	     << ".." << requested_full_last << ", resetting" << dendl;
    requested_full_first = requested_full_last = 0;
    return;
  }

  // partially satisfied: still waiting on (e, requested_full_last]
  requested_full_first = e + 1;

  dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	   << ".." << requested_full_last
	   << ", still need more" << dendl;
}
7017
// Move every in-flight (pending) failure report back onto the failure queue
// so it will be re-sent to the monitor (e.g. after a mon session reset).
void OSD::requeue_failures()
{
  std::lock_guard l(heartbeat_lock);
  unsigned old_queue = failure_queue.size();
  unsigned old_pending = failure_pending.size();
  for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
    // pending value is pair<failed-since-time, addrs>; the queue keeps the time
    failure_queue[p->first] = p->second.first;
    failure_pending.erase(p++);
  }
  dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
	   << failure_queue.size() << dendl;
}
7030
// Drain the failure queue: report each failed peer to the monitor (unless a
// report for it is already pending) and remember the report in
// failure_pending so it can later be cancelled with send_still_alive().
// Preconditions: caller holds map_lock and mon_report_lock; heartbeat_lock
// is taken here.
void OSD::send_failures()
{
  ceph_assert(ceph_mutex_is_locked(map_lock));
  ceph_assert(ceph_mutex_is_locked(mon_report_lock));
  std::lock_guard l(heartbeat_lock);
  utime_t now = ceph_clock_now();
  const auto osdmap = get_osdmap();
  while (!failure_queue.empty()) {
    int osd = failure_queue.begin()->first;
    if (!failure_pending.count(osd)) {
      // queued value is the time we first saw the peer fail
      int failed_for = (int)(double)(now - failure_queue.begin()->second);
      monc->send_mon_message(
	new MOSDFailure(
	  monc->get_fsid(),
	  osd,
	  osdmap->get_addrs(osd),
	  failed_for,
	  osdmap->get_epoch()));
      // remember time + addrs so the report can be cancelled later
      failure_pending[osd] = make_pair(failure_queue.begin()->second,
				       osdmap->get_addrs(osd));
    }
    failure_queue.erase(osd);
  }
}
7055
7056 void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
7057 {
7058 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
7059 MOSDFailure::FLAG_ALIVE);
7060 monc->send_mon_message(m);
7061 }
7062
// Cancel every in-flight failure report by sending a FLAG_ALIVE message for
// each pending peer, then clear the pending set.
void OSD::cancel_pending_failures()
{
  std::lock_guard l(heartbeat_lock);
  auto it = failure_pending.begin();
  while (it != failure_pending.end()) {
    dout(10) << __func__ << " canceling in-flight failure report for osd."
	     << it->first << dendl;
    // second.second holds the addrs captured when the report was sent
    send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
    failure_pending.erase(it++);
  }
}
7074
// Send an MOSDBeacon to the monitor carrying our min_last_epoch_clean and
// last purged-snaps scrub stamp. Skipped if the monmap is not yet known or
// the mons do not all support LUMINOUS features.
void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
{
  const auto& monmap = monc->monmap;
  // send beacon to mon even if we are just connected, and the monmap is not
  // initialized yet by then.
  if (monmap.epoch > 0 &&
      monmap.get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    dout(20) << __func__ << " sending" << dendl;
    MOSDBeacon* beacon = nullptr;
    {
      // snapshot beacon payload under min_last_epoch_clean_lock
      std::lock_guard l{min_last_epoch_clean_lock};
      beacon = new MOSDBeacon(get_osdmap_epoch(),
			      min_last_epoch_clean,
			      superblock.last_purged_snaps_scrub,
			      cct->_conf->osd_beacon_report_interval);
      beacon->pgs = min_last_epoch_clean_pgs;
      last_sent_beacon = now;
    }
    // send outside the lock
    monc->send_mon_message(beacon);
  } else {
    dout(20) << __func__ << " not sending" << dendl;
  }
}
7099
7100 void OSD::handle_command(MCommand *m)
7101 {
7102 ConnectionRef con = m->get_connection();
7103 auto session = ceph::ref_cast<Session>(con->get_priv());
7104 if (!session) {
7105 con->send_message(new MCommandReply(m, -EACCES));
7106 m->put();
7107 return;
7108 }
7109 if (!session->caps.allow_all()) {
7110 con->send_message(new MCommandReply(m, -EACCES));
7111 m->put();
7112 return;
7113 }
7114 cct->get_admin_socket()->queue_tell_command(m);
7115 m->put();
7116 }
7117
7118 namespace {
7119 class unlock_guard {
7120 ceph::mutex& m;
7121 public:
7122 explicit unlock_guard(ceph::mutex& mutex)
7123 : m(mutex)
7124 {
7125 m.unlock();
7126 }
7127 unlock_guard(unlock_guard&) = delete;
7128 ~unlock_guard() {
7129 m.lock();
7130 }
7131 };
7132 }
7133
// Scrub the purged-snaps record against the snap mapper, requeue snap trims
// for any stray entries, and stamp the superblock with the scrub time.
// Note the lock choreography: osd_lock is held on entry, dropped around the
// (long-running) scrubber and PG requeueing, then re-taken before touching
// the superblock.
void OSD::scrub_purged_snaps()
{
  dout(10) << __func__ << dendl;
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  SnapMapper::Scrubber s(cct, store.get(), service.meta_ch,
			 make_snapmapper_oid(),
			 make_purged_snaps_oid());
  clog->debug() << "purged_snaps scrub starts";
  osd_lock.unlock();
  s.run();
  if (s.stray.size()) {
    clog->debug() << "purged_snaps scrub found " << s.stray.size() << " strays";
  } else {
    clog->debug() << "purged_snaps scrub ok";
  }
  // avoid requeueing the same (pg, snap) pair more than once
  set<pair<spg_t,snapid_t>> queued;
  for (auto& [pool, snap, hash, shard] : s.stray) {
    const pg_pool_t *pi = get_osdmap()->get_pg_pool(pool);
    if (!pi) {
      dout(20) << __func__ << " pool " << pool << " dne" << dendl;
      continue;
    }
    // map the stray's hash back to the owning pg / shard
    pg_t pgid(pi->raw_hash_to_pg(hash), pool);
    spg_t spgid(pgid, shard);
    pair<spg_t,snapid_t> p(spgid, snap);
    if (queued.count(p)) {
      dout(20) << __func__ << " pg " << spgid << " snap " << snap
	       << " already queued" << dendl;
      continue;
    }
    PGRef pg = lookup_lock_pg(spgid);
    if (!pg) {
      dout(20) << __func__ << " pg " << spgid << " not found" << dendl;
      continue;
    }
    queued.insert(p);
    dout(10) << __func__ << " requeue pg " << spgid << " " << pg << " snap "
	     << snap << dendl;
    pg->queue_snap_retrim(snap);
    pg->unlock();
  }
  osd_lock.lock();
  if (is_stopping()) {
    return;  // shutting down; skip the superblock update
  }
  dout(10) << __func__ << " done queueing pgs, updating superblock" << dendl;
  ObjectStore::Transaction t;
  superblock.last_purged_snaps_scrub = ceph_clock_now();
  write_superblock(t);
  int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
  ceph_assert(tr == 0);
  if (is_active()) {
    send_beacon(ceph::coarse_mono_clock::now());
  }
  dout(10) << __func__ << " done" << dendl;
}
7190
7191 void OSD::probe_smart(const string& only_devid, ostream& ss)
7192 {
7193 set<string> devnames;
7194 store->get_devices(&devnames);
7195 uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
7196 "osd_smart_report_timeout");
7197
7198 // == typedef std::map<std::string, mValue> mObject;
7199 json_spirit::mObject json_map;
7200
7201 for (auto dev : devnames) {
7202 // smartctl works only on physical devices; filter out any logical device
7203 if (dev.find("dm-") == 0) {
7204 continue;
7205 }
7206
7207 string err;
7208 string devid = get_device_id(dev, &err);
7209 if (devid.size() == 0) {
7210 dout(10) << __func__ << " no unique id for dev " << dev << " ("
7211 << err << "), skipping" << dendl;
7212 continue;
7213 }
7214 if (only_devid.size() && devid != only_devid) {
7215 continue;
7216 }
7217
7218 json_spirit::mValue smart_json;
7219 if (block_device_get_metrics(dev, smart_timeout,
7220 &smart_json)) {
7221 dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
7222 continue;
7223 }
7224 json_map[devid] = smart_json;
7225 }
7226 json_spirit::write(json_map, ss, json_spirit::pretty_print);
7227 }
7228
// Dispatch entry point for the heartbeat messengers. Only pings are
// expected; every branch consumes the message reference (handle_osd_ping
// is assumed to take ownership — TODO confirm against its definition).
bool OSD::heartbeat_dispatch(Message *m)
{
  dout(30) << "heartbeat_dispatch " << m << dendl;
  switch (m->get_type()) {

  case CEPH_MSG_PING:
    dout(10) << "ping from " << m->get_source_inst() << dendl;
    m->put();
    break;

  case MSG_OSD_PING:
    handle_osd_ping(static_cast<MOSDPing*>(m));
    break;

  default:
    // anything else on the heartbeat channel is unexpected; drop it
    dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
    m->put();
  }

  return true;
}
7250
// Slow-path dispatch: handles MARK_ME_DOWN acks directly, otherwise takes
// osd_lock and forwards to _dispatch(). Always claims the message
// (returns true).
bool OSD::ms_dispatch(Message *m)
{
  dout(20) << "OSD::ms_dispatch: " << *m << dendl;
  if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
    // ack for our mark-me-down request; no lock needed
    service.got_stop_ack();
    m->put();
    return true;
  }

  // lock!

  osd_lock.lock();
  if (is_stopping()) {
    osd_lock.unlock();
    m->put();
    return true;
  }

  _dispatch(m);

  osd_lock.unlock();

  return true;
}
7275
// Share our osdmap with a peer if we believe it is behind. `peer_epoch_lb`
// is a lower bound on the epoch the peer already has (e.g. an op's
// sent_epoch); the per-session last_sent_epoch is raised to it before
// deciding whether an incremental map needs to be sent. The epoch is
// re-checked under sent_epoch_lock after sending, since a concurrent caller
// may have shared a newer map in the meantime.
void OSDService::maybe_share_map(
  Connection *con,
  const OSDMapRef& osdmap,
  epoch_t peer_epoch_lb)
{
  // NOTE: we assume caller hold something that keeps the Connection itself
  // pinned (e.g., an OpRequest's MessageRef).
  auto session = ceph::ref_cast<Session>(con->get_priv());
  if (!session) {
    return;
  }

  // assume the peer has the newer of the op's sent_epoch and what
  // we think we sent them.
  session->sent_epoch_lock.lock();
  if (peer_epoch_lb > session->last_sent_epoch) {
    dout(10) << __func__ << " con " << con
	     << " " << con->get_peer_addr()
	     << " map epoch " << session->last_sent_epoch
	     << " -> " << peer_epoch_lb << " (as per caller)" << dendl;
    session->last_sent_epoch = peer_epoch_lb;
  }
  epoch_t last_sent_epoch = session->last_sent_epoch;
  session->sent_epoch_lock.unlock();

  if (osdmap->get_epoch() <= last_sent_epoch) {
    return;  // peer is already up to date
  }

  // send outside the lock; it may block
  send_incremental_map(last_sent_epoch, con, osdmap);
  last_sent_epoch = osdmap->get_epoch();

  // only advance the recorded epoch; a racing sharer may have gone further
  session->sent_epoch_lock.lock();
  if (session->last_sent_epoch < last_sent_epoch) {
    dout(10) << __func__ << " con " << con
	     << " " << con->get_peer_addr()
	     << " map epoch " << session->last_sent_epoch
	     << " -> " << last_sent_epoch << " (shared)" << dendl;
    session->last_sent_epoch = last_sent_epoch;
  }
  session->sent_epoch_lock.unlock();
}
7318
// Drain a session's waiting_on_map queue: enqueue every op whose min epoch
// is satisfied by `osdmap`, stopping at the first op that still needs a
// newer map (queue order must be preserved). Caller must hold the session's
// session_dispatch_lock.
void OSD::dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRef osdmap)
{
  ceph_assert(ceph_mutex_is_locked(session->session_dispatch_lock));

  auto i = session->waiting_on_map.begin();
  while (i != session->waiting_on_map.end()) {
    OpRequestRef op = &(*i);  // take a local ref to the intrusive-list entry
    ceph_assert(ms_can_fast_dispatch(op->get_req()));
    auto m = op->get_req<MOSDFastDispatchOp>();
    if (m->get_min_epoch() > osdmap->get_epoch()) {
      break;  // this (and everything after it) needs a newer map
    }
    session->waiting_on_map.erase(i++);
    op->put();  // drop the reference the waiting list held

    spg_t pgid;
    if (m->get_type() == CEPH_MSG_OSD_OP) {
      // legacy MOSDOp carries no spg_t; resolve it against the map
      pg_t actual_pgid = osdmap->raw_pg_to_pg(
	static_cast<const MOSDOp*>(m)->get_pg());
      if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
	continue;  // no primary shard in this map; drop the op
      }
    } else {
      pgid = m->get_spg();
    }
    enqueue_op(pgid, std::move(op), m->get_map_epoch());
  }

  // (de)register so we are notified when new maps arrive, iff ops remain
  if (session->waiting_on_map.empty()) {
    clear_session_waiting_on_map(session);
  } else {
    register_session_waiting_on_map(session);
  }
}
7353
// Fast-dispatch entry point: routes peering events and per-PG messages
// directly to their queues without taking osd_lock. Legacy MOSDOps from
// clients lacking RESEND_ON_SPLIT are staged on the session so they can be
// mapped to an spg_t in delivery order.
void OSD::ms_fast_dispatch(Message *m)
{
  FUNCTRACE(cct);
  if (service.is_stopping()) {
    m->put();
    return;
  }
  // peering event?
  switch (m->get_type()) {
  case CEPH_MSG_PING:
    dout(10) << "ping from " << m->get_source() << dendl;
    m->put();
    return;
  case MSG_OSD_FORCE_RECOVERY:
    handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
    return;
  case MSG_OSD_SCRUB2:
    handle_fast_scrub(static_cast<MOSDScrub2*>(m));
    return;
  case MSG_OSD_PG_CREATE2:
    return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
  case MSG_OSD_PG_NOTIFY:
    return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
  case MSG_OSD_PG_INFO:
    return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
  case MSG_OSD_PG_REMOVE:
    return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));
  // these are single-pg messages that handle themselves
  case MSG_OSD_PG_LOG:
  case MSG_OSD_PG_TRIM:
  case MSG_OSD_PG_NOTIFY2:
  case MSG_OSD_PG_QUERY2:
  case MSG_OSD_PG_INFO2:
  case MSG_OSD_BACKFILL_RESERVE:
  case MSG_OSD_RECOVERY_RESERVE:
  case MSG_OSD_PG_LEASE:
  case MSG_OSD_PG_LEASE_ACK:
    {
      MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
      if (require_osd_peer(pm)) {
	enqueue_peering_evt(
	  pm->get_spg(),
	  PGPeeringEventRef(pm->get_event()));
      }
      pm->put();
      return;
    }
  }

  // everything else becomes a tracked OpRequest
  OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid = op->get_reqid();
#endif
    tracepoint(osd, ms_fast_dispatch, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }
  op->osd_parent_span = tracing::osd::tracer.start_trace("op-request-created");

  if (m->trace)
    op->osd_trace.init("osd op", &trace_endpoint, &m->trace);

  // note sender epoch, min req's epoch
  op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
  op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
  ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!

  service.maybe_inject_dispatch_delay();

  if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
      m->get_type() != CEPH_MSG_OSD_OP) {
    // queue it directly
    enqueue_op(
      static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
      std::move(op),
      static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
  } else {
    // legacy client, and this is an MOSDOp (the *only* fast dispatch
    // message that didn't have an explicit spg_t); we need to map
    // them to an spg_t while preserving delivery order.
    auto priv = m->get_connection()->get_priv();
    if (auto session = static_cast<Session*>(priv.get()); session) {
      std::lock_guard l{session->session_dispatch_lock};
      op->get();  // extra ref held by the session's waiting list
      session->waiting_on_map.push_back(*op);
      OSDMapRef nextmap = service.get_nextmap_reserved();
      dispatch_session_waiting(session, nextmap);
      service.release_map(nextmap);
    }
  }
  OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
}
7446
// Messenger auth callback: create (or reuse) a Session for the connection
// and install its caps from the peer's AuthCapsInfo. Returns 1 when caps
// were parsed, 0 when nothing needed parsing, -EACCES on decode/parse
// failure.
int OSD::ms_handle_authentication(Connection *con)
{
  int ret = 0;
  auto s = ceph::ref_cast<Session>(con->get_priv());
  if (!s) {
    // first time we see this connection: attach a new session
    s = ceph::make_ref<Session>(cct, con);
    con->set_priv(s);
    s->entity_name = con->get_peer_entity_name();
    dout(10) << __func__ << " new session " << s << " con " << s->con
	     << " entity " << s->entity_name
	     << " addr " << con->get_peer_addrs() << dendl;
  } else {
    dout(10) << __func__ << " existing session " << s << " con " << s->con
	     << " entity " << s->entity_name
	     << " addr " << con->get_peer_addrs() << dendl;
  }

  AuthCapsInfo &caps_info = con->get_peer_caps_info();
  if (caps_info.allow_all) {
    s->caps.set_allow_all();
  } else if (caps_info.caps.length() > 0) {
    // caps arrive as an encoded string; decode then parse into OSDCap
    bufferlist::const_iterator p = caps_info.caps.cbegin();
    string str;
    try {
      decode(str, p);
    }
    catch (ceph::buffer::error& e) {
      dout(10) << __func__ << " session " << s << " " << s->entity_name
	       << " failed to decode caps string" << dendl;
      ret = -EACCES;
    }
    if (!ret) {
      bool success = s->caps.parse(str);
      if (success) {
	dout(10) << __func__ << " session " << s
		 << " " << s->entity_name
		 << " has caps " << s->caps << " '" << str << "'" << dendl;
	ret = 1;
      } else {
	dout(10) << __func__ << " session " << s << " " << s->entity_name
		 << " failed to parse caps '" << str << "'" << dendl;
	ret = -EACCES;
      }
    }
  }
  return ret;
}
7494
// Slow-path message handler; runs under osd_lock (see ms_dispatch).
// Handlers consume the message reference. Note MSG_COMMAND uses `return`
// rather than `break` — behavior is the same here since nothing follows
// the switch.
void OSD::_dispatch(Message *m)
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  dout(20) << "_dispatch " << m << " " << *m << dendl;

  switch (m->get_type()) {
  // -- don't need OSDMap --

  // map and replication
  case CEPH_MSG_OSD_MAP:
    handle_osd_map(static_cast<MOSDMap*>(m));
    break;
  case MSG_MON_GET_PURGED_SNAPS_REPLY:
    handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply*>(m));
    break;

  // osd
  case MSG_COMMAND:
    handle_command(static_cast<MCommand*>(m));
    return;
  }
}
7517
// Handle a scrub request (MOSDScrub2) from a mon or mgr: queue a
// RequestScrub peering event for every listed PG. Drops the message if the
// sender is not a mon/mgr peer or the fsid does not match.
void OSD::handle_fast_scrub(MOSDScrub2 *m)
{
  dout(10) << __func__ << " " << *m << dendl;
  if (!require_mon_or_mgr_peer(m)) {
    m->put();
    return;
  }
  if (m->fsid != monc->get_fsid()) {
    dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
	    << dendl;
    m->put();
    return;
  }
  for (auto pgid : m->scrub_pgs) {
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  m->epoch,
	  m->epoch,
	  PeeringState::RequestScrub(m->deep, m->repair))));
  }
  m->put();
}
7542
7543 bool OSD::scrub_random_backoff()
7544 {
7545 bool coin_flip = (rand() / (double)RAND_MAX >=
7546 cct->_conf->osd_scrub_backoff_ratio);
7547 if (!coin_flip) {
7548 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off (ratio: "
7549 << cct->_conf->osd_scrub_backoff_ratio << ")" << dendl;
7550 return true;
7551 }
7552 return false;
7553 }
7554
7555
// Periodic scrub scheduling tick: bail out early if no scrub slots are
// free or a replica-reservation is already in progress, then let the scrub
// scheduler pick a PG and start it (subject to recovery-related
// preconditions).
void OSD::sched_scrub()
{
  auto& scrub_scheduler = service.get_scrub_services();

  if (auto blocked_pgs = scrub_scheduler.get_blocked_pgs_count();
      blocked_pgs > 0) {
    // some PGs managed by this OSD were blocked by a locked object during
    // scrub. This means we might not have the resources needed to scrub now.
    dout(10)
      << fmt::format(
	   "{}: PGs are blocked while scrubbing due to locked objects ({} PGs)",
	   __func__,
	   blocked_pgs)
      << dendl;
  }

  // fail fast if no resources are available
  if (!scrub_scheduler.can_inc_scrubs()) {
    dout(20) << __func__ << ": OSD cannot inc scrubs" << dendl;
    return;
  }

  // if there is a PG that is just now trying to reserve scrub replica resources -
  // we should wait and not initiate a new scrub
  if (scrub_scheduler.is_reserving_now()) {
    dout(20) << __func__ << ": scrub resources reservation in progress" << dendl;
    return;
  }

  Scrub::ScrubPreconds env_conditions;

  if (service.is_recovery_active() && !cct->_conf->osd_scrub_during_recovery) {
    if (!cct->_conf->osd_repair_during_recovery) {
      dout(15) << __func__ << ": not scheduling scrubs due to active recovery"
	       << dendl;
      return;
    }
    // recovery is active but explicit repairs are still allowed
    dout(10) << __func__
	     << " will only schedule explicitly requested repair due to active recovery"
	     << dendl;
    env_conditions.allow_requested_repair_only = true;
  }

  // at high debug levels, dump the whole scrub queue for diagnostics
  if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
    dout(20) << __func__ << " sched_scrub starts" << dendl;
    auto all_jobs = scrub_scheduler.list_registered_jobs();
    for (const auto& sj : all_jobs) {
      dout(20) << "sched_scrub scrub-queue jobs: " << *sj << dendl;
    }
  }

  auto was_started = scrub_scheduler.select_pg_and_scrub(env_conditions);
  dout(20) << "sched_scrub done (" << ScrubQueue::attempt_res_text(was_started)
	   << ")" << dendl;
}
7611
// Attempt to start a scrub on the candidate PG chosen by the scheduler.
// Returns the scheduling outcome so the caller can move on to the next
// candidate: no_such_pg if the PG vanished, already_started if it is
// queued/active, preconditions if only explicit repairs are allowed and
// this PG has none requested, otherwise the PG's own sched_scrub() result.
Scrub::schedule_result_t OSDService::initiate_a_scrub(spg_t pgid,
						      bool allow_requested_repair_only)
{
  dout(20) << __func__ << " trying " << pgid << dendl;

  // we have a candidate to scrub. We need some PG information to know if scrubbing is
  // allowed

  PGRef pg = osd->lookup_lock_pg(pgid);
  if (!pg) {
    // the PG was dequeued in the short timespan between creating the candidates list
    // (collect_ripe_jobs()) and here
    dout(5) << __func__ << " pg " << pgid << " not found" << dendl;
    return Scrub::schedule_result_t::no_such_pg;
  }

  // This has already started, so go on to the next scrub job
  if (pg->is_scrub_queued_or_active()) {
    pg->unlock();
    dout(20) << __func__ << ": already in progress pgid " << pgid << dendl;
    return Scrub::schedule_result_t::already_started;
  }
  // Skip other kinds of scrubbing if only explicitly requested repairing is allowed
  if (allow_requested_repair_only && !pg->get_planned_scrub().must_repair) {
    pg->unlock();
    dout(10) << __func__ << " skip " << pgid
	     << " because repairing is not explicitly requested on it" << dendl;
    return Scrub::schedule_result_t::preconditions;
  }

  auto scrub_attempt = pg->sched_scrub();
  pg->unlock();
  return scrub_attempt;
}
7646
// Recompute the scrub schedule for every registered scrub job, skipping PGs
// with an operator-requested (must_scrub) or auto-repair (need_auto) scrub
// pending, since those should keep their current slot.
void OSD::resched_all_scrubs()
{
  dout(10) << __func__ << ": start" << dendl;
  auto all_jobs = service.get_scrub_services().list_registered_jobs();
  for (auto& e : all_jobs) {

    auto& job = *e;
    dout(20) << __func__ << ": examine " << job.pgid << dendl;

    PGRef pg = _lookup_lock_pg(job.pgid);
    if (!pg)
      continue;  // PG went away since the job list was built

    if (!pg->get_planned_scrub().must_scrub && !pg->get_planned_scrub().need_auto) {
      dout(15) << __func__ << ": reschedule " << job.pgid << dendl;
      pg->reschedule_scrub();
    }
    pg->unlock();
  }
  dout(10) << __func__ << ": done" << dendl;
}
7668
// Build an MPGStats message for the mgr containing this OSD's stats, the
// stats of every primary PG, and (when the store supports it) per-pool
// statfs. Also recomputes min_last_epoch_clean from the reporting PGs.
// Caller owns the returned message.
MPGStats* OSD::collect_pg_stats()
{
  dout(15) << __func__ << dendl;
  // This implementation unconditionally sends every is_primary PG's
  // stats every time we're called. This has equivalent cost to the
  // previous implementation's worst case where all PGs are busy and
  // their stats are always enqueued for sending.
  std::shared_lock l{map_lock};

  osd_stat_t cur_stat = service.get_osd_stat();
  cur_stat.os_perf_stat = store->get_cur_stats();

  auto m = new MPGStats(monc->get_fsid(), get_osdmap_epoch());
  m->osd_stat = cur_stat;

  // held for the rest of the function: min_last_epoch_clean{,_pgs} are
  // rebuilt below and read by send_beacon()
  std::lock_guard lec{min_last_epoch_clean_lock};
  min_last_epoch_clean = get_osdmap_epoch();
  min_last_epoch_clean_pgs.clear();

  auto now_is = ceph::coarse_real_clock::now();

  std::set<int64_t> pool_set;
  vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    auto pool = pg->pg_id.pgid.pool();
    pool_set.emplace((int64_t)pool);
    if (!pg->is_primary()) {
      continue;  // replicas don't report stats
    }
    pg->with_pg_stats(now_is, [&](const pg_stat_t& s, epoch_t lec) {
      m->pg_stat[pg->pg_id.pgid] = s;
      min_last_epoch_clean = std::min(min_last_epoch_clean, lec);
      min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
    });
  }
  store_statfs_t st;
  bool per_pool_stats = true;
  bool per_pool_omap_stats = false;
  for (auto p : pool_set) {
    int r = store->pool_statfs(p, &st, &per_pool_omap_stats);
    if (r == -ENOTSUP) {
      // backend can't report per-pool stats; fall back to aggregate only
      per_pool_stats = false;
      break;
    } else {
      assert(r >= 0);
      m->pool_stat[p] = st;
    }
  }

  // indicate whether we are reporting per-pool stats
  m->osd_stat.num_osds = 1;
  m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;
  m->osd_stat.num_per_pool_omap_osds = per_pool_omap_stats ? 1 : 0;

  return m;
}
7726
7727 vector<DaemonHealthMetric> OSD::get_health_metrics()
7728 {
7729 vector<DaemonHealthMetric> metrics;
7730 {
7731 utime_t oldest_secs;
7732 const utime_t now = ceph_clock_now();
7733 auto too_old = now;
7734 too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
7735 int slow = 0;
7736 TrackedOpRef oldest_op;
7737 OSDMapRef osdmap = get_osdmap();
7738 // map of slow op counts by slow op event type for an aggregated logging to
7739 // the cluster log.
7740 map<uint8_t, int> slow_op_types;
7741 // map of slow op counts by pool for reporting a pool name with highest
7742 // slow ops.
7743 map<uint64_t, int> slow_op_pools;
7744 bool log_aggregated_slow_op =
7745 cct->_conf.get_val<bool>("osd_aggregated_slow_ops_logging");
7746 auto count_slow_ops = [&](TrackedOp& op) {
7747 if (op.get_initiated() < too_old) {
7748 stringstream ss;
7749 ss << "slow request " << op.get_desc()
7750 << " initiated "
7751 << op.get_initiated()
7752 << " currently "
7753 << op.state_string();
7754 lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
7755 if (log_aggregated_slow_op) {
7756 if (const OpRequest *req = dynamic_cast<const OpRequest *>(&op)) {
7757 uint8_t op_type = req->state_flag();
7758 auto m = req->get_req<MOSDFastDispatchOp>();
7759 uint64_t poolid = m->get_spg().pgid.m_pool;
7760 slow_op_types[op_type]++;
7761 if (poolid > 0 && poolid <= (uint64_t) osdmap->get_pool_max()) {
7762 slow_op_pools[poolid]++;
7763 }
7764 }
7765 } else {
7766 clog->warn() << ss.str();
7767 }
7768 slow++;
7769 if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
7770 oldest_op = &op;
7771 }
7772 return true;
7773 } else {
7774 return false;
7775 }
7776 };
7777 if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
7778 if (slow) {
7779 derr << __func__ << " reporting " << slow << " slow ops, oldest is "
7780 << oldest_op->get_desc() << dendl;
7781 if (log_aggregated_slow_op &&
7782 slow_op_types.size() > 0) {
7783 stringstream ss;
7784 ss << slow << " slow requests (by type [ ";
7785 for (const auto& [op_type, count] : slow_op_types) {
7786 ss << "'" << OpRequest::get_state_string(op_type)
7787 << "' : " << count
7788 << " ";
7789 }
7790 auto slow_pool_it = std::max_element(slow_op_pools.begin(), slow_op_pools.end(),
7791 [](std::pair<uint64_t, int> p1, std::pair<uint64_t, int> p2) {
7792 return p1.second < p2.second;
7793 });
7794 if (osdmap->get_pools().find(slow_pool_it->first) != osdmap->get_pools().end()) {
7795 string pool_name = osdmap->get_pool_name(slow_pool_it->first);
7796 ss << "] most affected pool [ '"
7797 << pool_name
7798 << "' : "
7799 << slow_pool_it->second
7800 << " ])";
7801 } else {
7802 ss << "])";
7803 }
7804 lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
7805 clog->warn() << ss.str();
7806 }
7807 }
7808 metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
7809 } else {
7810 // no news is not good news.
7811 metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
7812 }
7813 }
7814 {
7815 std::lock_guard l(pending_creates_lock);
7816 auto n_primaries = pending_creates_from_mon;
7817 for (const auto& create : pending_creates_from_osd) {
7818 if (create.second) {
7819 n_primaries++;
7820 }
7821 }
7822 metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
7823 }
7824 return metrics;
7825 }
7826
7827 // =====================================================
7828 // MAP
7829 /** update_map
7830 * assimilate new OSDMap(s). scan pgs, etc.
7831 */
7832
// React to a peer OSD going down in the map: close its cluster connections
// and purge it from the failure queues and heartbeat peer set. Caller must
// hold osd_lock.
void OSD::note_down_osd(int peer)
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  cluster_messenger->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer));

  std::lock_guard l{heartbeat_lock};
  failure_queue.erase(peer);
  failure_pending.erase(peer);
  map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
  if (p != heartbeat_peers.end()) {
    p->second.clear_mark_down();
    heartbeat_peers.erase(p);
  }
}
7847
// React to a peer OSD coming up: flag the heartbeat peer set for a refresh.
void OSD::note_up_osd(int peer)
{
  heartbeat_set_peers_need_update();
}
7852
// Completion fired once osdmaps [first,last] are durably committed to the
// store; forwards to OSD::_committed_osd_maps and then drops the message
// reference captured at construction time.
struct C_OnMapCommit : public Context {
  OSD *osd;
  epoch_t first, last;  // inclusive epoch range that was committed
  MOSDMap *msg;         // the originating map message (ref owned until finish)
  C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
    : osd(o), first(f), last(l), msg(m) {}
  void finish(int r) override {
    osd->_committed_osd_maps(first, last, msg);
    msg->put();
  }
};
7864
// Subscribe to osdmaps starting at `epoch` (one-shot). Requests are
// deduplicated against latest_subscribed_epoch unless force_request is set.
void OSD::osdmap_subscribe(version_t epoch, bool force_request)
{
  std::lock_guard l(osdmap_subscribe_lock);
  if (latest_subscribed_epoch >= epoch && !force_request)
    return;  // an equal-or-newer subscription is already outstanding

  latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);

  if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
      force_request) {
    monc->renew_subs();
  }
}
7878
// Remove stored osdmaps older than `oldest` (bounded by what the map cache
// still pins), advancing superblock.oldest_map as we go. Deletions are
// batched into transactions of at most osd_target_transaction_size; unless
// skip_maps is set, we stop after one batch and let later calls continue.
void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
{
  // never trim past what the map cache still references
  epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
  if (min <= superblock.oldest_map)
    return;

  int num = 0;
  ObjectStore::Transaction t;
  for (epoch_t e = superblock.oldest_map; e < min; ++e) {
    dout(20) << " removing old osdmap epoch " << e << dendl;
    t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
    t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
    superblock.oldest_map = e + 1;
    num++;
    if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
      // flush this batch of deletions along with the updated superblock
      service.publish_superblock(superblock);
      write_superblock(t);
      int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
      ceph_assert(tr == 0);
      num = 0;
      if (!skip_maps) {
	// skip_maps leaves us with a range of old maps if we fail to remove all
	// of them before moving superblock.oldest_map forward to the first map
	// in the incoming MOSDMap msg. so we should continue removing them in
	// this case, even we could do huge series of delete transactions all at
	// once.
	break;
      }
    }
  }
  if (num > 0) {
    // flush the final partial batch
    service.publish_superblock(superblock);
    write_superblock(t);
    int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
    ceph_assert(tr == 0);
  }
  // we should not remove the cached maps
  ceph_assert(min <= service.map_cache.cached_key_lower_bound());
}
7918
7919 void OSD::handle_osd_map(MOSDMap *m)
7920 {
7921 // wait for pgs to catch up
7922 {
7923 // we extend the map cache pins to accomodate pgs slow to consume maps
7924 // for some period, until we hit the max_lag_factor bound, at which point
7925 // we block here to stop injesting more maps than they are able to keep
7926 // up with.
7927 epoch_t max_lag = cct->_conf->osd_map_cache_size *
7928 m_osd_pg_epoch_max_lag_factor;
7929 ceph_assert(max_lag > 0);
7930 epoch_t osd_min = 0;
7931 for (auto shard : shards) {
7932 epoch_t min = shard->get_min_pg_epoch();
7933 if (osd_min == 0 || min < osd_min) {
7934 osd_min = min;
7935 }
7936 }
7937 epoch_t osdmap_epoch = get_osdmap_epoch();
7938 if (osd_min > 0 &&
7939 osdmap_epoch > max_lag &&
7940 osdmap_epoch - max_lag > osd_min) {
7941 epoch_t need = osdmap_epoch - max_lag;
7942 dout(10) << __func__ << " waiting for pgs to catch up (need " << need
7943 << " max_lag " << max_lag << ")" << dendl;
7944 for (auto shard : shards) {
7945 epoch_t min = shard->get_min_pg_epoch();
7946 if (need > min) {
7947 dout(10) << __func__ << " waiting for pgs to consume " << need
7948 << " (shard " << shard->shard_id << " min " << min
7949 << ", map cache is " << cct->_conf->osd_map_cache_size
7950 << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
7951 << ")" << dendl;
7952 unlock_guard unlock{osd_lock};
7953 shard->wait_min_pg_epoch(need);
7954 }
7955 }
7956 }
7957 }
7958
7959 ceph_assert(ceph_mutex_is_locked(osd_lock));
7960 map<epoch_t,OSDMapRef> added_maps;
7961 map<epoch_t,bufferlist> added_maps_bl;
7962 if (m->fsid != monc->get_fsid()) {
7963 dout(0) << "handle_osd_map fsid " << m->fsid << " != "
7964 << monc->get_fsid() << dendl;
7965 m->put();
7966 return;
7967 }
7968 if (is_initializing()) {
7969 dout(0) << "ignoring osdmap until we have initialized" << dendl;
7970 m->put();
7971 return;
7972 }
7973
7974 auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
7975 if (session && !(session->entity_name.is_mon() ||
7976 session->entity_name.is_osd())) {
7977 //not enough perms!
7978 dout(10) << "got osd map from Session " << session
7979 << " which we can't take maps from (not a mon or osd)" << dendl;
7980 m->put();
7981 return;
7982 }
7983
7984 // share with the objecter
7985 if (!is_preboot())
7986 service.objecter->handle_osd_map(m);
7987
7988 epoch_t first = m->get_first();
7989 epoch_t last = m->get_last();
7990 dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
7991 << superblock.newest_map
7992 << ", src has [" << m->cluster_osdmap_trim_lower_bound
7993 << "," << m->newest_map << "]"
7994 << dendl;
7995
7996 logger->inc(l_osd_map);
7997 logger->inc(l_osd_mape, last - first + 1);
7998 if (first <= superblock.newest_map)
7999 logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
8000
8001 if (superblock.cluster_osdmap_trim_lower_bound <
8002 m->cluster_osdmap_trim_lower_bound) {
8003 superblock.cluster_osdmap_trim_lower_bound =
8004 m->cluster_osdmap_trim_lower_bound;
8005 dout(10) << " superblock cluster_osdmap_trim_lower_bound new epoch is: "
8006 << superblock.cluster_osdmap_trim_lower_bound << dendl;
8007 ceph_assert(
8008 superblock.cluster_osdmap_trim_lower_bound >= superblock.oldest_map);
8009 }
8010
8011 // make sure there is something new, here, before we bother flushing
8012 // the queues and such
8013 if (last <= superblock.newest_map) {
8014 dout(10) << " no new maps here, dropping" << dendl;
8015 m->put();
8016 return;
8017 }
8018
8019 // missing some?
8020 bool skip_maps = false;
8021 if (first > superblock.newest_map + 1) {
8022 dout(10) << "handle_osd_map message skips epochs "
8023 << superblock.newest_map + 1 << ".." << (first-1) << dendl;
8024 if (m->cluster_osdmap_trim_lower_bound <= superblock.newest_map + 1) {
8025 osdmap_subscribe(superblock.newest_map + 1, false);
8026 m->put();
8027 return;
8028 }
8029 // always try to get the full range of maps--as many as we can. this
8030 // 1- is good to have
8031 // 2- is at present the only way to ensure that we get a *full* map as
8032 // the first map!
8033 if (m->cluster_osdmap_trim_lower_bound < first) {
8034 osdmap_subscribe(m->cluster_osdmap_trim_lower_bound - 1, true);
8035 m->put();
8036 return;
8037 }
8038 skip_maps = true;
8039 }
8040
8041 ObjectStore::Transaction t;
8042 uint64_t txn_size = 0;
8043
8044 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> purged_snaps;
8045
8046 // store new maps: queue for disk and put in the osdmap cache
8047 epoch_t start = std::max(superblock.newest_map + 1, first);
8048 for (epoch_t e = start; e <= last; e++) {
8049 if (txn_size >= t.get_num_bytes()) {
8050 derr << __func__ << " transaction size overflowed" << dendl;
8051 ceph_assert(txn_size < t.get_num_bytes());
8052 }
8053 txn_size = t.get_num_bytes();
8054 map<epoch_t,bufferlist>::iterator p;
8055 p = m->maps.find(e);
8056 if (p != m->maps.end()) {
8057 dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
8058 OSDMap *o = new OSDMap;
8059 bufferlist& bl = p->second;
8060
8061 o->decode(bl);
8062
8063 purged_snaps[e] = o->get_new_purged_snaps();
8064
8065 ghobject_t fulloid = get_osdmap_pobject_name(e);
8066 t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
8067 added_maps[e] = add_map(o);
8068 added_maps_bl[e] = bl;
8069 got_full_map(e);
8070 continue;
8071 }
8072
8073 p = m->incremental_maps.find(e);
8074 if (p != m->incremental_maps.end()) {
8075 dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
8076 bufferlist& bl = p->second;
8077 ghobject_t oid = get_inc_osdmap_pobject_name(e);
8078 t.write(coll_t::meta(), oid, 0, bl.length(), bl);
8079
8080 OSDMap *o = new OSDMap;
8081 if (e > 1) {
8082 bufferlist obl;
8083 bool got = get_map_bl(e - 1, obl);
8084 if (!got) {
8085 auto p = added_maps_bl.find(e - 1);
8086 ceph_assert(p != added_maps_bl.end());
8087 obl = p->second;
8088 }
8089 o->decode(obl);
8090 }
8091
8092 OSDMap::Incremental inc;
8093 auto p = bl.cbegin();
8094 inc.decode(p);
8095
8096 if (o->apply_incremental(inc) < 0) {
8097 derr << "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc.fsid << dendl;
8098 ceph_abort_msg("bad fsid");
8099 }
8100
8101 bufferlist fbl;
8102 o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
8103
8104 bool injected_failure = false;
8105 if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
8106 (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
8107 derr << __func__ << " injecting map crc failure" << dendl;
8108 injected_failure = true;
8109 }
8110
8111 if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
8112 dout(2) << "got incremental " << e
8113 << " but failed to encode full with correct crc; requesting"
8114 << dendl;
8115 clog->warn() << "failed to encode map e" << e << " with expected crc";
8116 dout(20) << "my encoded map was:\n";
8117 fbl.hexdump(*_dout);
8118 *_dout << dendl;
8119 delete o;
8120 request_full_map(e, last);
8121 last = e - 1;
8122
8123 // don't continue committing if we failed to enc the first inc map
8124 if (last < start) {
8125 dout(10) << __func__ << " bailing because last < start (" << last << "<" << start << ")" << dendl;
8126 m->put();
8127 return;
8128 }
8129 break;
8130 }
8131 got_full_map(e);
8132 purged_snaps[e] = o->get_new_purged_snaps();
8133
8134 ghobject_t fulloid = get_osdmap_pobject_name(e);
8135 t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
8136 added_maps[e] = add_map(o);
8137 added_maps_bl[e] = fbl;
8138 continue;
8139 }
8140
8141 ceph_abort_msg("MOSDMap lied about what maps it had?");
8142 }
8143
8144 // even if this map isn't from a mon, we may have satisfied our subscription
8145 monc->sub_got("osdmap", last);
8146
8147 if (!m->maps.empty() && requested_full_first) {
8148 dout(10) << __func__ << " still missing full maps " << requested_full_first
8149 << ".." << requested_full_last << dendl;
8150 rerequest_full_maps();
8151 }
8152
8153 if (superblock.oldest_map) {
8154 // make sure we at least keep pace with incoming maps
8155 trim_maps(m->cluster_osdmap_trim_lower_bound,
8156 last - first + 1, skip_maps);
8157 pg_num_history.prune(superblock.oldest_map);
8158 }
8159
8160 if (!superblock.oldest_map || skip_maps)
8161 superblock.oldest_map = first;
8162 superblock.newest_map = last;
8163 superblock.current_epoch = last;
8164
8165 // note in the superblock that we were clean thru the prior epoch
8166 epoch_t boot_epoch = service.get_boot_epoch();
8167 if (boot_epoch && boot_epoch >= superblock.mounted) {
8168 superblock.mounted = boot_epoch;
8169 superblock.clean_thru = last;
8170 }
8171
8172 // check for pg_num changes and deleted pools
8173 OSDMapRef lastmap;
8174 for (auto& i : added_maps) {
8175 if (!lastmap) {
8176 if (!(lastmap = service.try_get_map(i.first - 1))) {
8177 dout(10) << __func__ << " can't get previous map " << i.first - 1
8178 << " probably first start of this osd" << dendl;
8179 continue;
8180 }
8181 }
8182 ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
8183 for (auto& j : lastmap->get_pools()) {
8184 if (!i.second->have_pg_pool(j.first)) {
8185 pg_num_history.log_pool_delete(i.first, j.first);
8186 dout(10) << __func__ << " recording final pg_pool_t for pool "
8187 << j.first << dendl;
8188 // this information is needed by _make_pg() if have to restart before
8189 // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
8190 ghobject_t obj = make_final_pool_info_oid(j.first);
8191 bufferlist bl;
8192 encode(j.second, bl, CEPH_FEATURES_ALL);
8193 string name = lastmap->get_pool_name(j.first);
8194 encode(name, bl);
8195 map<string,string> profile;
8196 if (lastmap->get_pg_pool(j.first)->is_erasure()) {
8197 profile = lastmap->get_erasure_code_profile(
8198 lastmap->get_pg_pool(j.first)->erasure_code_profile);
8199 }
8200 encode(profile, bl);
8201 t.write(coll_t::meta(), obj, 0, bl.length(), bl);
8202 } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
8203 new_pg_num != j.second.get_pg_num()) {
8204 dout(10) << __func__ << " recording pool " << j.first << " pg_num "
8205 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
8206 pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
8207 }
8208 }
8209 for (auto& j : i.second->get_pools()) {
8210 if (!lastmap->have_pg_pool(j.first)) {
8211 dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
8212 << j.second.get_pg_num() << dendl;
8213 pg_num_history.log_pg_num_change(i.first, j.first,
8214 j.second.get_pg_num());
8215 }
8216 }
8217 lastmap = i.second;
8218 }
8219 pg_num_history.epoch = last;
8220 {
8221 bufferlist bl;
8222 ::encode(pg_num_history, bl);
8223 t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
8224 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
8225 }
8226
8227 // record new purged_snaps
8228 if (superblock.purged_snaps_last == start - 1) {
8229 OSDriver osdriver{store.get(), service.meta_ch, make_purged_snaps_oid()};
8230 SnapMapper::record_purged_snaps(
8231 cct,
8232 osdriver,
8233 osdriver.get_transaction(&t),
8234 purged_snaps);
8235 superblock.purged_snaps_last = last;
8236 } else {
8237 dout(10) << __func__ << " superblock purged_snaps_last is "
8238 << superblock.purged_snaps_last
8239 << ", not recording new purged_snaps" << dendl;
8240 }
8241
8242 // superblock and commit
8243 write_superblock(t);
8244 t.register_on_commit(new C_OnMapCommit(this, start, last, m));
8245 store->queue_transaction(
8246 service.meta_ch,
8247 std::move(t));
8248 service.publish_superblock(superblock);
8249 }
8250
// Completion callback fired once a batch of new osdmaps (epochs
// [first,last]) has been durably committed by handle_osd_map().
//
// Under osd_lock + map_lock this walks the committed epochs in order,
// publishing each map, noting peer up/down transitions, and then reacts
// to our own status in the newest map:
//  - booting -> active if the map shows us up at our current address;
//  - restart (rebind + reboot) if the map marked us down while we are
//    in fact still running;
//  - full shutdown if the map says we no longer exist or were stopped
//    by an administrator.
//
// @param first  first newly committed epoch
// @param last   last newly committed epoch (first <= last)
// @param m      the MOSDMap message that carried these maps; only read
//               here (its newest_map / source are consulted at the end)
void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
{
  dout(10) << __func__ << " " << first << ".." << last << dendl;
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  std::lock_guard l(osd_lock);
  // re-check under osd_lock: shutdown may have raced with us while we
  // were waiting for the lock
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  map_lock.lock();

  ceph_assert(first <= last);

  bool do_shutdown = false;
  bool do_restart = false;
  bool network_error = false;
  OSDMapRef osdmap = get_osdmap();

  // advance through the new maps
  for (epoch_t cur = first; cur <= last; cur++) {
    dout(10) << " advance to epoch " << cur
             << " (<= last " << last
             << " <= newest_map " << superblock.newest_map
             << ")" << dendl;

    OSDMapRef newmap = get_map(cur);
    ceph_assert(newmap);  // we just cached it above!

    // start blocklisting messages sent to peers that go down.
    service.pre_publish_map(newmap);

    // kill connections to newly down osds
    bool waited_for_reservations = false;
    set<int> old;
    osdmap = get_osdmap();
    osdmap->get_all_osds(old);
    for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
      if (*p != whoami &&
          osdmap->is_up(*p) && // in old map
          newmap->is_down(*p)) { // but not the new one
        // drain any in-flight map reservations once, before the first
        // note_down_osd() of this epoch
        if (!waited_for_reservations) {
          service.await_reserved_maps();
          waited_for_reservations = true;
        }
        note_down_osd(*p);
      } else if (*p != whoami &&
                 osdmap->is_down(*p) &&
                 newmap->is_up(*p)) {
        note_up_osd(*p);
      }
    }

    if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
      dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
               << dendl;
      if (is_booting()) {
        // this captures the case where we sent the boot message while
        // NOUP was being set on the mon and our boot request was
        // dropped, and then later it is cleared. it imperfectly
        // handles the case where our original boot message was not
        // dropped and we restart even though we might have booted, but
        // that is harmless (boot will just take slightly longer).
        do_restart = true;
      }
    }

    osdmap = std::move(newmap);
    set_osdmap(osdmap);
    epoch_t up_epoch;
    epoch_t boot_epoch;
    service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
    // record the epoch at which the map first shows us up at our
    // current address (and, if unset, the boot epoch too)
    if (!up_epoch &&
        osdmap->is_up(whoami) &&
        osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
      up_epoch = osdmap->get_epoch();
      dout(10) << "up_epoch is " << up_epoch << dendl;
      if (!boot_epoch) {
        boot_epoch = osdmap->get_epoch();
        dout(10) << "boot_epoch is " << boot_epoch << dendl;
      }
      service.set_epochs(&boot_epoch, &up_epoch, NULL);
    }
  }

  epoch_t _bind_epoch = service.get_bind_epoch();
  // the map shows us up at our address since a bind newer than our
  // last rebind: the boot succeeded
  if (osdmap->is_up(whoami) &&
      osdmap->get_addrs(whoami).legacy_equals(
        client_messenger->get_myaddrs()) &&
      _bind_epoch < osdmap->get_up_from(whoami)) {

    if (is_booting()) {
      dout(1) << "state: booting -> active" << dendl;
      set_state(STATE_ACTIVE);
      do_restart = false;

      // set incarnation so that osd_reqid_t's we generate for our
      // objecter requests are unique across restarts.
      service.objecter->set_client_incarnation(osdmap->get_epoch());
      cancel_pending_failures();
    }
  }

  if (osdmap->get_epoch() > 0 &&
      is_active()) {
    if (!osdmap->exists(whoami)) {
      derr << "map says i do not exist. shutting down." << dendl;
      do_shutdown = true; // don't call shutdown() while we have
                          // everything paused
    } else if (osdmap->is_stop(whoami)) {
      derr << "map says i am stopped by admin. shutting down." << dendl;
      do_shutdown = true;
    } else if (!osdmap->is_up(whoami) ||
               !osdmap->get_addrs(whoami).legacy_equals(
                 client_messenger->get_myaddrs()) ||
               !osdmap->get_cluster_addrs(whoami).legacy_equals(
                 cluster_messenger->get_myaddrs()) ||
               !osdmap->get_hb_back_addrs(whoami).legacy_equals(
                 hb_back_server_messenger->get_myaddrs()) ||
               !osdmap->get_hb_front_addrs(whoami).legacy_equals(
                 hb_front_server_messenger->get_myaddrs())) {
      // marked down, or one of our registered addresses no longer
      // matches what we are actually bound to
      if (!osdmap->is_up(whoami)) {
        if (service.is_preparing_to_stop() || service.is_stopping()) {
          service.got_stop_ack();
        } else {
          clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
                          "but it is still running";
          clog->debug() << "map e" << osdmap->get_epoch()
                        << " wrongly marked me down at e"
                        << osdmap->get_down_at(whoami);
        }
        if (monc->monmap.min_mon_release >= ceph_release_t::octopus) {
          // note that this is best-effort...
          monc->send_mon_message(
            new MOSDMarkMeDead(
              monc->get_fsid(),
              whoami,
              osdmap->get_epoch()));
        }
      } else if (!osdmap->get_addrs(whoami).legacy_equals(
                   client_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong client addr (" << osdmap->get_addrs(whoami)
                      << " != my " << client_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
                   cluster_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong cluster addr ("
                      << osdmap->get_cluster_addrs(whoami)
                      << " != my " << cluster_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
                   hb_back_server_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong heartbeat back addr ("
                      << osdmap->get_hb_back_addrs(whoami)
                      << " != my " << hb_back_server_messenger->get_myaddrs()
                      << ")";
      } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
                   hb_front_server_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong heartbeat front addr ("
                      << osdmap->get_hb_front_addrs(whoami)
                      << " != my " << hb_front_server_messenger->get_myaddrs()
                      << ")";
      }

      if (!service.is_stopping()) {
        // prepare to restart: reset up epoch, remember the bind epoch
        epoch_t up_epoch = 0;
        epoch_t bind_epoch = osdmap->get_epoch();
        service.set_epochs(NULL,&up_epoch, &bind_epoch);
        do_restart = true;

        //add markdown log
        utime_t now = ceph_clock_now();
        utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
        osd_markdown_log.push_back(now);
        // too many markdowns within the grace period -> give up and
        // shut down instead of flapping forever
        if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
          derr << __func__ << " marked down "
               << osd_markdown_log.size()
               << " > osd_max_markdown_count "
               << cct->_conf->osd_max_markdown_count
               << " in last " << grace << " seconds, shutting down"
               << dendl;
          do_restart = false;
          do_shutdown = true;
        }

        start_waiting_for_healthy();

        set<int> avoid_ports;
#if defined(__FreeBSD__)
        // prevent FreeBSD from grabbing the client_messenger port during
        // rebinding. In which case a cluster_messenger will connect also
        // to the same port
        client_messenger->get_myaddrs().get_ports(&avoid_ports);
#endif
        cluster_messenger->get_myaddrs().get_ports(&avoid_ports);

        int r = cluster_messenger->rebind(avoid_ports);
        if (r != 0) {
          do_shutdown = true; // FIXME: do_restart?
          network_error = true;
          derr << __func__ << " marked down:"
               << " rebind cluster_messenger failed" << dendl;
        }

        // drop all heartbeat connections; they will be re-established
        // after rebind
        hb_back_server_messenger->mark_down_all();
        hb_front_server_messenger->mark_down_all();
        hb_front_client_messenger->mark_down_all();
        hb_back_client_messenger->mark_down_all();

        reset_heartbeat_peers(true);
      }
    }
  } else if (osdmap->get_epoch() > 0 && osdmap->is_stop(whoami)) {
    derr << "map says i am stopped by admin. shutting down." << dendl;
    do_shutdown = true;
  }

  map_lock.unlock();

  check_osdmap_features();

  // yay!
  consume_map();

  if (is_active() || is_waiting_for_healthy())
    maybe_update_heartbeat_peers();

  if (is_active()) {
    activate_map();
  }

  if (do_shutdown) {
    if (network_error) {
      cancel_pending_failures();
    }
    // trigger shutdown in a different thread
    dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
    queue_async_signal(SIGINT);
  }
  else if (m->newest_map && m->newest_map > last) {
    // the sender has newer maps than what it gave us; ask for the rest
    dout(10) << " msg say newest map is " << m->newest_map
             << ", requesting more" << dendl;
    osdmap_subscribe(osdmap->get_epoch()+1, false);
  }
  else if (is_preboot()) {
    if (m->get_source().is_mon())
      _preboot(m->cluster_osdmap_trim_lower_bound, m->newest_map);
    else
      start_boot();
  }
  else if (do_restart)
    start_boot();

}
8509
8510 void OSD::check_osdmap_features()
8511 {
8512 // adjust required feature bits?
8513
8514 // we have to be a bit careful here, because we are accessing the
8515 // Policy structures without taking any lock. in particular, only
8516 // modify integer values that can safely be read by a racing CPU.
8517 // since we are only accessing existing Policy structures a their
8518 // current memory location, and setting or clearing bits in integer
8519 // fields, and we are the only writer, this is not a problem.
8520
8521 const auto osdmap = get_osdmap();
8522 {
8523 Messenger::Policy p = client_messenger->get_default_policy();
8524 uint64_t mask;
8525 uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
8526 if ((p.features_required & mask) != features) {
8527 dout(0) << "crush map has features " << features
8528 << ", adjusting msgr requires for clients" << dendl;
8529 p.features_required = (p.features_required & ~mask) | features;
8530 client_messenger->set_default_policy(p);
8531 }
8532 }
8533 {
8534 Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
8535 uint64_t mask;
8536 uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
8537 if ((p.features_required & mask) != features) {
8538 dout(0) << "crush map has features " << features
8539 << " was " << p.features_required
8540 << ", adjusting msgr requires for mons" << dendl;
8541 p.features_required = (p.features_required & ~mask) | features;
8542 client_messenger->set_policy(entity_name_t::TYPE_MON, p);
8543 }
8544 }
8545 {
8546 Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
8547 uint64_t mask;
8548 uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
8549
8550 if ((p.features_required & mask) != features) {
8551 dout(0) << "crush map has features " << features
8552 << ", adjusting msgr requires for osds" << dendl;
8553 p.features_required = (p.features_required & ~mask) | features;
8554 cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
8555 }
8556
8557 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
8558 dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
8559 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
8560 ObjectStore::Transaction t;
8561 write_superblock(t);
8562 int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
8563 ceph_assert(err == 0);
8564 }
8565 }
8566
8567 if (osdmap->require_osd_release < ceph_release_t::nautilus) {
8568 hb_front_server_messenger->set_require_authorizer(false);
8569 hb_back_server_messenger->set_require_authorizer(false);
8570 } else {
8571 hb_front_server_messenger->set_require_authorizer(true);
8572 hb_back_server_messenger->set_require_authorizer(true);
8573 }
8574
8575 if (osdmap->require_osd_release != last_require_osd_release) {
8576 dout(1) << __func__ << " require_osd_release " << last_require_osd_release
8577 << " -> " << to_string(osdmap->require_osd_release) << dendl;
8578 store->write_meta("require_osd_release",
8579 stringify((int)osdmap->require_osd_release));
8580 last_require_osd_release = osdmap->require_osd_release;
8581 }
8582 }
8583
8584 struct C_FinishSplits : public Context {
8585 OSD *osd;
8586 set<PGRef> pgs;
8587 C_FinishSplits(OSD *osd, const set<PGRef> &in)
8588 : osd(osd), pgs(in) {}
8589 void finish(int r) override {
8590 osd->_finish_splits(pgs);
8591 }
8592 };
8593
8594 void OSD::_finish_splits(set<PGRef>& pgs)
8595 {
8596 dout(10) << __func__ << " " << pgs << dendl;
8597 if (is_stopping())
8598 return;
8599 for (set<PGRef>::iterator i = pgs.begin();
8600 i != pgs.end();
8601 ++i) {
8602 PG *pg = i->get();
8603
8604 PeeringCtx rctx;
8605 pg->lock();
8606 dout(10) << __func__ << " " << *pg << dendl;
8607 epoch_t e = pg->get_osdmap_epoch();
8608 pg->handle_initialize(rctx);
8609 pg->queue_null(e, e);
8610 dispatch_context(rctx, pg, service.get_osdmap());
8611 pg->unlock();
8612
8613 unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
8614 shards[shard_index]->register_and_wake_split_child(pg);
8615 }
8616 };
8617
8618 bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
8619 unsigned need)
8620 {
8621 std::lock_guard l(merge_lock);
8622 auto& p = merge_waiters[nextmap->get_epoch()][target];
8623 p[src->pg_id] = src;
8624 dout(10) << __func__ << " added merge_waiter " << src->pg_id
8625 << " for " << target << ", have " << p.size() << "/" << need
8626 << dendl;
8627 return p.size() == need;
8628 }
8629
// Advance a PG's cached osdmap, one epoch at a time, up to osd_epoch.
//
// Along the way this detects pool pg_num changes and handles them:
//  - if the PG becomes a merge *source* it is torn down, parked in
//    merge_waiters for its target, and false is returned (the pg has
//    been unlocked);
//  - if it is a merge *target* it absorbs its sources once they have
//    all arrived; otherwise it kicks them with null events and returns
//    false (again unlocked);
//  - if it is split, the children are created via split_pgs() and
//    registered through C_FinishSplits once the transaction applies.
//
// Must be called with pg locked. Returns true if the PG advanced all
// the way to osd_epoch (pg remains locked); returns false if the PG
// was consumed by a merge or must wait (pg has been unlocked).
bool OSD::advance_pg(
  epoch_t osd_epoch,
  PG *pg,
  ThreadPool::TPHandle &handle,
  PeeringCtx &rctx)
{
  // nothing to do if the PG is already at (or past) the target epoch
  if (osd_epoch <= pg->get_osdmap_epoch()) {
    return true;
  }
  ceph_assert(pg->is_locked());
  OSDMapRef lastmap = pg->get_osdmap();
  set<PGRef> new_pgs; // any split children
  bool ret = true;

  unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
    lastmap->get_pg_num(pg->pg_id.pool()) : 0;
  for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
       next_epoch <= osd_epoch;
       ++next_epoch) {
    OSDMapRef nextmap = service.try_get_map(next_epoch);
    if (!nextmap) {
      // map already trimmed from the cache; skip ahead
      dout(20) << __func__ << " missing map " << next_epoch << dendl;
      continue;
    }

    unsigned new_pg_num =
      (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
      nextmap->get_pg_num(pg->pg_id.pool()) : 0;
    if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
      // check for merge
      if (nextmap->have_pg_pool(pg->pg_id.pool())) {
	spg_t parent;
	if (pg->pg_id.is_merge_source(
	      old_pg_num,
	      new_pg_num,
	      &parent)) {
	  // we are merge source
	  PGRef spg = pg; // carry a ref
	  dout(1) << __func__ << " " << pg->pg_id
		  << " is merge source, target is " << parent
		  << dendl;
	  pg->write_if_dirty(rctx);
	  // flush any pending split children before tearing down
	  if (!new_pgs.empty()) {
	    rctx.transaction.register_on_applied(new C_FinishSplits(this,
								    new_pgs));
	    new_pgs.clear();
	  }
	  dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
	  pg->ch->flush();
	  // release backoffs explicitly, since the on_shutdown path
	  // aggressively tears down backoff state.
	  if (pg->is_primary()) {
	    pg->release_pg_backoffs();
	  }
	  pg->on_shutdown();
	  OSDShard *sdata = pg->osd_shard;
	  {
	    std::lock_guard l(sdata->shard_lock);
	    if (pg->pg_slot) {
	      sdata->_detach_pg(pg->pg_slot);
	      // update pg count now since we might not get an osdmap
	      // any time soon.
	      if (pg->is_primary())
		logger->dec(l_osd_pg_primary);
	      else if (pg->is_nonprimary())
		logger->dec(l_osd_pg_replica); // misnomer
	      else
		logger->dec(l_osd_pg_stray);
	    }
	  }
	  pg->unlock();

	  // hand ourselves to the merge target; if we are the last
	  // source to arrive, wake the target with a null event
	  set<spg_t> children;
	  parent.is_split(new_pg_num, old_pg_num, &children);
	  if (add_merge_waiter(nextmap, parent, pg, children.size())) {
	    enqueue_peering_evt(
	      parent,
	      PGPeeringEventRef(
		std::make_shared<PGPeeringEvent>(
		  nextmap->get_epoch(),
		  nextmap->get_epoch(),
		  NullEvt())));
	  }
	  ret = false;
	  goto out;
	} else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
	  // we are merge target
	  set<spg_t> children;
	  pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
	  dout(20) << __func__ << " " << pg->pg_id
		   << " is merge target, sources are " << children
		   << dendl;
	  map<spg_t,PGRef> sources;
	  {
	    // claim the sources only if they have all checked in
	    std::lock_guard l(merge_lock);
	    auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
	    unsigned need = children.size();
	    dout(20) << __func__ << " have " << s.size() << "/"
		     << need << dendl;
	    if (s.size() == need) {
	      sources.swap(s);
	      merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
	      if (merge_waiters[nextmap->get_epoch()].empty()) {
		merge_waiters.erase(nextmap->get_epoch());
	      }
	    }
	  }
	  if (!sources.empty()) {
	    // all sources present: absorb them now
	    unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
	    unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
	    dout(1) << __func__ << " merging " << pg->pg_id << dendl;
	    pg->merge_from(
	      sources, rctx, split_bits,
	      nextmap->get_pg_pool(
		pg->pg_id.pool())->last_pg_merge_meta);
	    pg->pg_slot->waiting_for_merge_epoch = 0;
	  } else {
	    dout(20) << __func__ << " not ready to merge yet" << dendl;
	    pg->write_if_dirty(rctx);
	    if (!new_pgs.empty()) {
	      rctx.transaction.register_on_applied(new C_FinishSplits(this,
								      new_pgs));
	      new_pgs.clear();
	    }
	    dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
	    pg->unlock();
	    // kick source(s) to get them ready
	    for (auto& i : children) {
	      dout(20) << __func__ << " kicking source " << i << dendl;
	      enqueue_peering_evt(
		i,
		PGPeeringEventRef(
		  std::make_shared<PGPeeringEvent>(
		    nextmap->get_epoch(),
		    nextmap->get_epoch(),
		    NullEvt())));
	    }
	    ret = false;
	    goto out;
	  }
	}
      }
    }

    // normal per-epoch advance: recompute mappings and feed the map to
    // the PG's peering state machine
    vector<int> newup, newacting;
    int up_primary, acting_primary;
    nextmap->pg_to_up_acting_osds(
      pg->pg_id.pgid,
      &newup, &up_primary,
      &newacting, &acting_primary);
    pg->handle_advance_map(
      nextmap, lastmap, newup, up_primary,
      newacting, acting_primary, rctx);

    auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
    auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
    if (oldpool != lastmap->get_pools().end()
	&& newpool != nextmap->get_pools().end()) {
      dout(20) << __func__
	       << " new pool opts " << newpool->second.opts
	       << " old pool opts " << oldpool->second.opts
	       << dendl;

      double old_min_interval = 0, new_min_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);

      double old_max_interval = 0, new_max_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);

      // Assume if an interval is changed from set to unset or vice versa the
      // actual config is different. Keep it simple even if it is possible to
      // call resched_all_scrub() unnecessarily.
      if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
	pg->on_info_history_change();
      }
    }

    if (new_pg_num && old_pg_num != new_pg_num) {
      // check for split
      set<spg_t> children;
      if (pg->pg_id.is_split(
	    old_pg_num,
	    new_pg_num,
	    &children)) {
	split_pgs(
	  pg, children, &new_pgs, lastmap, nextmap,
	  rctx);
      }
    }

    lastmap = nextmap;
    old_pg_num = new_pg_num;
    handle.reset_tp_timeout();
  }
  pg->handle_activate_map(rctx);

  ret = true;
 out:
  // register any split children we created along the way
  if (!new_pgs.empty()) {
    rctx.transaction.register_on_applied(new C_FinishSplits(this, new_pgs));
  }
  return ret;
}
8835
// Publish the current (already committed) osdmap to the rest of the
// OSD: prime pending splits and merges on the shards, push the new map
// down to every PG via null peering events, drop stale pending creates,
// and refresh the PG state perfcounters. Called with osd_lock held.
void OSD::consume_map()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  auto osdmap = get_osdmap();
  dout(20) << __func__ << " version " << osdmap->get_epoch() << dendl;

  /** make sure the cluster is speaking in SORTBITWISE, because we don't
   * speak the older sorting version any more. Be careful not to force
   * a shutdown if we are merely processing old maps, though.
   */
  if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
    derr << __func__ << " SORTBITWISE flag is not set" << dendl;
    ceph_abort();
  }
  // pre_publish before await/publish: ordering matters so that map
  // reservations taken against the previous map are drained first
  service.pre_publish_map(osdmap);
  service.await_reserved_maps();
  service.publish_map(osdmap);
  dout(20) << "consume_map " << osdmap->get_epoch() << " -- publish done" << dendl;
  // prime splits and merges
  set<pair<spg_t,epoch_t>> newly_split;  // splits, and when
  set<pair<spg_t,epoch_t>> merge_pgs;    // merge participants, and when
  for (auto& shard : shards) {
    shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
  }
  if (!newly_split.empty()) {
    for (auto& shard : shards) {
      shard->prime_splits(osdmap, &newly_split);
    }
    // each shard consumes the entries it owns; all must be claimed
    ceph_assert(newly_split.empty());
  }

  // prune sent_ready_to_merge
  service.prune_sent_ready_to_merge(osdmap);

  // FIXME, maybe: We could race against an incoming peering message
  // that instantiates a merge PG after identify_merges() below and
  // never set up its peer to complete the merge. An OSD restart
  // would clear it up. This is a hard race to resolve,
  // extraordinarily rare (we only merge PGs that are stable and
  // clean, so it'd have to be an imported PG to an OSD with a
  // slightly stale OSDMap...), so I'm ignoring it for now. We plan to
  // replace all of this with a seastar-based code soon anyway.
  if (!merge_pgs.empty()) {
    // mark the pgs we already have, or create new and empty merge
    // participants for those we are missing. do this all under the
    // shard lock so we don't have to worry about racing pg creates
    // via _process.
    for (auto& shard : shards) {
      shard->prime_merges(osdmap, &merge_pgs);
    }
    ceph_assert(merge_pgs.empty());
  }

  service.prune_pg_created();

  unsigned pushes_to_free = 0;
  for (auto& shard : shards) {
    shard->consume_map(osdmap, &pushes_to_free);
  }

  vector<spg_t> pgids;
  _get_pgids(&pgids);

  // count (FIXME, probably during seastar rewrite)
  int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
  vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    // FIXME (probably during seastar rewrite): this is lockless and
    // racy, but we don't want to take pg lock here.
    if (pg->is_primary())
      num_pg_primary++;
    else if (pg->is_nonprimary())
      num_pg_replica++;  // misnomer
    else
      num_pg_stray++;
  }

  {
    // FIXME (as part of seastar rewrite): move to OSDShard
    // drop pending creates for PGs that no longer map to us
    std::lock_guard l(pending_creates_lock);
    for (auto pg = pending_creates_from_osd.begin();
	 pg != pending_creates_from_osd.end();) {
      if (osdmap->get_pg_acting_role(pg->first, whoami) < 0) {
	dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
		 << "discarding pending_create_from_osd" << dendl;
	pg = pending_creates_from_osd.erase(pg);
      } else {
	++pg;
      }
    }
  }

  service.maybe_inject_dispatch_delay();

  dispatch_sessions_waiting_on_map();

  service.maybe_inject_dispatch_delay();

  service.release_reserved_pushes(pushes_to_free);

  // queue null events to push maps down to individual PGs
  for (auto pgid : pgids) {
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  osdmap->get_epoch(),
	  osdmap->get_epoch(),
	  NullEvt())));
  }
  logger->set(l_osd_pg, pgids.size());
  logger->set(l_osd_pg_primary, num_pg_primary);
  logger->set(l_osd_pg_replica, num_pg_replica);
  logger->set(l_osd_pg_stray, num_pg_stray);
}
8952
8953 void OSD::activate_map()
8954 {
8955 ceph_assert(ceph_mutex_is_locked(osd_lock));
8956 auto osdmap = get_osdmap();
8957
8958 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8959
8960 // norecover?
8961 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8962 if (!service.recovery_is_paused()) {
8963 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8964 service.pause_recovery();
8965 }
8966 } else {
8967 if (service.recovery_is_paused()) {
8968 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8969 service.unpause_recovery();
8970 }
8971 }
8972
8973 service.activate_map();
8974 }
8975
8976 bool OSD::require_mon_peer(const Message *m)
8977 {
8978 if (!m->get_connection()->peer_is_mon()) {
8979 dout(0) << "require_mon_peer received from non-mon "
8980 << m->get_connection()->get_peer_addr()
8981 << " " << *m << dendl;
8982 return false;
8983 }
8984 return true;
8985 }
8986
8987 bool OSD::require_mon_or_mgr_peer(const Message *m)
8988 {
8989 if (!m->get_connection()->peer_is_mon() &&
8990 !m->get_connection()->peer_is_mgr()) {
8991 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8992 << m->get_connection()->get_peer_addr()
8993 << " " << *m << dendl;
8994 return false;
8995 }
8996 return true;
8997 }
8998
8999 bool OSD::require_osd_peer(const Message *m)
9000 {
9001 if (!m->get_connection()->peer_is_osd()) {
9002 dout(0) << "require_osd_peer received from non-osd "
9003 << m->get_connection()->get_peer_addr()
9004 << " " << *m << dendl;
9005 return false;
9006 }
9007 return true;
9008 }
9009
9010 // ----------------------------------------
9011 // pg creation
9012
// Split 'parent' into the child PGs dictated by the pg_num increase in
// 'nextmap'.  Each child is created, locked, added to *out_pgs, and has its
// on-disk collection split off the parent's inside rctx.transaction.
// Callers hold the parent locked; children are unlocked before return.
void OSD::split_pgs(
  PG *parent,
  const set<spg_t> &childpgids, set<PGRef> *out_pgs,
  OSDMapRef curmap,
  OSDMapRef nextmap,
  PeeringCtx &rctx)
{
  unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
  parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));

  // start_split_stats() apportions the parent's object stats across the
  // children; the final entry (consumed after the loop) is what remains
  // with the parent itself.
  vector<object_stat_sum_t> updated_stats;
  parent->start_split_stats(childpgids, &updated_stats);

  vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
  for (set<spg_t>::const_iterator i = childpgids.begin();
       i != childpgids.end();
       ++i, ++stat_iter) {
    ceph_assert(stat_iter != updated_stats.end());
    dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
    PG* child = _make_pg(nextmap, *i);
    child->lock(true);
    out_pgs->insert(child);
    child->ch = store->create_new_collection(child->coll);

    {
      // Route the child's commit completions to the shard that owns it.
      uint32_t shard_index = i->hash_to_shard(shards.size());
      assert(NULL != shards[shard_index]);
      store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
    }

    unsigned split_bits = i->get_split_bits(pg_num);
    dout(10) << " pg_num is " << pg_num
	     << ", m_seed " << i->ps()
	     << ", split_bits is " << split_bits << dendl;
    // Move the child's objects out of the parent's on-disk collection ...
    parent->split_colls(
      *i,
      split_bits,
      i->ps(),
      &child->get_pgpool().info,
      rctx.transaction);
    // ... and split the in-memory PG state to match.
    parent->split_into(
      i->pgid,
      child,
      split_bits);

    child->init_collection_pool_opts();

    child->finish_split_stats(*stat_iter, rctx.transaction);
    child->unlock();
  }
  // The last stats entry belongs to the (now smaller) parent.
  ceph_assert(stat_iter != updated_stats.end());
  parent->finish_split_stats(*stat_iter, rctx.transaction);
}
9066
9067 // ----------------------------------------
9068 // peering and recovery
9069
// Deliver the side effects accumulated in a PeeringCtx: send the queued
// peering messages to peer OSDs and queue the accumulated transaction on
// the PG's sequencer.  Messages are only sent when this OSD is up and
// active; the transaction is queued regardless (if non-empty and pg given).
void OSD::dispatch_context(PeeringCtx &ctx, PG *pg, OSDMapRef curmap,
                           ThreadPool::TPHandle *handle)
{
  if (!service.get_osdmap()->is_up(whoami)) {
    // We are marked down; peers would reject or ignore our messages.
    dout(20) << __func__ << " not up in osdmap" << dendl;
  } else if (!is_active()) {
    dout(20) << __func__ << " not active" << dendl;
  } else {
    for (auto& [osd, ls] : ctx.message_map) {
      if (!curmap->is_up(osd)) {
	dout(20) << __func__ << " skipping down osd." << osd << dendl;
	continue;
      }
      ConnectionRef con = service.get_con_osd_cluster(
	osd, curmap->get_epoch());
      if (!con) {
	dout(20) << __func__ << " skipping osd." << osd << " (NULL con)"
		 << dendl;
	continue;
      }
      // Make sure the peer has a map at least as new as ours before the
      // messages below arrive.
      service.maybe_share_map(con.get(), curmap);
      for (auto m : ls) {
	con->send_message2(m);
      }
      // Clear the per-peer list so messages are not re-sent if the ctx is
      // dispatched again.
      ls.clear();
    }
  }
  if ((!ctx.transaction.empty() || ctx.transaction.has_contexts()) && pg) {
    int tr = store->queue_transaction(
      pg->ch,
      std::move(ctx.transaction), TrackedOpRef(),
      handle);
    ceph_assert(tr == 0);
  }
}
9105
// Fast-dispatch handler for monitor-initiated PG creations.  For each PG
// in the message we queue a null peering event carrying a PGCreateInfo,
// which instantiates the PG if it does not already exist.
void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
{
  dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  if (!require_mon_peer(m)) {
    m->put();
    return;
  }
  for (auto& p : m->pgs) {
    spg_t pgid = p.first;
    epoch_t created = p.second.first;
    utime_t created_stamp = p.second.second;
    auto q = m->pg_extra.find(pgid);
    if (q == m->pg_extra.end()) {
      // history/past_intervals are mandatory in post-octopus messages;
      // log and skip rather than create a PG with no metadata.
      clog->error() << __func__ << " " << pgid << " e" << created
		    << "@" << created_stamp << " with no history or past_intervals"
		    << ", this should be impossible after octopus. Ignoring.";
    } else {
      dout(20) << __func__ << " " << pgid << " e" << created
	       << "@" << created_stamp
	       << " history " << q->second.first
	       << " pi " << q->second.second << dendl;
      if (!q->second.second.empty() &&
	  m->epoch < q->second.second.get_bounds().second) {
	// The provided past_intervals extend beyond the message's epoch;
	// refuse rather than create a PG with inconsistent metadata.
	clog->error() << "got pg_create on " << pgid << " epoch " << m->epoch
		      << " and unmatched past_intervals " << q->second.second
		      << " (history " << q->second.first << ")";
      } else {
	enqueue_peering_evt(
	  pgid,
	  PGPeeringEventRef(
	    std::make_shared<PGPeeringEvent>(
	      m->epoch,
	      m->epoch,
	      NullEvt(),
	      true,
	      new PGCreateInfo(
		pgid,
		m->epoch,
		q->second.first,
		q->second.second,
		true)
	      )));
      }
    }
  }

  {
    // Record progress: all mon-driven creates up to m->epoch are handled,
    // unless osd-driven creates are still outstanding.
    std::lock_guard l(pending_creates_lock);
    if (pending_creates_from_mon == 0) {
      last_pg_create_epoch = m->epoch;
    }
  }

  m->put();
}
9161
9162 void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
9163 {
9164 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9165 if (!require_osd_peer(m)) {
9166 m->put();
9167 return;
9168 }
9169 int from = m->get_source().num();
9170 for (auto& p : m->get_pg_list()) {
9171 spg_t pgid(p.info.pgid.pgid, p.to);
9172 enqueue_peering_evt(
9173 pgid,
9174 PGPeeringEventRef(
9175 std::make_shared<PGPeeringEvent>(
9176 p.epoch_sent,
9177 p.query_epoch,
9178 MNotifyRec(
9179 pgid, pg_shard_t(from, p.from),
9180 p,
9181 m->get_connection()->get_features()),
9182 true,
9183 new PGCreateInfo(
9184 pgid,
9185 p.query_epoch,
9186 p.info.history,
9187 p.past_intervals,
9188 false)
9189 )));
9190 }
9191 m->put();
9192 }
9193
9194 void OSD::handle_fast_pg_info(MOSDPGInfo* m)
9195 {
9196 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9197 if (!require_osd_peer(m)) {
9198 m->put();
9199 return;
9200 }
9201 int from = m->get_source().num();
9202 for (auto& p : m->pg_list) {
9203 enqueue_peering_evt(
9204 spg_t(p.info.pgid.pgid, p.to),
9205 PGPeeringEventRef(
9206 std::make_shared<PGPeeringEvent>(
9207 p.epoch_sent, p.query_epoch,
9208 MInfoRec(
9209 pg_shard_t(from, p.from),
9210 p.info,
9211 p.epoch_sent)))
9212 );
9213 }
9214 m->put();
9215 }
9216
9217 void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
9218 {
9219 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9220 if (!require_osd_peer(m)) {
9221 m->put();
9222 return;
9223 }
9224 for (auto& pgid : m->pg_list) {
9225 enqueue_peering_evt(
9226 pgid,
9227 PGPeeringEventRef(
9228 std::make_shared<PGPeeringEvent>(
9229 m->get_epoch(), m->get_epoch(),
9230 PeeringState::DeleteStart())));
9231 }
9232 m->put();
9233 }
9234
9235 void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
9236 {
9237 dout(10) << __func__ << " " << *m << dendl;
9238 if (!require_mon_or_mgr_peer(m)) {
9239 m->put();
9240 return;
9241 }
9242 epoch_t epoch = get_osdmap_epoch();
9243 for (auto pgid : m->forced_pgs) {
9244 if (m->options & OFR_BACKFILL) {
9245 if (m->options & OFR_CANCEL) {
9246 enqueue_peering_evt(
9247 pgid,
9248 PGPeeringEventRef(
9249 std::make_shared<PGPeeringEvent>(
9250 epoch, epoch,
9251 PeeringState::UnsetForceBackfill())));
9252 } else {
9253 enqueue_peering_evt(
9254 pgid,
9255 PGPeeringEventRef(
9256 std::make_shared<PGPeeringEvent>(
9257 epoch, epoch,
9258 PeeringState::SetForceBackfill())));
9259 }
9260 } else if (m->options & OFR_RECOVERY) {
9261 if (m->options & OFR_CANCEL) {
9262 enqueue_peering_evt(
9263 pgid,
9264 PGPeeringEventRef(
9265 std::make_shared<PGPeeringEvent>(
9266 epoch, epoch,
9267 PeeringState::UnsetForceRecovery())));
9268 } else {
9269 enqueue_peering_evt(
9270 pgid,
9271 PGPeeringEventRef(
9272 std::make_shared<PGPeeringEvent>(
9273 epoch, epoch,
9274 PeeringState::SetForceRecovery())));
9275 }
9276 }
9277 }
9278 m->put();
9279 }
9280
// Answer a peer's query about a PG we do not have instantiated: reply with
// an empty info (or empty log for LOG/FULLLOG queries) so the querying OSD
// can make progress with its peering.
void OSD::handle_pg_query_nopg(const MQuery& q)
{
  spg_t pgid = q.pgid;
  dout(10) << __func__ << " " << pgid << dendl;

  OSDMapRef osdmap = get_osdmap();
  if (!osdmap->have_pg_pool(pgid.pool()))
    return;  // pool is gone; nothing sensible to reply

  dout(10) << " pg " << pgid << " dne" << dendl;
  pg_info_t empty(spg_t(pgid.pgid, q.query.to));
  ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
  if (con) {
    Message *m;
    if (q.query.type == pg_query_t::LOG ||
	q.query.type == pg_query_t::FULLLOG) {
      // Log queries are answered with an (empty) MOSDPGLog.
      m = new MOSDPGLog(
	q.query.from, q.query.to,
	osdmap->get_epoch(), empty,
	q.query.epoch_sent);
    } else {
      // All other query types are answered with an (empty) notify.
      pg_notify_t notify{q.query.from, q.query.to,
			 q.query.epoch_sent,
			 osdmap->get_epoch(),
			 empty,
			 PastIntervals()};
      m = new MOSDPGNotify2(spg_t{pgid.pgid, q.query.from},
			    std::move(notify));
    }
    // Ensure the peer's map is fresh enough to interpret the reply.
    service.maybe_share_map(con.get(), osdmap);
    con->send_message(m);
  }
}
9314
9315 void OSDService::queue_check_readable(spg_t spgid,
9316 epoch_t lpr,
9317 ceph::signedspan delay)
9318 {
9319 if (delay == ceph::signedspan::zero()) {
9320 osd->enqueue_peering_evt(
9321 spgid,
9322 PGPeeringEventRef(
9323 std::make_shared<PGPeeringEvent>(
9324 lpr, lpr,
9325 PeeringState::CheckReadable())));
9326 } else {
9327 mono_timer.add_event(
9328 delay,
9329 [this, spgid, lpr]() {
9330 queue_check_readable(spgid, lpr);
9331 });
9332 }
9333 }
9334
9335
9336 // =========================================================
9337 // RECOVERY
9338
9339 void OSDService::_maybe_queue_recovery() {
9340 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
9341 uint64_t available_pushes;
9342 while (!awaiting_throttle.empty() &&
9343 _recover_now(&available_pushes)) {
9344 uint64_t to_start = std::min(
9345 available_pushes,
9346 cct->_conf->osd_recovery_max_single_start);
9347 _queue_for_recovery(awaiting_throttle.front(), to_start);
9348 awaiting_throttle.pop_front();
9349 dout(10) << __func__ << " starting " << to_start
9350 << ", recovery_ops_reserved " << recovery_ops_reserved
9351 << " -> " << (recovery_ops_reserved + to_start) << dendl;
9352 recovery_ops_reserved += to_start;
9353 }
9354 }
9355
9356 bool OSDService::_recover_now(uint64_t *available_pushes)
9357 {
9358 if (available_pushes)
9359 *available_pushes = 0;
9360
9361 if (ceph_clock_now() < defer_recovery_until) {
9362 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9363 return false;
9364 }
9365
9366 if (recovery_paused) {
9367 dout(15) << __func__ << " paused" << dendl;
9368 return false;
9369 }
9370
9371 uint64_t max = osd->get_recovery_max_active();
9372 if (max <= recovery_ops_active + recovery_ops_reserved) {
9373 dout(15) << __func__ << " active " << recovery_ops_active
9374 << " + reserved " << recovery_ops_reserved
9375 << " >= max " << max << dendl;
9376 return false;
9377 }
9378
9379 if (available_pushes)
9380 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9381
9382 return true;
9383 }
9384
9385 unsigned OSDService::get_target_pg_log_entries() const
9386 {
9387 auto num_pgs = osd->get_num_pgs();
9388 auto target = cct->_conf->osd_target_pg_log_entries_per_osd;
9389 if (num_pgs > 0 && target > 0) {
9390 // target an even spread of our budgeted log entries across all
9391 // PGs. note that while we only get to control the entry count
9392 // for primary PGs, we'll normally be responsible for a mix of
9393 // primary and replica PGs (for the same pool(s) even), so this
9394 // will work out.
9395 return std::max<unsigned>(
9396 std::min<unsigned>(target / num_pgs,
9397 cct->_conf->osd_max_pg_log_entries),
9398 cct->_conf->osd_min_pg_log_entries);
9399 } else {
9400 // fall back to a per-pg value.
9401 return cct->_conf->osd_min_pg_log_entries;
9402 }
9403 }
9404
// Run up to 'reserved_pushes' recovery operations on 'pg' (queued at epoch
// 'queued').  May instead re-schedule itself after osd_recovery_sleep.
// Reserved pushes are always returned to the service before returning
// (except on the sleep path, which re-queues the same reservation).
void OSD::do_recovery(
  PG *pg, epoch_t queued, uint64_t reserved_pushes, int priority,
  ThreadPool::TPHandle &handle)
{
  uint64_t started = 0;

  /*
   * When the value of osd_recovery_sleep is set greater than zero, recovery
   * ops are scheduled after osd_recovery_sleep amount of time from the previous
   * recovery event's schedule time. This is done by adding a
   * recovery_requeue_callback event, which re-queues the recovery op using
   * queue_recovery_after_sleep.
   */
  float recovery_sleep = get_osd_recovery_sleep();
  {
    std::lock_guard l(service.sleep_lock);
    if (recovery_sleep > 0 && service.recovery_needs_sleep) {
      PGRef pgref(pg);
      // Callback fired by sleep_timer; re-queues this same work item.
      auto recovery_requeue_callback = new LambdaContext(
	[this, pgref, queued, reserved_pushes, priority](int r) {
	  dout(20) << "do_recovery wake up at "
		   << ceph_clock_now()
		   << ", re-queuing recovery" << dendl;
	  std::lock_guard l(service.sleep_lock);
	  service.recovery_needs_sleep = false;
	  service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes, priority);
	});

      // This is true for the first recovery op and when the previous recovery op
      // has been scheduled in the past. The next recovery op is scheduled after
      // completing the sleep from now.

      if (auto now = ceph::real_clock::now();
	  service.recovery_schedule_time < now) {
	service.recovery_schedule_time = now;
      }
      service.recovery_schedule_time += ceph::make_timespan(recovery_sleep);
      service.sleep_timer.add_event_at(service.recovery_schedule_time,
				       recovery_requeue_callback);
      dout(20) << "Recovery event scheduled at "
	       << service.recovery_schedule_time << dendl;
      return;
    }
  }

  {
    {
      // The next do_recovery pass must sleep again (unless the timer
      // callback above clears the flag first).
      std::lock_guard l(service.sleep_lock);
      service.recovery_needs_sleep = true;
    }

    if (pg->pg_has_reset_since(queued)) {
      // The PG went through an interval change after this work was queued;
      // the event is stale.  Still release our reservations below.
      goto out;
    }

    dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
#ifdef DEBUG_RECOVERY_OIDS
    dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
#endif

    bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
    dout(10) << "do_recovery started " << started << "/" << reserved_pushes
	     << " on " << *pg << dendl;

    if (do_unfound) {
      // Recovery is blocked on unfound objects; kick the peering machinery
      // to search peers for them.
      PeeringCtx rctx;
      rctx.handle = &handle;
      pg->find_unfound(queued, rctx);
      dispatch_context(rctx, pg, pg->get_osdmap());
    }
  }

 out:
  // Return the push reservations whether or not any pushes started.
  ceph_assert(started <= reserved_pushes);
  service.release_reserved_pushes(reserved_pushes);
}
9481
9482 void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
9483 {
9484 std::lock_guard l(recovery_lock);
9485 dout(10) << "start_recovery_op " << *pg << " " << soid
9486 << " (" << recovery_ops_active << "/"
9487 << osd->get_recovery_max_active() << " rops)"
9488 << dendl;
9489 recovery_ops_active++;
9490
9491 #ifdef DEBUG_RECOVERY_OIDS
9492 dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
9493 ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
9494 recovery_oids[pg->pg_id].insert(soid);
9495 #endif
9496 }
9497
9498 void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
9499 {
9500 std::lock_guard l(recovery_lock);
9501 dout(10) << "finish_recovery_op " << *pg << " " << soid
9502 << " dequeue=" << dequeue
9503 << " (" << recovery_ops_active << "/"
9504 << osd->get_recovery_max_active() << " rops)"
9505 << dendl;
9506
9507 // adjust count
9508 ceph_assert(recovery_ops_active > 0);
9509 recovery_ops_active--;
9510
9511 #ifdef DEBUG_RECOVERY_OIDS
9512 dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
9513 ceph_assert(recovery_oids[pg->pg_id].count(soid));
9514 recovery_oids[pg->pg_id].erase(soid);
9515 #endif
9516
9517 _maybe_queue_recovery();
9518 }
9519
9520 bool OSDService::is_recovery_active()
9521 {
9522 if (cct->_conf->osd_debug_pretend_recovery_active) {
9523 return true;
9524 }
9525 return local_reserver.has_reservation() || remote_reserver.has_reservation();
9526 }
9527
9528 void OSDService::release_reserved_pushes(uint64_t pushes)
9529 {
9530 std::lock_guard l(recovery_lock);
9531 dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
9532 << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
9533 << dendl;
9534 ceph_assert(recovery_ops_reserved >= pushes);
9535 recovery_ops_reserved -= pushes;
9536 _maybe_queue_recovery();
9537 }
9538
9539 // =========================================================
9540 // OPS
9541
9542 bool OSD::op_is_discardable(const MOSDOp *op)
9543 {
9544 // drop client request if they are not connected and can't get the
9545 // reply anyway.
9546 if (!op->get_connection()->is_connected()) {
9547 return true;
9548 }
9549 return false;
9550 }
9551
// Queue an incoming op on the sharded op work queue, tagged with the PG it
// targets and the epoch it was mapped in.  Recovery-related messages are
// wrapped in PGRecoveryMsg so the scheduler can treat them separately from
// client ops (PGOpItem).
void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
{
  const utime_t stamp = op->get_req()->get_recv_stamp();
  // Time from message receive to enqueue, fed to perf counters below.
  const utime_t latency = ceph_clock_now() - stamp;
  const unsigned priority = op->get_req()->get_priority();
  const int cost = op->get_req()->get_cost();
  const uint64_t owner = op->get_req()->get_source().num();
  const int type = op->get_req()->get_type();

  // NOTE(review): the request is streamed twice in this line (start and
  // end); likely redundant but harmless debug output.
  dout(15) << "enqueue_op " << *op->get_req() << " prio " << priority
	   << " type " << type
	   << " cost " << cost
	   << " latency " << latency
	   << " epoch " << epoch
	   << " " << *(op->get_req()) << dendl;
  op->osd_trace.event("enqueue op");
  op->osd_trace.keyval("priority", priority);
  op->osd_trace.keyval("cost", cost);

  // Jaeger/OpenTelemetry span for the enqueue step.
  auto enqueue_span = tracing::osd::tracer.add_span(__func__, op->osd_parent_span);
  enqueue_span->AddEvent(__func__, {
    {"priority", priority},
    {"cost", cost},
    {"epoch", epoch},
    {"owner", owner},
    {"type", type}
  });

  op->mark_queued_for_pg();
  logger->tinc(l_osd_op_before_queue_op_lat, latency);
  if (PGRecoveryMsg::is_recovery_msg(op)) {
    op_shardedwq.queue(
      OpSchedulerItem(
	unique_ptr<OpSchedulerItem::OpQueueable>(new PGRecoveryMsg(pg, std::move(op))),
	cost, priority, stamp, owner, epoch));
  } else {
    op_shardedwq.queue(
      OpSchedulerItem(
	unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
	cost, priority, stamp, owner, epoch));
  }
}
9594
9595 void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
9596 {
9597 dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
9598 op_shardedwq.queue(
9599 OpSchedulerItem(
9600 unique_ptr<OpSchedulerItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
9601 10,
9602 cct->_conf->osd_peering_op_priority,
9603 utime_t(),
9604 0,
9605 evt->get_epoch_sent()));
9606 }
9607
/*
 * NOTE: dequeue called in worker thread, with pg lock
 */
// Deliver a dequeued op to its PG: record timing, opportunistically share a
// newer osdmap with the sender, then run the request via PG::do_request.
void OSD::dequeue_op(
  PGRef pg, OpRequestRef op,
  ThreadPool::TPHandle &handle)
{
  const Message *m = op->get_req();

  FUNCTRACE(cct);
  OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_BEGIN", false);

  utime_t now = ceph_clock_now();
  op->set_dequeued_time(now);

  // Time spent between arrival and dequeue, for the perf counter below.
  utime_t latency = now - m->get_recv_stamp();
  dout(10) << "dequeue_op " << *op->get_req()
	   << " prio " << m->get_priority()
	   << " cost " << m->get_cost()
	   << " latency " << latency
	   << " " << *m
	   << " pg " << *pg << dendl;

  logger->tinc(l_osd_op_before_dequeue_op_lat, latency);

  // Push a newer osdmap to the sender if it appears to be behind.
  service.maybe_share_map(m->get_connection().get(),
			  pg->get_osdmap(),
			  op->sent_epoch);

  if (pg->is_deleting())
    return;  // PG is being removed; drop the op

  op->mark_reached_pg();
  op->osd_trace.event("dequeue_op");

  pg->do_request(op, handle);

  // finish
  dout(10) << "dequeue_op " << *op->get_req() << " finish" << dendl;
  OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_END", false);
}
9649
9650
// Process one peering event for a PG (or a pg-less query).  Runs in a shard
// worker thread; when 'pg' is non-null it arrives locked and is unlocked
// here before any follow-up work (up_thru request, pg_temp send).
void OSD::dequeue_peering_evt(
  OSDShard *sdata,
  PG *pg,
  PGPeeringEventRef evt,
  ThreadPool::TPHandle& handle)
{
  auto curmap = sdata->get_osdmap();
  bool need_up_thru = false;
  epoch_t same_interval_since = 0;
  if (!pg) {
    // Only MQuery events are meaningful without an instantiated PG.
    if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
      handle_pg_query_nopg(*q);
    } else {
      derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
      ceph_abort();
    }
  } else if (PeeringCtx rctx;
	     advance_pg(curmap->get_epoch(), pg, handle, rctx)) {
    pg->do_peering_event(evt, rctx);
    if (pg->is_deleted()) {
      pg->unlock();
      return;
    }
    dispatch_context(rctx, pg, curmap, &handle);
    // Snapshot these while still holding the pg lock; they are consumed
    // after the unlock below.
    need_up_thru = pg->get_need_up_thru();
    same_interval_since = pg->get_same_interval_since();
    pg->unlock();
  }

  if (need_up_thru) {
    queue_want_up_thru(same_interval_since);
  }

  // Flush any pg_temp requests the event may have generated.
  service.send_pg_temp();
}
9686
9687 void OSD::dequeue_delete(
9688 OSDShard *sdata,
9689 PG *pg,
9690 epoch_t e,
9691 ThreadPool::TPHandle& handle)
9692 {
9693 dequeue_peering_evt(
9694 sdata,
9695 pg,
9696 PGPeeringEventRef(
9697 std::make_shared<PGPeeringEvent>(
9698 e, e,
9699 PeeringState::DeleteSome())),
9700 handle);
9701 }
9702
9703
9704
9705 // --------------------------------
9706
// Config observer interface: the list of config keys whose changes this
// daemon wants delivered to handle_conf_change().  NULL-terminated.
const char** OSD::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    // recovery / backfill throttling
    "osd_max_backfills",
    "osd_min_recovery_priority",
    "osd_max_trimming_pgs",
    // op tracker
    "osd_op_complaint_time",
    "osd_op_log_threshold",
    "osd_op_history_size",
    "osd_op_history_duration",
    "osd_op_history_slow_op_size",
    "osd_op_history_slow_op_threshold",
    "osd_enable_op_tracker",
    // osdmap caching
    "osd_map_cache_size",
    "osd_pg_epoch_max_lag_factor",
    "osd_pg_epoch_persisted_max_stale",
    // sleep knobs (overridable by mclock QoS)
    "osd_recovery_sleep",
    "osd_recovery_sleep_hdd",
    "osd_recovery_sleep_ssd",
    "osd_recovery_sleep_hybrid",
    "osd_delete_sleep",
    "osd_delete_sleep_hdd",
    "osd_delete_sleep_ssd",
    "osd_delete_sleep_hybrid",
    "osd_snap_trim_sleep",
    "osd_snap_trim_sleep_hdd",
    "osd_snap_trim_sleep_ssd",
    "osd_snap_trim_sleep_hybrid",
    "osd_scrub_sleep",
    "osd_recovery_max_active",
    "osd_recovery_max_active_hdd",
    "osd_recovery_max_active_ssd",
    // clog & admin clog
    "clog_to_monitors",
    "clog_to_syslog",
    "clog_to_syslog_facility",
    "clog_to_syslog_level",
    "osd_objectstore_fuse",
    "clog_to_graylog",
    "clog_to_graylog_host",
    "clog_to_graylog_port",
    "host",
    "fsid",
    // misc throttles and intervals
    "osd_recovery_delay_start",
    "osd_client_message_size_cap",
    "osd_client_message_cap",
    "osd_heartbeat_min_size",
    "osd_heartbeat_interval",
    "osd_object_clean_region_max_num_intervals",
    "osd_scrub_min_interval",
    "osd_scrub_max_interval",
    NULL
  };
  return KEYS;
}
9762
// Config observer callback: apply runtime changes for the keys declared in
// get_tracked_conf_keys().  Runs with osd_lock held for the duration.
void OSD::handle_conf_change(const ConfigProxy& conf,
                             const std::set <std::string> &changed)
{
  std::lock_guard l{osd_lock};

  if (changed.count("osd_max_backfills") ||
      changed.count("osd_recovery_max_active") ||
      changed.count("osd_recovery_max_active_hdd") ||
      changed.count("osd_recovery_max_active_ssd")) {
    if (!maybe_override_options_for_qos(&changed) &&
        changed.count("osd_max_backfills")) {
      // Scheduler is not "mclock". Fallback to earlier behavior
      service.local_reserver.set_max(cct->_conf->osd_max_backfills);
      service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
    }
  }
  if (changed.count("osd_delete_sleep") ||
      changed.count("osd_delete_sleep_hdd") ||
      changed.count("osd_delete_sleep_ssd") ||
      changed.count("osd_delete_sleep_hybrid") ||
      changed.count("osd_snap_trim_sleep") ||
      changed.count("osd_snap_trim_sleep_hdd") ||
      changed.count("osd_snap_trim_sleep_ssd") ||
      changed.count("osd_snap_trim_sleep_hybrid") ||
      changed.count("osd_scrub_sleep") ||
      changed.count("osd_recovery_sleep") ||
      changed.count("osd_recovery_sleep_hdd") ||
      changed.count("osd_recovery_sleep_ssd") ||
      changed.count("osd_recovery_sleep_hybrid")) {
    // Sleep knobs may be overridden when the mclock scheduler is active.
    maybe_override_sleep_options_for_qos();
  }
  if (changed.count("osd_pg_delete_cost")) {
    maybe_override_cost_for_qos();
  }
  if (changed.count("osd_min_recovery_priority")) {
    service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
    service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
  }
  if (changed.count("osd_max_trimming_pgs")) {
    service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
  }
  // Op-tracker tunables.
  if (changed.count("osd_op_complaint_time") ||
      changed.count("osd_op_log_threshold")) {
    op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
                                           cct->_conf->osd_op_log_threshold);
  }
  if (changed.count("osd_op_history_size") ||
      changed.count("osd_op_history_duration")) {
    op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
                                             cct->_conf->osd_op_history_duration);
  }
  if (changed.count("osd_op_history_slow_op_size") ||
      changed.count("osd_op_history_slow_op_threshold")) {
    op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
                                                      cct->_conf->osd_op_history_slow_op_threshold);
  }
  if (changed.count("osd_enable_op_tracker")) {
    op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
  }
  if (changed.count("osd_map_cache_size")) {
    // One size setting drives all three osdmap caches.
    service.map_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
  }
  if (changed.count("clog_to_monitors") ||
      changed.count("clog_to_syslog") ||
      changed.count("clog_to_syslog_level") ||
      changed.count("clog_to_syslog_facility") ||
      changed.count("clog_to_graylog") ||
      changed.count("clog_to_graylog_host") ||
      changed.count("clog_to_graylog_port") ||
      changed.count("host") ||
      changed.count("fsid")) {
    update_log_config();
  }
  if (changed.count("osd_pg_epoch_max_lag_factor")) {
    m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
      "osd_pg_epoch_max_lag_factor");
  }

#ifdef HAVE_LIBFUSE
  if (changed.count("osd_objectstore_fuse")) {
    if (store) {
      enable_disable_fuse(false);
    }
  }
#endif

  if (changed.count("osd_recovery_delay_start")) {
    service.defer_recovery(cct->_conf->osd_recovery_delay_start);
    service.kick_recovery_queue();
  }

  // Client message throttles: adjust the existing policy throttlers in
  // place; newly accepted connections pick up the new values too.
  if (changed.count("osd_client_message_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    if (pol.throttler_messages) {
      pol.throttler_messages->reset_max(newval);
    }
  }
  if (changed.count("osd_client_message_size_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_size_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    if (pol.throttler_bytes) {
      pol.throttler_bytes->reset_max(newval);
    }
  }
  if (changed.count("osd_object_clean_region_max_num_intervals")) {
    ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
  }

  if (changed.count("osd_scrub_min_interval") ||
      changed.count("osd_scrub_max_interval")) {
    resched_all_scrubs();
    dout(0) << __func__ << ": scrub interval change" << dendl;
  }
  check_config();
  if (changed.count("osd_asio_thread_count")) {
    // Restart the asio pool with the new thread count.
    service.poolctx.stop();
    service.poolctx.start(conf.get_val<std::uint64_t>("osd_asio_thread_count"));
  }
}
9885
9886 void OSD::maybe_override_max_osd_capacity_for_qos()
9887 {
9888 // If the scheduler enabled is mclock, override the default
9889 // osd capacity with the value obtained from running the
9890 // osd bench test. This is later used to setup mclock.
9891 if ((cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler") &&
9892 (cct->_conf.get_val<bool>("osd_mclock_skip_benchmark") == false) &&
9893 (!unsupported_objstore_for_qos())) {
9894 std::string max_capacity_iops_config;
9895 bool force_run_benchmark =
9896 cct->_conf.get_val<bool>("osd_mclock_force_run_benchmark_on_init");
9897
9898 if (store_is_rotational) {
9899 max_capacity_iops_config = "osd_mclock_max_capacity_iops_hdd";
9900 } else {
9901 max_capacity_iops_config = "osd_mclock_max_capacity_iops_ssd";
9902 }
9903
9904 double default_iops = 0.0;
9905 double cur_iops = 0.0;
9906 if (!force_run_benchmark) {
9907 // Get the current osd iops capacity
9908 cur_iops = cct->_conf.get_val<double>(max_capacity_iops_config);
9909
9910 // Get the default max iops capacity
9911 auto val = cct->_conf.get_val_default(max_capacity_iops_config);
9912 if (!val.has_value()) {
9913 derr << __func__ << " Unable to determine default value of "
9914 << max_capacity_iops_config << dendl;
9915 // Cannot determine default iops. Force a run of the OSD benchmark.
9916 force_run_benchmark = true;
9917 } else {
9918 // Default iops
9919 default_iops = std::stod(val.value());
9920 }
9921
9922 // Determine if we really need to run the osd benchmark
9923 if (!force_run_benchmark && (default_iops != cur_iops)) {
9924 dout(1) << __func__ << std::fixed << std::setprecision(2)
9925 << " default_iops: " << default_iops
9926 << " cur_iops: " << cur_iops
9927 << ". Skip OSD benchmark test." << dendl;
9928 return;
9929 }
9930 }
9931
9932 // Run osd bench: write 100 4MiB objects with blocksize 4KiB
9933 int64_t count = 12288000; // Count of bytes to write
9934 int64_t bsize = 4096; // Block size
9935 int64_t osize = 4194304; // Object size
9936 int64_t onum = 100; // Count of objects to write
9937 double elapsed = 0.0; // Time taken to complete the test
9938 double iops = 0.0;
9939 stringstream ss;
9940 int ret = run_osd_bench_test(count, bsize, osize, onum, &elapsed, ss);
9941 if (ret != 0) {
9942 derr << __func__
9943 << " osd bench err: " << ret
9944 << " osd bench errstr: " << ss.str()
9945 << dendl;
9946 return;
9947 }
9948
9949 double rate = count / elapsed;
9950 iops = rate / bsize;
9951 dout(1) << __func__
9952 << " osd bench result -"
9953 << std::fixed << std::setprecision(3)
9954 << " bandwidth (MiB/sec): " << rate / (1024 * 1024)
9955 << " iops: " << iops
9956 << " elapsed_sec: " << elapsed
9957 << dendl;
9958
9959 // Get the threshold IOPS set for the underlying hdd/ssd.
9960 double threshold_iops = 0.0;
9961 if (store_is_rotational) {
9962 threshold_iops = cct->_conf.get_val<double>(
9963 "osd_mclock_iops_capacity_threshold_hdd");
9964 } else {
9965 threshold_iops = cct->_conf.get_val<double>(
9966 "osd_mclock_iops_capacity_threshold_ssd");
9967 }
9968
9969 // Persist the iops value to the MON store or throw cluster warning
9970 // if the measured iops exceeds the set threshold. If the iops exceed
9971 // the threshold, the default value is used.
9972 if (iops > threshold_iops) {
9973 clog->warn() << "OSD bench result of " << std::to_string(iops)
9974 << " IOPS exceeded the threshold limit of "
9975 << std::to_string(threshold_iops) << " IOPS for osd."
9976 << std::to_string(whoami) << ". IOPS capacity is unchanged"
9977 << " at " << std::to_string(cur_iops) << " IOPS. The"
9978 << " recommendation is to establish the osd's IOPS capacity"
9979 << " using other benchmark tools (e.g. Fio) and then"
9980 << " override osd_mclock_max_capacity_iops_[hdd|ssd].";
9981 } else {
9982 mon_cmd_set_config(max_capacity_iops_config, std::to_string(iops));
9983 }
9984 }
9985 }
9986
9987 bool OSD::maybe_override_options_for_qos(const std::set<std::string> *changed)
9988 {
9989 // Override options only if the scheduler enabled is mclock and the
9990 // underlying objectstore is supported by mclock
9991 if (cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler" &&
9992 !unsupported_objstore_for_qos()) {
9993 static const std::map<std::string, uint64_t> recovery_qos_defaults {
9994 {"osd_recovery_max_active", 0},
9995 {"osd_recovery_max_active_hdd", 3},
9996 {"osd_recovery_max_active_ssd", 10},
9997 {"osd_max_backfills", 1},
9998 };
9999
10000 // Check if we were called because of a configuration change
10001 if (changed != nullptr) {
10002 if (cct->_conf.get_val<bool>("osd_mclock_override_recovery_settings")) {
10003 if (changed->count("osd_max_backfills")) {
10004 dout(1) << __func__ << " Set local and remote max backfills to "
10005 << cct->_conf->osd_max_backfills << dendl;
10006 service.local_reserver.set_max(cct->_conf->osd_max_backfills);
10007 service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
10008 }
10009 } else {
10010 // Recovery options change was attempted without setting
10011 // the 'osd_mclock_override_recovery_settings' option.
10012 // Find the key to remove from the configuration db.
10013 std::string key;
10014 if (changed->count("osd_max_backfills")) {
10015 key = "osd_max_backfills";
10016 } else if (changed->count("osd_recovery_max_active")) {
10017 key = "osd_recovery_max_active";
10018 } else if (changed->count("osd_recovery_max_active_hdd")) {
10019 key = "osd_recovery_max_active_hdd";
10020 } else if (changed->count("osd_recovery_max_active_ssd")) {
10021 key = "osd_recovery_max_active_ssd";
10022 } else {
10023 // No key that we are interested in. Return.
10024 return true;
10025 }
10026
10027 // Remove the current entry from the configuration if
10028 // different from its default value.
10029 auto val = recovery_qos_defaults.find(key);
10030 if (val != recovery_qos_defaults.end() &&
10031 cct->_conf.get_val<uint64_t>(key) != val->second) {
10032 static const std::vector<std::string> osds = {
10033 "osd",
10034 "osd." + std::to_string(whoami)
10035 };
10036
10037 for (auto osd : osds) {
10038 std::string cmd =
10039 "{"
10040 "\"prefix\": \"config rm\", "
10041 "\"who\": \"" + osd + "\", "
10042 "\"name\": \"" + key + "\""
10043 "}";
10044 vector<std::string> vcmd{cmd};
10045
10046 dout(1) << __func__ << " Removing Key: " << key
10047 << " for " << osd << " from Mon db" << dendl;
10048 monc->start_mon_command(vcmd, {}, nullptr, nullptr, nullptr);
10049 }
10050
10051 // Raise a cluster warning indicating that the changes did not
10052 // take effect and indicate the reason why.
10053 clog->warn() << "Change to " << key << " on osd."
10054 << std::to_string(whoami) << " did not take effect."
10055 << " Enable osd_mclock_override_recovery_settings before"
10056 << " setting this option.";
10057 }
10058 }
10059 } else { // if (changed != nullptr) (osd boot-up)
10060 /**
10061 * This section is executed only during osd boot-up.
10062 * Override the default recovery max active (hdd & ssd) and max backfills
10063 * config options to either the mClock defaults or retain their respective
10064 * overridden values before the osd was restarted.
10065 */
10066 for (auto opt : recovery_qos_defaults) {
10067 /**
10068 * Note: set_val_default doesn't overwrite an option if it was earlier
10069 * set at a config level greater than CONF_DEFAULT. It doesn't return
10070 * a status. With get_val(), the config subsystem is guaranteed to
10071 * either return the overridden value (if any) or the default value.
10072 */
10073 cct->_conf.set_val_default(opt.first, std::to_string(opt.second));
10074 auto opt_val = cct->_conf.get_val<uint64_t>(opt.first);
10075 dout(1) << __func__ << " "
10076 << opt.first << " set to " << opt_val
10077 << dendl;
10078 if (opt.first == "osd_max_backfills") {
10079 service.local_reserver.set_max(opt_val);
10080 service.remote_reserver.set_max(opt_val);
10081 }
10082 }
10083 }
10084 return true;
10085 }
10086 return false;
10087 }
10088
10089 void OSD::maybe_override_sleep_options_for_qos()
10090 {
10091 // Override options only if the scheduler enabled is mclock and the
10092 // underlying objectstore is supported by mclock
10093 if (cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler" &&
10094 !unsupported_objstore_for_qos()) {
10095
10096 // Override the various sleep settings
10097 // Disable recovery sleep
10098 cct->_conf.set_val("osd_recovery_sleep", std::to_string(0));
10099 cct->_conf.set_val("osd_recovery_sleep_hdd", std::to_string(0));
10100 cct->_conf.set_val("osd_recovery_sleep_ssd", std::to_string(0));
10101 cct->_conf.set_val("osd_recovery_sleep_hybrid", std::to_string(0));
10102
10103 // Disable delete sleep
10104 cct->_conf.set_val("osd_delete_sleep", std::to_string(0));
10105 cct->_conf.set_val("osd_delete_sleep_hdd", std::to_string(0));
10106 cct->_conf.set_val("osd_delete_sleep_ssd", std::to_string(0));
10107 cct->_conf.set_val("osd_delete_sleep_hybrid", std::to_string(0));
10108
10109 // Disable snap trim sleep
10110 cct->_conf.set_val("osd_snap_trim_sleep", std::to_string(0));
10111 cct->_conf.set_val("osd_snap_trim_sleep_hdd", std::to_string(0));
10112 cct->_conf.set_val("osd_snap_trim_sleep_ssd", std::to_string(0));
10113 cct->_conf.set_val("osd_snap_trim_sleep_hybrid", std::to_string(0));
10114
10115 // Disable scrub sleep
10116 cct->_conf.set_val("osd_scrub_sleep", std::to_string(0));
10117 }
10118 }
10119
10120 void OSD::maybe_override_cost_for_qos()
10121 {
10122 // If the scheduler enabled is mclock, override the default PG deletion cost
10123 // so that mclock can meet the QoS goals.
10124 if (cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler" &&
10125 !unsupported_objstore_for_qos()) {
10126 uint64_t pg_delete_cost = 15728640;
10127 cct->_conf.set_val("osd_pg_delete_cost", std::to_string(pg_delete_cost));
10128 }
10129 }
10130
10131 /**
10132 * A context for receiving status from a background mon command to set
10133 * a config option and optionally apply the changes on each op shard.
10134 */
10135 class MonCmdSetConfigOnFinish : public Context {
10136 OSD *osd;
10137 CephContext *cct;
10138 std::string key;
10139 std::string val;
10140 bool update_shard;
10141 public:
10142 explicit MonCmdSetConfigOnFinish(
10143 OSD *o,
10144 CephContext *cct,
10145 const std::string &k,
10146 const std::string &v,
10147 const bool s)
10148 : osd(o), cct(cct), key(k), val(v), update_shard(s) {}
10149 void finish(int r) override {
10150 if (r != 0) {
10151 // Fallback to setting the config within the in-memory "values" map.
10152 cct->_conf.set_val_default(key, val);
10153 }
10154
10155 // If requested, apply this option on the
10156 // active scheduler of each op shard.
10157 if (update_shard) {
10158 for (auto& shard : osd->shards) {
10159 shard->update_scheduler_config();
10160 }
10161 }
10162 }
10163 };
10164
10165 void OSD::mon_cmd_set_config(const std::string &key, const std::string &val)
10166 {
10167 std::string cmd =
10168 "{"
10169 "\"prefix\": \"config set\", "
10170 "\"who\": \"osd." + std::to_string(whoami) + "\", "
10171 "\"name\": \"" + key + "\", "
10172 "\"value\": \"" + val + "\""
10173 "}";
10174 vector<std::string> vcmd{cmd};
10175
10176 // List of config options to be distributed across each op shard.
10177 // Currently limited to a couple of mClock options.
10178 static const std::vector<std::string> shard_option =
10179 { "osd_mclock_max_capacity_iops_hdd", "osd_mclock_max_capacity_iops_ssd" };
10180 const bool update_shard = std::find(shard_option.begin(),
10181 shard_option.end(),
10182 key) != shard_option.end();
10183
10184 auto on_finish = new MonCmdSetConfigOnFinish(this, cct, key,
10185 val, update_shard);
10186 dout(10) << __func__ << " Set " << key << " = " << val << dendl;
10187 monc->start_mon_command(vcmd, {}, nullptr, nullptr, on_finish);
10188 }
10189
10190 bool OSD::unsupported_objstore_for_qos()
10191 {
10192 static const std::vector<std::string> unsupported_objstores = { "filestore" };
10193 return std::find(unsupported_objstores.begin(),
10194 unsupported_objstores.end(),
10195 store->get_type()) != unsupported_objstores.end();
10196 }
10197
// Re-parse the cluster-log client options from the current config and
// report the resulting log_to_monitors setting (derr so it is always
// visible regardless of debug level).
void OSD::update_log_config()
{
  auto parsed_options = clog->parse_client_options(cct);
  derr << "log_to_monitors " << parsed_options.log_to_monitors << dendl;
}
10203
10204 void OSD::check_config()
10205 {
10206 // some sanity checks
10207 if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
10208 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
10209 << " is not > osd_pg_epoch_persisted_max_stale ("
10210 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
10211 }
10212 if (cct->_conf->osd_object_clean_region_max_num_intervals < 0) {
10213 clog->warn() << "osd_object_clean_region_max_num_intervals ("
10214 << cct->_conf->osd_object_clean_region_max_num_intervals
10215 << ") is < 0";
10216 }
10217 }
10218
10219 // --------------------------------
10220
// Block until the objecter has fetched the latest OSDMap from the
// monitors.  Uses the blocking async completion adapter; any error code
// is captured in 'ec' and intentionally ignored (best-effort refresh).
void OSD::get_latest_osdmap()
{
  dout(10) << __func__ << " -- start" << dendl;

  boost::system::error_code ec;
  service.objecter->wait_for_latest_osdmap(ceph::async::use_blocked[ec]);

  dout(10) << __func__ << " -- finish" << dendl;
}
10230
10231 // --------------------------------
10232
10233 void OSD::set_perf_queries(const ConfigPayload &config_payload) {
10234 const OSDConfigPayload &osd_config_payload = boost::get<OSDConfigPayload>(config_payload);
10235 const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries = osd_config_payload.config;
10236 dout(10) << "setting " << queries.size() << " queries" << dendl;
10237
10238 std::list<OSDPerfMetricQuery> supported_queries;
10239 for (auto &it : queries) {
10240 auto &query = it.first;
10241 if (!query.key_descriptor.empty()) {
10242 supported_queries.push_back(query);
10243 }
10244 }
10245 if (supported_queries.size() < queries.size()) {
10246 dout(1) << queries.size() - supported_queries.size()
10247 << " unsupported queries" << dendl;
10248 }
10249 {
10250 std::lock_guard locker{m_perf_queries_lock};
10251 m_perf_queries = supported_queries;
10252 m_perf_limits = queries;
10253 }
10254 std::vector<PGRef> pgs;
10255 _get_pgs(&pgs);
10256 for (auto& pg : pgs) {
10257 std::scoped_lock l{*pg};
10258 pg->set_dynamic_perf_stats_queries(supported_queries);
10259 }
10260 }
10261
10262 MetricPayload OSD::get_perf_reports() {
10263 OSDMetricPayload payload;
10264 std::map<OSDPerfMetricQuery, OSDPerfMetricReport> &reports = payload.report;
10265
10266 std::vector<PGRef> pgs;
10267 _get_pgs(&pgs);
10268 DynamicPerfStats dps;
10269 for (auto& pg : pgs) {
10270 // m_perf_queries can be modified only in set_perf_queries by mgr client
10271 // request, and it is protected by by mgr client's lock, which is held
10272 // when set_perf_queries/get_perf_reports are called, so we may not hold
10273 // m_perf_queries_lock here.
10274 DynamicPerfStats pg_dps(m_perf_queries);
10275 pg->lock();
10276 pg->get_dynamic_perf_stats(&pg_dps);
10277 pg->unlock();
10278 dps.merge(pg_dps);
10279 }
10280 dps.add_to_reports(m_perf_limits, &reports);
10281 dout(20) << "reports for " << reports.size() << " queries" << dendl;
10282
10283 return payload;
10284 }
10285
10286 // =============================================================
10287
10288 #undef dout_context
10289 #define dout_context cct
10290 #undef dout_prefix
10291 #define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
10292
// Bind 'pg' into 'slot' (callers in this file hold shard_lock): wire the
// slot<->pg back-pointers, bump the OSD-wide pg count, and index the slot
// by the PG's current map epoch so per-shard min-epoch queries see it.
void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
{
  dout(10) << pg->pg_id << " " << pg << dendl;
  slot->pg = pg;
  pg->osd_shard = this;
  pg->pg_slot = slot;
  osd->inc_num_pgs();

  slot->epoch = pg->get_osdmap_epoch();
  pg_slots_by_epoch.insert(*slot);
}
10304
// Undo _attach_pg(): clear the slot<->pg links, drop the OSD-wide pg
// count, and remove the slot from the by-epoch index.  Waiters in
// wait_min_pg_epoch() are woken because the shard's minimum pg epoch may
// have changed (or the set may now be empty).
void OSDShard::_detach_pg(OSDShardPGSlot *slot)
{
  dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
  slot->pg->osd_shard = nullptr;
  slot->pg->pg_slot = nullptr;
  slot->pg = nullptr;
  osd->dec_num_pgs();

  pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
  slot->epoch = 0;
  if (waiting_for_min_pg_epoch) {
    min_pg_epoch_cond.notify_all();
  }
}
10319
// Move 'slot' to epoch 'e' in the by-epoch index: the intrusive set keys
// on slot->epoch, so the slot must be erased before the epoch is changed
// and re-inserted afterwards.  Wakes wait_min_pg_epoch() waiters since
// the shard's minimum epoch may have advanced.
void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
{
  std::lock_guard l(shard_lock);
  dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
	   << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
  dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
  slot->epoch = e;
  pg_slots_by_epoch.insert(*slot);
  dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
	   << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  if (waiting_for_min_pg_epoch) {
    min_pg_epoch_cond.notify_all();
  }
}
10335
10336 epoch_t OSDShard::get_min_pg_epoch()
10337 {
10338 std::lock_guard l(shard_lock);
10339 auto p = pg_slots_by_epoch.begin();
10340 if (p == pg_slots_by_epoch.end()) {
10341 return 0;
10342 }
10343 return p->epoch;
10344 }
10345
// Block until every PG on this shard has advanced to at least epoch
// 'need' (or the shard has no PGs).  waiting_for_min_pg_epoch counts
// blocked callers so that _detach_pg()/update_pg_epoch() know to signal
// min_pg_epoch_cond when the minimum may have changed.
void OSDShard::wait_min_pg_epoch(epoch_t need)
{
  std::unique_lock l{shard_lock};
  ++waiting_for_min_pg_epoch;
  min_pg_epoch_cond.wait(l, [need, this] {
    if (pg_slots_by_epoch.empty()) {
      return true;
    } else if (pg_slots_by_epoch.begin()->epoch >= need) {
      // the set is epoch-ordered; the first entry is the minimum
      return true;
    } else {
      dout(10) << need << " waiting on "
	       << pg_slots_by_epoch.begin()->epoch << dendl;
      return false;
    }
  });
  --waiting_for_min_pg_epoch;
}
10363
10364 epoch_t OSDShard::get_max_waiting_epoch()
10365 {
10366 std::lock_guard l(shard_lock);
10367 epoch_t r = 0;
10368 for (auto& i : pg_slots) {
10369 if (!i.second->waiting_peering.empty()) {
10370 r = std::max(r, i.second->waiting_peering.rbegin()->first);
10371 }
10372 }
10373 return r;
10374 }
10375
// Install 'new_osdmap' as this shard's map and sweep every pg slot:
//  - slots waiting on a split or on a future merge epoch are left alone;
//  - queued peering events whose required epoch is now covered are
//    requeued into the scheduler;
//  - ordinary waiting items for pgs that no longer map to this OSD are
//    dropped, returning their reserved recovery pushes via
//    *pushes_to_free;
//  - slots that end up completely idle are pruned.
// Worker threads are notified at the end if anything was requeued.
void OSDShard::consume_map(
  const OSDMapRef& new_osdmap,
  unsigned *pushes_to_free)
{
  std::lock_guard l(shard_lock);
  OSDMapRef old_osdmap;
  {
    // shard_osdmap is read under osdmap_lock elsewhere, so swap it in
    // under that lock too.
    std::lock_guard l(osdmap_lock);
    old_osdmap = std::move(shard_osdmap);
    shard_osdmap = new_osdmap;
  }
  dout(10) << new_osdmap->get_epoch()
           << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
           << dendl;
  int queued = 0;

  // check slots
  auto p = pg_slots.begin();
  while (p != pg_slots.end()) {
    OSDShardPGSlot *slot = p->second.get();
    const spg_t& pgid = p->first;
    dout(20) << __func__ << " " << pgid << dendl;
    if (!slot->waiting_for_split.empty()) {
      // keep this slot untouched until the split child registers
      dout(20) << __func__ << " " << pgid
	       << " waiting for split " << slot->waiting_for_split << dendl;
      ++p;
      continue;
    }
    if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
      // merge target epoch not reached yet; keep waiting
      dout(20) << __func__ << " " << pgid
	       << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
	       << dendl;
      ++p;
      continue;
    }
    if (!slot->waiting_peering.empty()) {
      epoch_t first = slot->waiting_peering.begin()->first;
      if (first <= new_osdmap->get_epoch()) {
	// at least one peering event can now run; requeue the slot's work
	dout(20) << __func__ << " " << pgid
		 << " pending_peering first epoch " << first
		 << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
	queued += _wake_pg_slot(pgid, slot);
      }
      ++p;
      continue;
    }
    if (!slot->waiting.empty()) {
      if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
	dout(20) << __func__ << " " << pgid << " maps to us, keeping"
		 << dendl;
	++p;
	continue;
      }
      // pg no longer maps here: drop any waiting items covered by the new
      // map and release the recovery pushes they had reserved.
      while (!slot->waiting.empty() &&
	     slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
	auto& qi = slot->waiting.front();
	dout(20) << __func__ << " " << pgid
		 << " waiting item " << qi
		 << " epoch " << qi.get_map_epoch()
		 << " <= " << new_osdmap->get_epoch()
		 << ", "
		 << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
		     "misdirected")
		 << ", dropping" << dendl;
        *pushes_to_free += qi.get_reserved_pushes();
	slot->waiting.pop_front();
      }
    }
    if (slot->waiting.empty() &&
	slot->num_running == 0 &&
	slot->waiting_for_split.empty() &&
	!slot->pg) {
      // nothing pending and no pg attached: the slot can go
      dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
      p = pg_slots.erase(p);
      continue;
    }

    ++p;
  }
  if (queued) {
    std::lock_guard l{sdata_wait_lock};
    if (queued == 1)
      sdata_cond.notify_one();
    else
      sdata_cond.notify_all();
  }
}
10463
// Requeue everything parked on 'slot' (to_process, waiting, and all
// waiting_peering buckets) back into the scheduler, and bump the slot's
// requeue_seq so in-flight dequeues notice the slot changed under them.
// Each list is walked in reverse because enqueue_front() is used: the
// last item pushed ends up first, preserving the original order.
// Returns the number of items requeued.
int OSDShard::_wake_pg_slot(
  spg_t pgid,
  OSDShardPGSlot *slot)
{
  int count = 0;
  dout(20) << __func__ << " " << pgid
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering << dendl;
  for (auto i = slot->to_process.rbegin();
       i != slot->to_process.rend();
       ++i) {
    scheduler->enqueue_front(std::move(*i));
    count++;
  }
  slot->to_process.clear();
  for (auto i = slot->waiting.rbegin();
       i != slot->waiting.rend();
       ++i) {
    scheduler->enqueue_front(std::move(*i));
    count++;
  }
  slot->waiting.clear();
  for (auto i = slot->waiting_peering.rbegin();
       i != slot->waiting_peering.rend();
       ++i) {
    // this is overkill; we requeue everything, even if some of these
    // items are waiting for maps we don't have yet.  FIXME, maybe,
    // someday, if we decide this inefficiency matters
    for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
      scheduler->enqueue_front(std::move(*j));
      count++;
    }
  }
  slot->waiting_peering.clear();
  ++slot->requeue_seq;
  return count;
}
10502
// For every slot on this shard, compute the splits and merges implied by
// advancing from the shard's current osdmap to 'as_of_osdmap'.  Slots
// with an attached pg contribute to both sets; slots that are merely
// primed for a split contribute to splits only (a merge needs a real pg).
// Results are accumulated into *split_pgs / *merge_pgs.
void OSDShard::identify_splits_and_merges(
  const OSDMapRef& as_of_osdmap,
  set<pair<spg_t,epoch_t>> *split_pgs,
  set<pair<spg_t,epoch_t>> *merge_pgs)
{
  std::lock_guard l(shard_lock);
  dout(20) << __func__ << " " << pg_slots.size() << " slots" << dendl;
  if (shard_osdmap) {
    for (auto& i : pg_slots) {
      dout(20) << __func__ << " slot pgid:" << i.first << "slot:" << i.second.get() << dendl;
      const spg_t& pgid = i.first;
      auto *slot = i.second.get();
      if (slot->pg) {
	osd->service.identify_splits_and_merges(
	  shard_osdmap, as_of_osdmap, pgid,
	  split_pgs, merge_pgs);
      } else if (!slot->waiting_for_split.empty()) {
	// primed-for-split slot: only further splits are relevant
	osd->service.identify_splits_and_merges(
	  shard_osdmap, as_of_osdmap, pgid,
	  split_pgs, nullptr);
      } else {
	dout(20) << __func__ << " slot " << pgid
		 << " has no pg and waiting_for_split " << dendl;
      }
    }
  }
  dout(20) << __func__ << " " << split_pgs->size() << " splits, "
	   << merge_pgs->size() << " merges" << dendl;
}
10532
// Prime slots for upcoming split children in *pgids.  If this shard's
// osdmap is already newer than 'as_of_osdmap', also compute and prime any
// grandchildren implied by the additional epochs, so no child is missed.
// _prime_splits() consumes the entries that hash to this shard and leaves
// the rest in the set for other shards.
void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
			    set<pair<spg_t,epoch_t>> *pgids)
{
  std::lock_guard l(shard_lock);
  _prime_splits(pgids);
  if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
    set<pair<spg_t,epoch_t>> newer_children;
    for (auto i : *pgids) {
      osd->service.identify_splits_and_merges(
	as_of_osdmap, shard_osdmap, i.first,
	&newer_children, nullptr);
    }
    newer_children.insert(pgids->begin(), pgids->end());
    dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
	     << shard_osdmap->get_epoch() << ", new children " << newer_children
	     << dendl;
    _prime_splits(&newer_children);
    // note: we don't care what is left over here for other shards.
    // if this shard is ahead of us and one isn't, e.g., one thread is
    // calling into prime_splits via _process (due to a newly created
    // pg) and this shard has a newer map due to a racing consume_map,
    // then any grandchildren left here will be identified (or were
    // identified) when the slower shard's osdmap is advanced.
    // _prime_splits() will tolerate the case where the pgid is
    // already primed.
  }
}
10560
// Consume from *pgids the (pgid, epoch) entries whose pgid hashes to this
// shard, creating (or reusing) a slot for each and recording the split
// epoch in waiting_for_split.  Entries for other shards are left in the
// set for their owners to process.  Caller holds shard_lock.
void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
{
  dout(10) << *pgids << dendl;
  auto p = pgids->begin();
  while (p != pgids->end()) {
    unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
    if (shard_index == shard_id) {
      auto r = pg_slots.emplace(p->first, nullptr);
      if (r.second) {
	dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
	r.first->second = make_unique<OSDShardPGSlot>();
	r.first->second->waiting_for_split.insert(p->second);
      } else {
	// slot already exists (e.g. already primed); just add the epoch
	auto q = r.first;
	ceph_assert(q != pg_slots.end());
	dout(10) << "priming (existing) slot " << p->first << " e" << p->second
		 << dendl;
	q->second->waiting_for_split.insert(p->second);
      }
      p = pgids->erase(p);
    } else {
      ++p;
    }
  }
}
10586
// Consume from *merge_pgs the (pgid, epoch) entries whose pgid hashes to
// this shard and mark their slots as merge participants (via
// waiting_for_merge_epoch).  A participant that has no pg and no pending
// earlier split gets an empty placeholder pg created so the merge has
// something to fold into; PG::merge_from() later fills in its history.
void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
			    set<pair<spg_t,epoch_t>> *merge_pgs)
{
  std::lock_guard l(shard_lock);
  dout(20) << __func__ << " checking shard " << shard_id
	   << " for remaining merge pgs " << merge_pgs << dendl;
  auto p = merge_pgs->begin();
  while (p != merge_pgs->end()) {
    spg_t pgid = p->first;
    epoch_t epoch = p->second;
    unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
    if (shard_index != shard_id) {
      // belongs to another shard; leave it for its owner
      ++p;
      continue;
    }
    OSDShardPGSlot *slot;
    auto r = pg_slots.emplace(pgid, nullptr);
    if (r.second) {
      r.first->second = make_unique<OSDShardPGSlot>();
    }
    slot = r.first->second.get();
    if (slot->pg) {
      // already have pg
      dout(20) << __func__ << " have merge participant pg " << pgid
	       << " " << slot->pg << dendl;
    } else if (!slot->waiting_for_split.empty() &&
	       *slot->waiting_for_split.begin() < epoch) {
      // a split must complete before this merge; don't create a pg now
      dout(20) << __func__ << " pending split on merge participant pg " << pgid
	       << " " << slot->waiting_for_split << dendl;
    } else {
      dout(20) << __func__ << " creating empty merge participant " << pgid
	       << " for merge in " << epoch << dendl;
      // leave history zeroed; PG::merge_from() will fill it in.
      pg_history_t history;
      PGCreateInfo cinfo(pgid, epoch - 1,
			 history, PastIntervals(), false);
      PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
      _attach_pg(r.first->second.get(), pg.get());
      _wake_pg_slot(pgid, slot);
      pg->unlock();
    }
    // mark slot for merge
    dout(20) << __func__ << " marking merge participant " << pgid << dendl;
    slot->waiting_for_merge_epoch = epoch;
    p = merge_pgs->erase(p);
  }
}
10634
10635 void OSDShard::register_and_wake_split_child(PG *pg)
10636 {
10637 dout(15) << __func__ << ": " << pg << " #:" << pg_slots.size() << dendl;
10638 epoch_t epoch;
10639 {
10640 std::lock_guard l(shard_lock);
10641 dout(10) << __func__ << ": " << pg->pg_id << " " << pg << dendl;
10642 auto p = pg_slots.find(pg->pg_id);
10643 ceph_assert(p != pg_slots.end());
10644 auto *slot = p->second.get();
10645 dout(20) << __func__ << ": " << pg->pg_id << " waiting_for_split "
10646 << slot->waiting_for_split << dendl;
10647 ceph_assert(!slot->pg);
10648 ceph_assert(!slot->waiting_for_split.empty());
10649 _attach_pg(slot, pg);
10650
10651 epoch = pg->get_osdmap_epoch();
10652 ceph_assert(slot->waiting_for_split.count(epoch));
10653 slot->waiting_for_split.erase(epoch);
10654 if (slot->waiting_for_split.empty()) {
10655 _wake_pg_slot(pg->pg_id, slot);
10656 } else {
10657 dout(10) << __func__ << " still waiting for split on "
10658 << slot->waiting_for_split << dendl;
10659 }
10660 }
10661
10662 // kick child to ensure it pulls up to the latest osdmap
10663 osd->enqueue_peering_evt(
10664 pg->pg_id,
10665 PGPeeringEventRef(
10666 std::make_shared<PGPeeringEvent>(
10667 epoch,
10668 epoch,
10669 NullEvt())));
10670
10671 std::lock_guard l{sdata_wait_lock};
10672 sdata_cond.notify_one();
10673 }
10674
10675 void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
10676 {
10677 std::lock_guard l(shard_lock);
10678 vector<spg_t> to_delete;
10679 for (auto& i : pg_slots) {
10680 if (i.first != parent &&
10681 i.first.get_ancestor(old_pg_num) == parent) {
10682 dout(10) << __func__ << " parent " << parent << " clearing " << i.first
10683 << dendl;
10684 _wake_pg_slot(i.first, i.second.get());
10685 to_delete.push_back(i.first);
10686 }
10687 }
10688 for (auto pgid : to_delete) {
10689 pg_slots.erase(pgid);
10690 }
10691 }
10692
// Ask this shard's op scheduler to re-read its configuration (used after
// config options it depends on change, e.g. the mClock capacity values).
void OSDShard::update_scheduler_config()
{
  scheduler->update_configuration();
}
10697
10698 std::string OSDShard::get_scheduler_type()
10699 {
10700 std::ostringstream scheduler_type;
10701 scheduler_type << *scheduler;
10702 return scheduler_type.str();
10703 }
10704
// Construct one op-worker shard: per-shard locks/condvars named after the
// shard id, an op scheduler instance chosen from config (mclock vs wpq,
// keyed on media type and objectstore backend), and a context queue tied
// to this shard's wait lock/cond for oncommit callbacks.
OSDShard::OSDShard(
  int id,
  CephContext *cct,
  OSD *osd)
  : shard_id(id),
    cct(cct),
    osd(osd),
    shard_name(string("OSDShard.") + stringify(id)),
    sdata_wait_lock_name(shard_name + "::sdata_wait_lock"),
    sdata_wait_lock{make_mutex(sdata_wait_lock_name)},
    osdmap_lock{make_mutex(shard_name + "::osdmap_lock")},
    shard_lock_name(shard_name + "::shard_lock"),
    shard_lock{make_mutex(shard_lock_name)},
    scheduler(ceph::osd::scheduler::make_scheduler(
      cct, osd->whoami, osd->num_shards, id, osd->store->is_rotational(),
      osd->store->get_type(), osd->monc)),
    context_queue(sdata_wait_lock, sdata_cond)
{
  dout(0) << "using op scheduler " << *scheduler << dendl;
}
10725
10726
10727 // =============================================================
10728
10729 #undef dout_context
10730 #define dout_context osd->cct
10731 #undef dout_prefix
10732 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10733
10734 void OSD::ShardedOpWQ::_add_slot_waiter(
10735 spg_t pgid,
10736 OSDShardPGSlot *slot,
10737 OpSchedulerItem&& qi)
10738 {
10739 if (qi.is_peering()) {
10740 dout(20) << __func__ << " " << pgid
10741 << " peering, item epoch is "
10742 << qi.get_map_epoch()
10743 << ", will wait on " << qi << dendl;
10744 slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
10745 } else {
10746 dout(20) << __func__ << " " << pgid
10747 << " item epoch is "
10748 << qi.get_map_epoch()
10749 << ", will wait on " << qi << dendl;
10750 slot->waiting.push_back(std::move(qi));
10751 }
10752 }
10753
10754 #undef dout_prefix
10755 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10756
10757 void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
10758 {
10759 uint32_t shard_index = thread_index % osd->num_shards;
10760 auto& sdata = osd->shards[shard_index];
10761 ceph_assert(sdata);
10762
10763 // If all threads of shards do oncommits, there is a out-of-order
10764 // problem. So we choose the thread which has the smallest
10765 // thread_index(thread_index < num_shards) of shard to do oncommit
10766 // callback.
10767 bool is_smallest_thread_index = thread_index < osd->num_shards;
10768
10769 // peek at spg_t
10770 sdata->shard_lock.lock();
10771 if (sdata->scheduler->empty() &&
10772 (!is_smallest_thread_index || sdata->context_queue.empty())) {
10773 std::unique_lock wait_lock{sdata->sdata_wait_lock};
10774 if (is_smallest_thread_index && !sdata->context_queue.empty()) {
10775 // we raced with a context_queue addition, don't wait
10776 wait_lock.unlock();
10777 } else if (!sdata->stop_waiting) {
10778 dout(20) << __func__ << " empty q, waiting" << dendl;
10779 osd->cct->get_heartbeat_map()->clear_timeout(hb);
10780 sdata->shard_lock.unlock();
10781 sdata->sdata_cond.wait(wait_lock);
10782 wait_lock.unlock();
10783 sdata->shard_lock.lock();
10784 if (sdata->scheduler->empty() &&
10785 !(is_smallest_thread_index && !sdata->context_queue.empty())) {
10786 sdata->shard_lock.unlock();
10787 return;
10788 }
10789 // found a work item; reapply default wq timeouts
10790 osd->cct->get_heartbeat_map()->reset_timeout(hb,
10791 timeout_interval, suicide_interval);
10792 } else {
10793 dout(20) << __func__ << " need return immediately" << dendl;
10794 wait_lock.unlock();
10795 sdata->shard_lock.unlock();
10796 return;
10797 }
10798 }
10799
10800 list<Context *> oncommits;
10801 if (is_smallest_thread_index) {
10802 sdata->context_queue.move_to(oncommits);
10803 }
10804
10805 WorkItem work_item;
10806 while (!std::get_if<OpSchedulerItem>(&work_item)) {
10807 if (sdata->scheduler->empty()) {
10808 if (osd->is_stopping()) {
10809 sdata->shard_lock.unlock();
10810 for (auto c : oncommits) {
10811 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10812 delete c;
10813 }
10814 return; // OSD shutdown, discard.
10815 }
10816 sdata->shard_lock.unlock();
10817 handle_oncommits(oncommits);
10818 return;
10819 }
10820
10821 work_item = sdata->scheduler->dequeue();
10822 if (osd->is_stopping()) {
10823 sdata->shard_lock.unlock();
10824 for (auto c : oncommits) {
10825 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10826 delete c;
10827 }
10828 return; // OSD shutdown, discard.
10829 }
10830
10831 // If the work item is scheduled in the future, wait until
10832 // the time returned in the dequeue response before retrying.
10833 if (auto when_ready = std::get_if<double>(&work_item)) {
10834 if (is_smallest_thread_index) {
10835 sdata->shard_lock.unlock();
10836 handle_oncommits(oncommits);
10837 sdata->shard_lock.lock();
10838 }
10839 std::unique_lock wait_lock{sdata->sdata_wait_lock};
10840 auto future_time = ceph::real_clock::from_double(*when_ready);
10841 dout(10) << __func__ << " dequeue future request at " << future_time << dendl;
10842 // Disable heartbeat timeout until we find a non-future work item to process.
10843 osd->cct->get_heartbeat_map()->clear_timeout(hb);
10844 sdata->shard_lock.unlock();
10845 ++sdata->waiting_threads;
10846 sdata->sdata_cond.wait_until(wait_lock, future_time);
10847 --sdata->waiting_threads;
10848 wait_lock.unlock();
10849 sdata->shard_lock.lock();
10850 // Reapply default wq timeouts
10851 osd->cct->get_heartbeat_map()->reset_timeout(hb,
10852 timeout_interval, suicide_interval);
10853 // Populate the oncommits list if there were any additions
10854 // to the context_queue while we were waiting
10855 if (is_smallest_thread_index) {
10856 sdata->context_queue.move_to(oncommits);
10857 }
10858 }
10859 } // while
10860
10861 // Access the stored item
10862 auto item = std::move(std::get<OpSchedulerItem>(work_item));
10863 if (osd->is_stopping()) {
10864 sdata->shard_lock.unlock();
10865 for (auto c : oncommits) {
10866 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10867 delete c;
10868 }
10869 return; // OSD shutdown, discard.
10870 }
10871
10872 const auto token = item.get_ordering_token();
10873 auto r = sdata->pg_slots.emplace(token, nullptr);
10874 if (r.second) {
10875 r.first->second = make_unique<OSDShardPGSlot>();
10876 }
10877 OSDShardPGSlot *slot = r.first->second.get();
10878 dout(20) << __func__ << " " << token
10879 << (r.second ? " (new)" : "")
10880 << " to_process " << slot->to_process
10881 << " waiting " << slot->waiting
10882 << " waiting_peering " << slot->waiting_peering
10883 << dendl;
10884 slot->to_process.push_back(std::move(item));
10885 dout(20) << __func__ << " " << slot->to_process.back()
10886 << " queued" << dendl;
10887
10888 retry_pg:
10889 PGRef pg = slot->pg;
10890
10891 // lock pg (if we have it)
10892 if (pg) {
10893 // note the requeue seq now...
10894 uint64_t requeue_seq = slot->requeue_seq;
10895 ++slot->num_running;
10896
10897 sdata->shard_lock.unlock();
10898 osd->service.maybe_inject_dispatch_delay();
10899 pg->lock();
10900 osd->service.maybe_inject_dispatch_delay();
10901 sdata->shard_lock.lock();
10902
10903 auto q = sdata->pg_slots.find(token);
10904 if (q == sdata->pg_slots.end()) {
10905 // this can happen if we race with pg removal.
10906 dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
10907 pg->unlock();
10908 sdata->shard_lock.unlock();
10909 handle_oncommits(oncommits);
10910 return;
10911 }
10912 slot = q->second.get();
10913 --slot->num_running;
10914
10915 if (slot->to_process.empty()) {
10916 // raced with _wake_pg_slot or consume_map
10917 dout(20) << __func__ << " " << token
10918 << " nothing queued" << dendl;
10919 pg->unlock();
10920 sdata->shard_lock.unlock();
10921 handle_oncommits(oncommits);
10922 return;
10923 }
10924 if (requeue_seq != slot->requeue_seq) {
10925 dout(20) << __func__ << " " << token
10926 << " requeue_seq " << slot->requeue_seq << " > our "
10927 << requeue_seq << ", we raced with _wake_pg_slot"
10928 << dendl;
10929 pg->unlock();
10930 sdata->shard_lock.unlock();
10931 handle_oncommits(oncommits);
10932 return;
10933 }
10934 if (slot->pg != pg) {
10935 // this can happen if we race with pg removal.
10936 dout(20) << __func__ << " slot " << token << " no longer attached to "
10937 << pg << dendl;
10938 pg->unlock();
10939 goto retry_pg;
10940 }
10941 }
10942
10943 dout(20) << __func__ << " " << token
10944 << " to_process " << slot->to_process
10945 << " waiting " << slot->waiting
10946 << " waiting_peering " << slot->waiting_peering << dendl;
10947
10948 ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
10949 suicide_interval);
10950
10951 // take next item
10952 auto qi = std::move(slot->to_process.front());
10953 slot->to_process.pop_front();
10954 dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
10955 set<pair<spg_t,epoch_t>> new_children;
10956 OSDMapRef osdmap;
10957
10958 while (!pg) {
10959 // should this pg shard exist on this osd in this (or a later) epoch?
10960 osdmap = sdata->shard_osdmap;
10961 const PGCreateInfo *create_info = qi.creates_pg();
10962 if (!slot->waiting_for_split.empty()) {
10963 dout(20) << __func__ << " " << token
10964 << " splitting " << slot->waiting_for_split << dendl;
10965 _add_slot_waiter(token, slot, std::move(qi));
10966 } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
10967 dout(20) << __func__ << " " << token
10968 << " map " << qi.get_map_epoch() << " > "
10969 << osdmap->get_epoch() << dendl;
10970 _add_slot_waiter(token, slot, std::move(qi));
10971 } else if (qi.is_peering()) {
10972 if (!qi.peering_requires_pg()) {
10973 // for pg-less events, we run them under the ordering lock, since
10974 // we don't have the pg lock to keep them ordered.
10975 qi.run(osd, sdata, pg, tp_handle);
10976 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
10977 if (create_info) {
10978 if (create_info->by_mon &&
10979 osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
10980 dout(20) << __func__ << " " << token
10981 << " no pg, no longer primary, ignoring mon create on "
10982 << qi << dendl;
10983 } else {
10984 dout(20) << __func__ << " " << token
10985 << " no pg, should create on " << qi << dendl;
10986 pg = osd->handle_pg_create_info(osdmap, create_info);
10987 if (pg) {
10988 // we created the pg! drop out and continue "normally"!
10989 sdata->_attach_pg(slot, pg.get());
10990 sdata->_wake_pg_slot(token, slot);
10991
10992 // identify split children between create epoch and shard epoch.
10993 osd->service.identify_splits_and_merges(
10994 pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
10995 sdata->_prime_splits(&new_children);
10996 // distribute remaining split children to other shards below!
10997 break;
10998 }
10999 dout(20) << __func__ << " ignored create on " << qi << dendl;
11000 }
11001 } else {
11002 dout(20) << __func__ << " " << token
11003 << " no pg, peering, !create, discarding " << qi << dendl;
11004 }
11005 } else {
11006 dout(20) << __func__ << " " << token
11007 << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
11008 << ", discarding " << qi
11009 << dendl;
11010 }
11011 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
11012 dout(20) << __func__ << " " << token
11013 << " no pg, should exist e" << osdmap->get_epoch()
11014 << ", will wait on " << qi << dendl;
11015 _add_slot_waiter(token, slot, std::move(qi));
11016 } else {
11017 dout(20) << __func__ << " " << token
11018 << " no pg, shouldn't exist e" << osdmap->get_epoch()
11019 << ", dropping " << qi << dendl;
11020 // share map with client?
11021 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
11022 osd->service.maybe_share_map((*_op)->get_req()->get_connection().get(),
11023 sdata->shard_osdmap,
11024 (*_op)->sent_epoch);
11025 }
11026 unsigned pushes_to_free = qi.get_reserved_pushes();
11027 if (pushes_to_free > 0) {
11028 sdata->shard_lock.unlock();
11029 osd->service.release_reserved_pushes(pushes_to_free);
11030 handle_oncommits(oncommits);
11031 return;
11032 }
11033 }
11034 sdata->shard_lock.unlock();
11035 handle_oncommits(oncommits);
11036 return;
11037 }
11038 if (qi.is_peering()) {
11039 OSDMapRef osdmap = sdata->shard_osdmap;
11040 if (qi.get_map_epoch() > osdmap->get_epoch()) {
11041 _add_slot_waiter(token, slot, std::move(qi));
11042 sdata->shard_lock.unlock();
11043 pg->unlock();
11044 handle_oncommits(oncommits);
11045 return;
11046 }
11047 }
11048 sdata->shard_lock.unlock();
11049
11050 if (!new_children.empty()) {
11051 for (auto shard : osd->shards) {
11052 shard->prime_splits(osdmap, &new_children);
11053 }
11054 ceph_assert(new_children.empty());
11055 }
11056
11057 // osd_opwq_process marks the point at which an operation has been dequeued
11058 // and will begin to be handled by a worker thread.
11059 {
11060 #ifdef WITH_LTTNG
11061 osd_reqid_t reqid;
11062 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
11063 reqid = (*_op)->get_reqid();
11064 }
11065 #endif
11066 tracepoint(osd, opwq_process_start, reqid.name._type,
11067 reqid.name._num, reqid.tid, reqid.inc);
11068 }
11069
11070 lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
11071 Formatter *f = Formatter::create("json");
11072 f->open_object_section("q");
11073 dump(f);
11074 f->close_section();
11075 f->flush(*_dout);
11076 delete f;
11077 *_dout << dendl;
11078
11079 qi.run(osd, sdata, pg, tp_handle);
11080
11081 {
11082 #ifdef WITH_LTTNG
11083 osd_reqid_t reqid;
11084 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
11085 reqid = (*_op)->get_reqid();
11086 }
11087 #endif
11088 tracepoint(osd, opwq_process_finish, reqid.name._type,
11089 reqid.name._num, reqid.tid, reqid.inc);
11090 }
11091
11092 handle_oncommits(oncommits);
11093 }
11094
11095 void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
11096 if (unlikely(m_fast_shutdown) ) {
11097 // stop enqueing when we are in the middle of a fast shutdown
11098 return;
11099 }
11100
11101 uint32_t shard_index =
11102 item.get_ordering_token().hash_to_shard(osd->shards.size());
11103
11104 OSDShard* sdata = osd->shards[shard_index];
11105 assert (NULL != sdata);
11106
11107 dout(20) << __func__ << " " << item << dendl;
11108
11109 bool empty = true;
11110 {
11111 std::lock_guard l{sdata->shard_lock};
11112 empty = sdata->scheduler->empty();
11113 sdata->scheduler->enqueue(std::move(item));
11114 }
11115
11116 {
11117 std::lock_guard l{sdata->sdata_wait_lock};
11118 if (empty) {
11119 sdata->sdata_cond.notify_all();
11120 } else if (sdata->waiting_threads) {
11121 sdata->sdata_cond.notify_one();
11122 }
11123 }
11124 }
11125
// Re-insert a requeued item at the *front* of its shard's queue while
// preserving per-PG ordering relative to items that _process may have
// already moved onto the slot's to_process list.  Items are silently
// dropped during a fast shutdown.
void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
{
  if (unlikely(m_fast_shutdown) ) {
    // stop enqueing when we are in the middle of a fast shutdown
    return;
  }

  auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);
  sdata->shard_lock.lock();
  auto p = sdata->pg_slots.find(item.get_ordering_token());
  if (p != sdata->pg_slots.end() &&
      !p->second->to_process.empty()) {
    // we may be racing with _process, which has dequeued a new item
    // from scheduler, put it on to_process, and is now busy taking the
    // pg lock. ensure this old requeued item is ordered before any
    // such newer item in to_process.
    // Swap trick: push the old item onto the front of to_process so it
    // runs first, then steal the newest item from the back and put THAT
    // into the scheduler instead.
    p->second->to_process.push_front(std::move(item));
    item = std::move(p->second->to_process.back());
    p->second->to_process.pop_back();
    dout(20) << __func__
	     << " " << p->second->to_process.front()
	     << " shuffled w/ " << item << dendl;
  } else {
    dout(20) << __func__ << " " << item << dendl;
  }
  sdata->scheduler->enqueue_front(std::move(item));
  sdata->shard_lock.unlock();
  // wake a worker; shard_lock is intentionally released before taking
  // sdata_wait_lock to keep the established lock ordering
  std::lock_guard l{sdata->sdata_wait_lock};
  sdata->sdata_cond.notify_one();
}
11158
11159 void OSD::ShardedOpWQ::stop_for_fast_shutdown()
11160 {
11161 uint32_t shard_index = 0;
11162 m_fast_shutdown = true;
11163
11164 for (; shard_index < osd->num_shards; shard_index++) {
11165 auto& sdata = osd->shards[shard_index];
11166 ceph_assert(sdata);
11167 sdata->shard_lock.lock();
11168 int work_count = 0;
11169 while(! sdata->scheduler->empty() ) {
11170 auto work_item = sdata->scheduler->dequeue();
11171 work_count++;
11172 }
11173 sdata->shard_lock.unlock();
11174 }
11175 }
11176
11177 namespace ceph::osd_cmds {
11178
11179 int heap(CephContext& cct,
11180 const cmdmap_t& cmdmap,
11181 std::ostream& outos,
11182 std::ostream& erros)
11183 {
11184 if (!ceph_using_tcmalloc()) {
11185 erros << "could not issue heap profiler command -- not using tcmalloc!";
11186 return -EOPNOTSUPP;
11187 }
11188
11189 string cmd;
11190 if (!cmd_getval(cmdmap, "heapcmd", cmd)) {
11191 erros << "unable to get value for command \"" << cmd << "\"";
11192 return -EINVAL;
11193 }
11194
11195 std::vector<std::string> cmd_vec;
11196 get_str_vec(cmd, cmd_vec);
11197
11198 string val;
11199 if (cmd_getval(cmdmap, "value", val)) {
11200 cmd_vec.push_back(val);
11201 }
11202
11203 ceph_heap_profiler_handle_command(cmd_vec, outos);
11204
11205 return 0;
11206 }
11207
11208 } // namespace ceph::osd_cmds