// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2017 OVH
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include "acconfig.h"

#include <cctype>
#include <fstream>
#include <iostream>
#include <iterator>

#include <unistd.h>
#include <sys/stat.h>
#include <signal.h>
#include <time.h>
#include <boost/scoped_ptr.hpp>
#include <boost/range/adaptor/reversed.hpp>

#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif

#ifdef HAVE_SYS_MOUNT_H
#include <sys/mount.h>
#endif

#include "osd/PG.h"
#include "osd/scrub_machine.h"
#include "osd/pg_scrubber.h"

#include "include/types.h"
#include "include/compat.h"
#include "include/random.h"

#include "OSD.h"
#include "OSDMap.h"
#include "Watch.h"
#include "osdc/Objecter.h"

#include "common/errno.h"
#include "common/ceph_argparse.h"
#include "common/ceph_releases.h"
#include "common/ceph_time.h"
#include "common/version.h"
#include "common/async/blocked_completion.h"
#include "common/pick_address.h"
#include "common/blkdev.h"
#include "common/numa.h"

#include "os/ObjectStore.h"
#ifdef HAVE_LIBFUSE
#include "os/FuseStore.h"
#endif

#include "PrimaryLogPG.h"

#include "msg/Messenger.h"
#include "msg/Message.h"

#include "mon/MonClient.h"

#include "messages/MLog.h"

#include "messages/MGenericMessage.h"
#include "messages/MOSDPing.h"
#include "messages/MOSDFailure.h"
#include "messages/MOSDMarkMeDown.h"
#include "messages/MOSDMarkMeDead.h"
#include "messages/MOSDFull.h"
#include "messages/MOSDOp.h"
#include "messages/MOSDOpReply.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDBeacon.h"
#include "messages/MOSDRepOp.h"
#include "messages/MOSDRepOpReply.h"
#include "messages/MOSDBoot.h"
#include "messages/MOSDPGTemp.h"
#include "messages/MOSDPGReadyToMerge.h"

#include "messages/MOSDMap.h"
#include "messages/MMonGetOSDMap.h"
#include "messages/MOSDPGNotify.h"
#include "messages/MOSDPGNotify2.h"
#include "messages/MOSDPGQuery.h"
#include "messages/MOSDPGQuery2.h"
#include "messages/MOSDPGLog.h"
#include "messages/MOSDPGRemove.h"
#include "messages/MOSDPGInfo.h"
#include "messages/MOSDPGInfo2.h"
#include "messages/MOSDPGCreate.h"
#include "messages/MOSDPGCreate2.h"
#include "messages/MBackfillReserve.h"
#include "messages/MRecoveryReserve.h"
#include "messages/MOSDForceRecovery.h"
#include "messages/MOSDECSubOpWrite.h"
#include "messages/MOSDECSubOpWriteReply.h"
#include "messages/MOSDECSubOpRead.h"
#include "messages/MOSDECSubOpReadReply.h"
#include "messages/MOSDPGCreated.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"

#include "messages/MOSDPeeringOp.h"

#include "messages/MOSDAlive.h"

#include "messages/MOSDScrub.h"
#include "messages/MOSDScrub2.h"
#include "messages/MOSDRepScrub.h"

#include "messages/MCommand.h"
#include "messages/MCommandReply.h"

#include "messages/MPGStats.h"

#include "messages/MWatchNotify.h"
#include "messages/MOSDPGPush.h"
#include "messages/MOSDPGPushReply.h"
#include "messages/MOSDPGPull.h"

#include "messages/MMonGetPurgedSnaps.h"
#include "messages/MMonGetPurgedSnapsReply.h"

#include "common/perf_counters.h"
#include "common/Timer.h"
#include "common/LogClient.h"
#include "common/AsyncReserver.h"
#include "common/HeartbeatMap.h"
#include "common/admin_socket.h"
#include "common/ceph_context.h"

#include "global/signal_handler.h"
#include "global/pidfile.h"

#include "include/color.h"
#include "perfglue/cpu_profiler.h"
#include "perfglue/heap_profiler.h"

#include "osd/ClassHandler.h"
#include "osd/OpRequest.h"

#include "auth/AuthAuthorizeHandler.h"
#include "auth/RotatingKeyRing.h"

#include "objclass/objclass.h"

#include "common/cmdparse.h"
#include "include/str_list.h"
#include "include/util.h"

#include "include/ceph_assert.h"
#include "common/config.h"
#include "common/EventTrace.h"

#include "json_spirit/json_spirit_reader.h"
#include "json_spirit/json_spirit_writer.h"

#ifdef WITH_LTTNG
#define TRACEPOINT_DEFINE
#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#include "tracing/osd.h"
#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#undef TRACEPOINT_DEFINE
#else
#define tracepoint(...)
#endif
#ifdef HAVE_JAEGER
#include "common/tracer.h"
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_osd
#undef dout_prefix
#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())

using std::deque;
using std::list;
using std::lock_guard;
using std::make_pair;
using std::make_tuple;
using std::make_unique;
using std::map;
using std::ostream;
using std::ostringstream;
using std::pair;
using std::set;
using std::string;
using std::stringstream;
using std::to_string;
using std::unique_ptr;
using std::vector;

using ceph::bufferlist;
using ceph::bufferptr;
using ceph::decode;
using ceph::encode;
using ceph::fixed_u_to_string;
using ceph::Formatter;
using ceph::heartbeat_handle_d;
using ceph::make_mutex;

using namespace ceph::osd::scheduler;
using TOPNSPC::common::cmd_getval;

static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
  return *_dout << "osd." << whoami << " " << epoch << " ";
}

// Initial features in new superblock.
// Features here are also automatically upgraded
CompatSet OSD::get_osd_initial_compat_set() {
  CompatSet::FeatureSet ceph_osd_feature_compat;
  CompatSet::FeatureSet ceph_osd_feature_ro_compat;
  CompatSet::FeatureSet ceph_osd_feature_incompat;
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2);
  return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
                   ceph_osd_feature_incompat);
}

// Features are added here that this OSD supports.
CompatSet OSD::get_osd_compat_set() {
  CompatSet compat = get_osd_initial_compat_set();
  // Any features here can be set in code, but not in initial superblock
  compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
  return compat;
}
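
// Usage sketch for the compat sets above (illustrative only, not part of
// this file; the exact sign convention of CompatSet::compare is assumed):
// on startup the OSD validates the on-disk superblock against what this
// binary supports, roughly
//
//   CompatSet supported = OSD::get_osd_compat_set();
//   if (supported.compare(superblock.compat_features) == -1) {
//     // the store requires features this binary lacks; refuse to start
//   }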

OSDService::OSDService(OSD *osd, ceph::async::io_context_pool& poolctx) :
  osd(osd),
  cct(osd->cct),
  whoami(osd->whoami), store(osd->store),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  osd_max_object_size(cct->_conf, "osd_max_object_size"),
  osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
  publish_lock{ceph::make_mutex("OSDService::publish_lock")},
  pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
  max_oldest_map(0),
  scrubs_local(0),
  scrubs_remote(0),
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  poolctx(poolctx),
  objecter(make_unique<Objecter>(osd->client_messenger->cct,
                                 osd->objecter_messenger,
                                 osd->monc, poolctx)),
  m_objecter_finishers(cct->_conf->osd_objecter_finishers),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_timer(cct, recovery_request_lock, false),
  sleep_timer(cct, sleep_lock, false),
  reserver_finisher(cct),
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
                 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
                  cct->_conf->osd_min_recovery_priority),
  snap_reserver(cct, &reserver_finisher,
                cct->_conf->osd_max_trimming_pgs),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  cur_state(NONE),
  cur_ratio(0), physical_ratio(0),
  boot_epoch(0), up_epoch(0), bind_epoch(0)
{
  objecter->init();

  for (int i = 0; i < m_objecter_finishers; i++) {
    ostringstream str;
    str << "objecter-finisher-" << i;
    auto fin = make_unique<Finisher>(osd->client_messenger->cct, str.str(), "finisher");
    objecter_finishers.push_back(std::move(fin));
  }
}

#ifdef PG_DEBUG_REFS
void OSDService::add_pgid(spg_t pgid, PG *pg) {
  std::lock_guard l(pgid_lock);
  if (!pgid_tracker.count(pgid)) {
    live_pgs[pgid] = pg;
  }
  pgid_tracker[pgid]++;
}
void OSDService::remove_pgid(spg_t pgid, PG *pg)
{
  std::lock_guard l(pgid_lock);
  ceph_assert(pgid_tracker.count(pgid));
  ceph_assert(pgid_tracker[pgid] > 0);
  pgid_tracker[pgid]--;
  if (pgid_tracker[pgid] == 0) {
    pgid_tracker.erase(pgid);
    live_pgs.erase(pgid);
  }
}
void OSDService::dump_live_pgids()
{
  std::lock_guard l(pgid_lock);
  derr << "live pgids:" << dendl;
  for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
       i != pgid_tracker.cend();
       ++i) {
    derr << "\t" << *i << dendl;
    live_pgs[i->first]->dump_live_ids();
  }
}
#endif


ceph::signedspan OSDService::get_mnow()
{
  return ceph::mono_clock::now() - osd->startup_time;
}

void OSDService::identify_splits_and_merges(
  OSDMapRef old_map,
  OSDMapRef new_map,
  spg_t pgid,
  set<pair<spg_t,epoch_t>> *split_children,
  set<pair<spg_t,epoch_t>> *merge_pgs)
{
  if (!old_map->have_pg_pool(pgid.pool())) {
    return;
  }
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
  if (p == osd->pg_num_history.pg_nums.end()) {
    return;
  }
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
           << " to e" << new_map->get_epoch()
           << " pg_nums " << p->second << dendl;
  deque<spg_t> queue;
  queue.push_back(pgid);
  set<spg_t> did;
  while (!queue.empty()) {
    auto cur = queue.front();
    queue.pop_front();
    did.insert(cur);
    unsigned pgnum = old_pgnum;
    for (auto q = p->second.lower_bound(old_map->get_epoch());
         q != p->second.end() &&
           q->first <= new_map->get_epoch();
         ++q) {
      if (pgnum < q->second) {
        // split?
        if (cur.ps() < pgnum) {
          set<spg_t> children;
          if (cur.is_split(pgnum, q->second, &children)) {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " children " << children << dendl;
            for (auto i : children) {
              split_children->insert(make_pair(i, q->first));
              if (!did.count(i))
                queue.push_back(i);
            }
          }
        } else if (cur.ps() < q->second) {
          dout(20) << __func__ << " " << cur << " e" << q->first
                   << " pg_num " << pgnum << " -> " << q->second
                   << " is a child" << dendl;
          // normally we'd capture this from the parent, but it's
          // possible the parent doesn't exist yet (it will be
          // fabricated to allow an intervening merge).  note this PG
          // as a split child here to be sure we catch it.
          split_children->insert(make_pair(cur, q->first));
        } else {
          dout(20) << __func__ << " " << cur << " e" << q->first
                   << " pg_num " << pgnum << " -> " << q->second
                   << " is post-split, skipping" << dendl;
        }
      } else if (merge_pgs) {
        // merge?
        if (cur.ps() >= q->second) {
          if (cur.ps() < pgnum) {
            spg_t parent;
            if (cur.is_merge_source(pgnum, q->second, &parent)) {
              set<spg_t> children;
              parent.is_split(q->second, pgnum, &children);
              dout(20) << __func__ << " " << cur << " e" << q->first
                       << " pg_num " << pgnum << " -> " << q->second
                       << " is merge source, target " << parent
                       << ", source(s) " << children << dendl;
              merge_pgs->insert(make_pair(parent, q->first));
              if (!did.count(parent)) {
                // queue (and re-scan) parent in case it might not exist yet
                // and there are some future splits pending on it
                queue.push_back(parent);
              }
              for (auto c : children) {
                merge_pgs->insert(make_pair(c, q->first));
                if (!did.count(c))
                  queue.push_back(c);
              }
            }
          } else {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " is beyond old pgnum, skipping" << dendl;
          }
        } else {
          set<spg_t> children;
          if (cur.is_split(q->second, pgnum, &children)) {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " is merge target, source " << children << dendl;
            for (auto c : children) {
              merge_pgs->insert(make_pair(c, q->first));
              if (!did.count(c))
                queue.push_back(c);
            }
            merge_pgs->insert(make_pair(cur, q->first));
          }
        }
      }
      pgnum = q->second;
    }
  }
}
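
// Worked example for the traversal above (assuming the usual hash-split
// rule, where a child of seed ps under a pg_num increase old -> new is
// ps + k*old for k >= 1 while ps + k*old < new): growing a pool from
// pg_num 8 to 16 splits pg 3.4 into {3.4, 3.c} (0x4 + 8 = 0xc), so 3.c is
// recorded in *split_children with the epoch of the change.  Shrinking
// 16 -> 8 makes 3.c a merge source whose target is 3.4; both the target
// and the source(s) land in *merge_pgs so each can be primed before the
// merge epoch.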

void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}

HeartbeatStampsRef OSDService::get_hb_stamps(unsigned peer)
{
  std::lock_guard l(hb_stamp_lock);
  if (peer >= hb_stamps.size()) {
    hb_stamps.resize(peer + 1);
  }
  if (!hb_stamps[peer]) {
    hb_stamps[peer] = ceph::make_ref<HeartbeatStamps>(peer);
  }
  return hb_stamps[peer];
}

void OSDService::queue_renew_lease(epoch_t epoch, spg_t spgid)
{
  osd->enqueue_peering_evt(
    spgid,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
        epoch, epoch,
        RenewLease())));
}

void OSDService::start_shutdown()
{
  {
    std::lock_guard l(agent_timer_lock);
    agent_timer.shutdown();
  }

  {
    std::lock_guard l(sleep_lock);
    sleep_timer.shutdown();
  }

  {
    std::lock_guard l(recovery_request_lock);
    recovery_request_timer.shutdown();
  }
}

void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}

void OSDService::shutdown()
{
  mono_timer.suspend();

  {
    std::lock_guard l(watch_lock);
    watch_timer.shutdown();
  }

  objecter->shutdown();
  for (auto& f : objecter_finishers) {
    f->wait_for_empty();
    f->stop();
  }

  publish_map(OSDMapRef());
  next_osdmap = OSDMapRef();
}

void OSDService::init()
{
  reserver_finisher.start();
  for (auto& f : objecter_finishers) {
    f->start();
  }
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();
  mono_timer.resume();

  agent_thread.create("osd_srv_agent");

  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}

void OSDService::final_init()
{
  objecter->start(osdmap.get());
}

void OSDService::activate_map()
{
  // wake/unwake the tiering agent
  std::lock_guard l{agent_lock};
  agent_active =
    !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
    osd->is_active();
  agent_cond.notify_all();
}

void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}


class AgentTimeoutCB : public Context {
  PGRef pg;
public:
  explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
  void finish(int) override {
    pg->agent_choose_mode_restart();
  }
};

void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock agent_locker{agent_lock};

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.wait(agent_locker);
      continue;
    }
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
             << " tiers " << agent_queue.size()
             << ", top is " << level
             << " with pgs " << top.size()
             << ", ops " << agent_ops << "/"
             << cct->_conf->osd_agent_max_ops
             << (agent_active ? " active" : " NOT ACTIVE")
             << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.wait(agent_locker);
      continue;
    }

    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
             << " agent_ops " << agent_ops
             << " flush_quota " << agent_flush_quota << dendl;
    agent_locker.unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->pg_id
               << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
               << " seconds" << dendl;

      logger->inc(l_osd_tier_delay);
      // queue a timer to call agent_choose_mode for this pg after
      // osd_agent_delay_time seconds
      std::lock_guard timer_locker{agent_timer_lock};
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
    }
    agent_locker.lock();
  }
  dout(10) << __func__ << " finish" << dendl;
}
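
// Quota arithmetic for the loop above, using the (assumed) defaults
// osd_agent_max_ops=4 and osd_agent_max_low_ops=2 purely for illustration:
// with agent_ops=1 in flight, a pg in high-speed flush mode
// (flush_mode_high_count > 0) gets agent_flush_quota = 4 - 1 = 3,
// otherwise the low-speed budget applies and agent_flush_quota = 2 - 1 = 1.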

void OSDService::agent_stop()
{
  {
    std::lock_guard l(agent_lock);

    // By this time all ops should be cancelled
    ceph_assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
      ceph_abort_msg("agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.notify_all();
  }
  agent_thread.join();
}

// -------------------------------------

void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
           << obj << " objects and " << byte_u_t(bytes) << "; target "
           << target_obj_sec << " obj/sec or "
           << byte_u_t(target_bytes_sec) << "/sec"
           << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = std::max<uint64_t>(bytes / obj, 1);
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
             << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = std::min(po, pb);
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;
  } else {
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = std::max(new_prob, min_prob);
  new_prob = std::min(new_prob, 1000u);

  // adjust
  prob = (prob + new_prob) / 2;
  prob = std::max(prob, min_prob);
  prob = std::min(prob, 1000u);
  dout(10) << __func__ << " actual " << actual
           << ", actual/prob ratio " << ratio
           << ", adjusted new_prob " << new_prob
           << ", prob " << promote_probability_millis << " -> " << prob
           << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
}
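
// Worked example of the recalibration above (illustrative numbers only):
// with target_obj_sec=25, dur=1.0s and attempts=100,
// po = 25 * 1.0 * 1000 / 100 = 250 millis, i.e. promote roughly 25% of
// attempts.  If the measured rate ran hot -- say actual=500 against
// prob=250, so ratio=2.0 -- new_prob is halved to compensate before
// being averaged into prob and clamped to [min_prob, 1000].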

// -------------------------------------

float OSDService::get_failsafe_full_ratio()
{
  float full_ratio = cct->_conf->osd_failsafe_full_ratio;
  if (full_ratio > 1.0) full_ratio /= 100.0;
  return full_ratio;
}

OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
{
  // The OSDMap ratios take precedence.  So if the failsafe is .95 and
  // the admin sets the cluster full to .96, the failsafe moves up to .96
  // too.  (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the cluster appears full.)
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
    return NONE;
  }
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < ceph_release_t::luminous) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
             backfillfull_ratio <= 0 ||
             nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use failsafe flag.  ick.  the monitor did something wrong or the user
    // did something stupid.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  }

  if (injectfull_state > NONE && injectfull) {
    inject = "(Injected)";
    return injectfull_state;
  } else if (pratio > failsafe_ratio) {
    return FAILSAFE;
  } else if (ratio > full_ratio) {
    return FULL;
  } else if (ratio > backfillfull_ratio) {
    return BACKFILLFULL;
  } else if (pratio > nearfull_ratio) {
    return NEARFULL;
  }
  return NONE;
}
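
// Example with commonly used ratios (these are configurable; the values
// here are only illustrative): nearfull .85 <= backfillfull .90 <=
// full .95 <= failsafe .97, an ordering the std::max() clamping above
// guarantees.  A logical usage ratio of .92 then maps to BACKFILLFULL,
// while a physical ratio of .98 trips FAILSAFE regardless of the logical
// ratio.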

void OSDService::check_full_status(float ratio, float pratio)
{
  std::lock_guard l(full_status_lock);

  cur_ratio = ratio;
  physical_ratio = pratio;

  string inject;
  s_names new_state;
  new_state = recalc_full_state(ratio, pratio, inject);

  dout(20) << __func__ << " cur ratio " << ratio
           << ", physical ratio " << pratio
           << ", new state " << get_full_state_name(new_state)
           << " " << inject
           << dendl;

  // warn
  if (cur_state != new_state) {
    dout(10) << __func__ << " " << get_full_state_name(cur_state)
             << " -> " << get_full_state_name(new_state) << dendl;
    if (new_state == FAILSAFE) {
      clog->error() << "full status failsafe engaged, dropping updates, now "
                    << (int)roundf(ratio * 100) << "% full";
    } else if (cur_state == FAILSAFE) {
      clog->error() << "full status failsafe disengaged, no longer dropping "
                    << "updates, now " << (int)roundf(ratio * 100) << "% full";
    }
    cur_state = new_state;
  }
}

bool OSDService::need_fullness_update()
{
  OSDMapRef osdmap = get_osdmap();
  s_names cur = NONE;
  if (osdmap->exists(whoami)) {
    if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
      cur = FULL;
    } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
      cur = BACKFILLFULL;
    } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
      cur = NEARFULL;
    }
  }
  s_names want = NONE;
  if (is_full())
    want = FULL;
  else if (is_backfillfull())
    want = BACKFILLFULL;
  else if (is_nearfull())
    want = NEARFULL;
  return want != cur;
}

bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
{
  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to return failsafe full
    // or if -1 then always return full
    if (injectfull > 0)
      --injectfull;
    ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
                       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
                       << dendl;
    return true;
  }
  return false;
}

bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
{
  std::lock_guard l(full_status_lock);

  if (_check_inject_full(dpp, type))
    return true;

  if (cur_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
                       << " physical " << physical_ratio << dendl;

  return cur_state >= type;
}

bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
{
  ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
  {
    std::lock_guard l(full_status_lock);
    if (_check_inject_full(dpp, type)) {
      return true;
    }
  }

  float pratio;
  float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);

  string notused;
  s_names tentative_state = recalc_full_state(ratio, pratio, notused);

  if (tentative_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;

  return tentative_state >= type;
}

bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FAILSAFE);
}

bool OSDService::check_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FULL);
}

bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
{
  return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
}

bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, BACKFILLFULL);
}

bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, NEARFULL);
}

bool OSDService::is_failsafe_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state == FAILSAFE;
}

bool OSDService::is_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= FULL;
}

bool OSDService::is_backfillfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}

bool OSDService::is_nearfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= NEARFULL;
}

void OSDService::set_injectfull(s_names type, int64_t count)
{
  std::lock_guard l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}

void OSDService::set_statfs(const struct store_statfs_t &stbuf,
                            osd_alert_list_t& alerts)
{
  uint64_t bytes = stbuf.total;
  uint64_t avail = stbuf.available;
  uint64_t used = stbuf.get_used_raw();

  // For testing, fake the statfs values so it doesn't matter whether
  // all OSDs are using the same partition.
  if (cct->_conf->fake_statfs_for_testing) {
    uint64_t total_num_bytes = 0;
    vector<PGRef> pgs;
    osd->_get_pgs(&pgs);
    for (auto p : pgs) {
      total_num_bytes += p->get_stats_num_bytes();
    }
    bytes = cct->_conf->fake_statfs_for_testing;
    if (total_num_bytes < bytes)
      avail = bytes - total_num_bytes;
    else
      avail = 0;
    dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
            << " adjust available " << avail
            << dendl;
    used = bytes - avail;
  }

  logger->set(l_osd_stat_bytes, bytes);
  logger->set(l_osd_stat_bytes_used, used);
  logger->set(l_osd_stat_bytes_avail, avail);

  std::lock_guard l(stat_lock);
  osd_stat.statfs = stbuf;
  osd_stat.os_alerts.clear();
  osd_stat.os_alerts[whoami].swap(alerts);
  if (cct->_conf->fake_statfs_for_testing) {
    osd_stat.statfs.total = bytes;
    osd_stat.statfs.available = avail;
    // For testing we don't want used to go negative, so clear reserved
    osd_stat.statfs.internally_reserved = 0;
  }
}

osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
                                    int num_pgs)
{
  utime_t now = ceph_clock_now();
  auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
  std::lock_guard l(stat_lock);
  osd_stat.hb_peers.swap(hb_peers);
  osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
  osd_stat.num_pgs = num_pgs;
  // Clean entries that aren't updated
  // This is called often enough that we can just remove 1 at a time
  for (auto i : osd_stat.hb_pingtime) {
    if (i.second.last_update == 0)
      continue;
    if (stale_time && now.sec() - i.second.last_update > stale_time) {
      dout(20) << __func__ << " time out heartbeat for osd " << i.first
               << " last_update " << i.second.last_update << dendl;
      osd_stat.hb_pingtime.erase(i.first);
      break;
    }
  }
  return osd_stat;
}
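
// Note on the pruning loop above: at most one stale entry is erased per
// call (the break keeps the range-for iteration valid).  set_osd_stat()
// runs often enough that, assuming the default osd_mon_heartbeat_stat_stale
// of one hour, cleanup amortizes across calls instead of scanning the
// whole map at once.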

void OSDService::inc_osd_stat_repaired()
{
  std::lock_guard l(stat_lock);
  osd_stat.num_shards_repaired++;
  return;
}

float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
                                         uint64_t adjust_used)
{
  *pratio =
    ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);

  if (adjust_used) {
    dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
    if (new_stat.statfs.available > adjust_used)
      new_stat.statfs.available -= adjust_used;
    else
      new_stat.statfs.available = 0;
    dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
  }

  // Check all pgs and adjust kb_used to include all pending backfill data
  int backfill_adjusted = 0;
  vector<PGRef> pgs;
  osd->_get_pgs(&pgs);
  for (auto p : pgs) {
    backfill_adjusted += p->pg_stat_adjust(&new_stat);
  }
  if (backfill_adjusted) {
    dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
  }
  return ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);
}

void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    m->put();
    release_map(next_map);
    return;
  }
  ConnectionRef peer_con;
  if (peer == whoami) {
    peer_con = osd->cluster_messenger->get_loopback_connection();
  } else {
    peer_con = osd->cluster_messenger->connect_to_osd(
      next_map->get_cluster_addrs(peer), false, true);
  }
  maybe_share_map(peer_con.get(), next_map);
  peer_con->send_message(m);
  release_map(next_map);
}

void OSDService::send_message_osd_cluster(std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  for (auto& iter : messages) {
    if (next_map->is_down(iter.first) ||
        next_map->get_info(iter.first).up_from > from_epoch) {
      iter.second->put();
      continue;
    }
    ConnectionRef peer_con;
    if (iter.first == whoami) {
      peer_con = osd->cluster_messenger->get_loopback_connection();
    } else {
      peer_con = osd->cluster_messenger->connect_to_osd(
        next_map->get_cluster_addrs(iter.first), false, true);
    }
    maybe_share_map(peer_con.get(), next_map);
    peer_con->send_message(iter.second);
  }
  release_map(next_map);
}

ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return NULL;
  }
  ConnectionRef con;
  if (peer == whoami) {
    con = osd->cluster_messenger->get_loopback_connection();
  } else {
    con = osd->cluster_messenger->connect_to_osd(
      next_map->get_cluster_addrs(peer), false, true);
  }
  release_map(next_map);
  return con;
}

pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  pair<ConnectionRef,ConnectionRef> ret;
  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return ret;
  }
  ret.first = osd->hb_back_client_messenger->connect_to_osd(
    next_map->get_hb_back_addrs(peer));
  ret.second = osd->hb_front_client_messenger->connect_to_osd(
    next_map->get_hb_front_addrs(peer));
  release_map(next_map);
  return ret;
}

entity_name_t OSDService::get_cluster_msgr_name() const
{
  return cluster_messenger->get_myname();
}

void OSDService::queue_want_pg_temp(pg_t pgid,
                                    const vector<int>& want,
                                    bool forced)
{
  std::lock_guard l(pg_temp_lock);
  auto p = pg_temp_pending.find(pgid);
  if (p == pg_temp_pending.end() ||
      p->second.acting != want ||
      forced) {
    pg_temp_wanted[pgid] = {want, forced};
  }
}

void OSDService::remove_want_pg_temp(pg_t pgid)
{
  std::lock_guard l(pg_temp_lock);
  pg_temp_wanted.erase(pgid);
  pg_temp_pending.erase(pgid);
}

void OSDService::_sent_pg_temp()
{
#ifdef HAVE_STDLIB_MAP_SPLICING
  pg_temp_pending.merge(pg_temp_wanted);
#else
  pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
                         make_move_iterator(end(pg_temp_wanted)));
#endif
  pg_temp_wanted.clear();
}

void OSDService::requeue_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  // wanted overrides pending.  note that remove_want_pg_temp
  // clears the item out of both.
  unsigned old_wanted = pg_temp_wanted.size();
  unsigned old_pending = pg_temp_pending.size();
  _sent_pg_temp();
  pg_temp_wanted.swap(pg_temp_pending);
  dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
           << pg_temp_wanted.size() << dendl;
}

std::ostream& operator<<(std::ostream& out,
                         const OSDService::pg_temp_t& pg_temp)
{
  out << pg_temp.acting;
  if (pg_temp.forced) {
    out << " (forced)";
  }
  return out;
}

void OSDService::send_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  if (pg_temp_wanted.empty())
    return;
  dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
  MOSDPGTemp *ms[2] = {nullptr, nullptr};
  for (auto& [pgid, pg_temp] : pg_temp_wanted) {
    auto& m = ms[pg_temp.forced];
    if (!m) {
      m = new MOSDPGTemp(osdmap->get_epoch());
      m->forced = pg_temp.forced;
    }
    m->pg_temp.emplace(pgid, pg_temp.acting);
  }
  for (auto m : ms) {
    if (m) {
      monc->send_mon_message(m);
    }
  }
  _sent_pg_temp();
}
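
// Example for the batching above: pg_temp_wanted = { 1.0 -> [0,1,2],
// 1.1 -> [2,3,4] (forced) } produces two MOSDPGTemp messages, ms[0] with
// forced=false carrying 1.0 and ms[1] with forced=true carrying 1.1, so
// the mon can tell forced mappings apart; both entries then move to
// pg_temp_pending via _sent_pg_temp().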

void OSDService::send_pg_created(pg_t pgid)
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  if (o->require_osd_release >= ceph_release_t::luminous) {
    pg_created.insert(pgid);
    monc->send_mon_message(new MOSDPGCreated(pgid));
  }
}

void OSDService::send_pg_created()
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  if (o->require_osd_release >= ceph_release_t::luminous) {
    for (auto pgid : pg_created) {
      monc->send_mon_message(new MOSDPGCreated(pgid));
    }
  }
}

void OSDService::prune_pg_created()
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  auto i = pg_created.begin();
  while (i != pg_created.end()) {
    auto p = o->get_pg_pool(i->pool());
    if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
      dout(20) << __func__ << " pruning " << *i << dendl;
      i = pg_created.erase(i);
    } else {
      dout(20) << __func__ << " keeping " << *i << dendl;
      ++i;
    }
  }
}


// --------------------------------------
// dispatch

bool OSDService::can_inc_scrubs()
{
  bool can_inc = false;
  std::lock_guard l(sched_scrub_lock);

  if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
    dout(20) << __func__ << " == true " << scrubs_local << " local + " << scrubs_remote
             << " remote < max " << cct->_conf->osd_max_scrubs << dendl;
    can_inc = true;
  } else {
    dout(20) << __func__ << " == false " << scrubs_local << " local + " << scrubs_remote
             << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
  }

  return can_inc;
}

bool OSDService::inc_scrubs_local()
{
  bool result = false;
  std::lock_guard l{sched_scrub_lock};
  if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
    dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local+1)
             << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
    result = true;
    ++scrubs_local;
  } else {
    dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
  }
  return result;
}

void OSDService::dec_scrubs_local()
{
  std::lock_guard l{sched_scrub_lock};
  dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local-1)
           << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
  --scrubs_local;
  ceph_assert(scrubs_local >= 0);
}

bool OSDService::inc_scrubs_remote()
{
  bool result = false;
  std::lock_guard l{sched_scrub_lock};
  if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
    dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote+1)
             << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
    result = true;
    ++scrubs_remote;
  } else {
    dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
  }
  return result;
}

void OSDService::dec_scrubs_remote()
{
  std::lock_guard l{sched_scrub_lock};
  dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote-1)
           << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
  --scrubs_remote;
  ceph_assert(scrubs_remote >= 0);
}

void OSDService::dump_scrub_reservations(Formatter *f)
{
  std::lock_guard l{sched_scrub_lock};
  f->dump_int("scrubs_local", scrubs_local);
  f->dump_int("scrubs_remote", scrubs_remote);
  f->dump_int("osd_max_scrubs", cct->_conf->osd_max_scrubs);
}

void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
                                 epoch_t *_bind_epoch) const
{
  std::lock_guard l(epoch_lock);
  if (_boot_epoch)
    *_boot_epoch = boot_epoch;
  if (_up_epoch)
    *_up_epoch = up_epoch;
  if (_bind_epoch)
    *_bind_epoch = bind_epoch;
}

void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
                            const epoch_t *_bind_epoch)
{
  std::lock_guard l(epoch_lock);
  if (_boot_epoch) {
    ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
    boot_epoch = *_boot_epoch;
  }
  if (_up_epoch) {
    ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
    up_epoch = *_up_epoch;
  }
  if (_bind_epoch) {
    ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
    bind_epoch = *_bind_epoch;
  }
}

bool OSDService::prepare_to_stop()
{
  std::unique_lock l(is_stopping_lock);
  if (get_state() != NOT_STOPPING)
    return false;

  OSDMapRef osdmap = get_osdmap();
  if (osdmap && osdmap->is_up(whoami)) {
    dout(0) << __func__ << " telling mon we are shutting down" << dendl;
    set_state(PREPARING_TO_STOP);
    monc->send_mon_message(
      new MOSDMarkMeDown(
        monc->get_fsid(),
        whoami,
        osdmap->get_addrs(whoami),
        osdmap->get_epoch(),
        true  // request ack
        ));
    const auto timeout = ceph::make_timespan(cct->_conf->osd_mon_shutdown_timeout);
    is_stopping_cond.wait_for(l, timeout,
                              [this] { return get_state() == STOPPING; });
  }
  dout(0) << __func__ << " starting shutdown" << dendl;
  set_state(STOPPING);
  return true;
}

void OSDService::got_stop_ack()
{
  std::scoped_lock l(is_stopping_lock);
  if (get_state() == PREPARING_TO_STOP) {
    dout(0) << __func__ << " starting shutdown" << dendl;
    set_state(STOPPING);
    is_stopping_cond.notify_all();
  } else {
    dout(10) << __func__ << " ignoring msg" << dendl;
  }
}

MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
                                               OSDSuperblock& sblock)
{
  MOSDMap *m = new MOSDMap(monc->get_fsid(),
                           osdmap->get_encoding_features());
  m->oldest_map = max_oldest_map;
  m->newest_map = sblock.newest_map;

  int max = cct->_conf->osd_map_message_max;
  ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;

  if (since < m->oldest_map) {
    // we don't have the next map the target wants, so start with a
    // full map.
    bufferlist bl;
    dout(10) << __func__ << " oldest map " << max_oldest_map << " > since "
             << since << ", starting with full map" << dendl;
    since = m->oldest_map;
    if (!get_map_bl(since, bl)) {
      derr << __func__ << " missing full map " << since << dendl;
      goto panic;
    }
    max--;
    max_bytes -= bl.length();
    m->maps[since] = std::move(bl);
  }
  for (epoch_t e = since + 1; e <= to; ++e) {
    bufferlist bl;
    if (get_inc_map_bl(e, bl)) {
      // account for the bytes before std::move() empties bl
      max_bytes -= bl.length();
      m->incremental_maps[e] = std::move(bl);
    } else {
      dout(10) << __func__ << " missing incremental map " << e << dendl;
      if (!get_map_bl(e, bl)) {
        derr << __func__ << " also missing full map " << e << dendl;
        goto panic;
      }
      max_bytes -= bl.length();
      m->maps[e] = std::move(bl);
    }
    max--;
    if (max <= 0 || max_bytes <= 0) {
      break;
    }
  }
  return m;

 panic:
  if (!m->maps.empty() ||
      !m->incremental_maps.empty()) {
    // send what we have so far
    return m;
  }
  // send something
  bufferlist bl;
  if (get_inc_map_bl(m->newest_map, bl)) {
    m->incremental_maps[m->newest_map] = std::move(bl);
  } else {
    derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
    if (!get_map_bl(m->newest_map, bl)) {
      derr << __func__ << " unable to load latest full map " << m->newest_map
           << dendl;
      ceph_abort();
    }
    m->maps[m->newest_map] = std::move(bl);
  }
  return m;
}
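
// Budget sketch for the builder above, assuming the defaults
// osd_map_message_max=40 and osd_map_message_max_bytes=10 MiB (values
// shown only for illustration): a peer 100 epochs behind gets at most 40
// incrementals per message, or fewer if the byte budget trips first, and
// is expected to ask again for the remainder.  Because the budget is
// checked only after an epoch is added, a single oversized map is still
// sent rather than producing an empty message.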

void OSDService::send_map(MOSDMap *m, Connection *con)
{
  con->send_message(m);
}

void OSDService::send_incremental_map(epoch_t since, Connection *con,
                                      const OSDMapRef& osdmap)
{
  epoch_t to = osdmap->get_epoch();
  dout(10) << "send_incremental_map " << since << " -> " << to
           << " to " << con << " " << con->get_peer_addr() << dendl;

  MOSDMap *m = NULL;
  while (!m) {
    OSDSuperblock sblock(get_superblock());
    if (since < sblock.oldest_map) {
      // just send latest full map
      MOSDMap *m = new MOSDMap(monc->get_fsid(),
                               osdmap->get_encoding_features());
      m->oldest_map = max_oldest_map;
      m->newest_map = sblock.newest_map;
      get_map_bl(to, m->maps[to]);
      send_map(m, con);
      return;
    }

    if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
      dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
               << ", only sending most recent" << dendl;
      since = to - cct->_conf->osd_map_share_max_epochs;
    }

    m = build_incremental_map_msg(since, to, sblock);
  }
  send_map(m, con);
}

bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
{
  bool found = map_bl_cache.lookup(e, &bl);
  if (found) {
    logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
                      OSD::get_osdmap_pobject_name(e), 0, 0, bl,
                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_bl(e, bl);
  }
  return found;
}

bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
{
  std::lock_guard l(map_cache_lock);
  bool found = map_bl_inc_cache.lookup(e, &bl);
  if (found) {
    logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
                      OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_inc_bl(e, bl);
  }
  return found;
}

void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_cache.add(e, bl);
}

void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_inc_cache.add(e, bl);
}

OSDMapRef OSDService::_add_map(OSDMap *o)
{
  epoch_t e = o->get_epoch();

  if (cct->_conf->osd_map_dedup) {
    // Dedup against an existing map at a nearby epoch
    OSDMapRef for_dedup = map_cache.lower_bound(e);
    if (for_dedup) {
      OSDMap::dedup(for_dedup.get(), o);
    }
  }
  bool existed;
  OSDMapRef l = map_cache.add(e, o, &existed);
  if (existed) {
    delete o;
  }
  return l;
}
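
// Dedup note: OSDMap::dedup() is assumed to share unchanged bulky members
// (e.g. the CRUSH map) between the two epochs, so caching
// osd_map_cache_size nearby maps costs far less memory than that many
// fully independent copies.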

OSDMapRef OSDService::try_get_map(epoch_t epoch)
{
  std::lock_guard l(map_cache_lock);
  OSDMapRef retval = map_cache.lookup(epoch);
  if (retval) {
    dout(30) << "get_map " << epoch << " -cached" << dendl;
    logger->inc(l_osd_map_cache_hit);
    return retval;
  }
  {
    logger->inc(l_osd_map_cache_miss);
    epoch_t lb = map_cache.cached_key_lower_bound();
    if (epoch < lb) {
      dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
      logger->inc(l_osd_map_cache_miss_low);
      logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
    }
  }

  OSDMap *map = new OSDMap;
  if (epoch > 0) {
    dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
    bufferlist bl;
    if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
      derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
      delete map;
      return OSDMapRef();
    }
    map->decode(bl);
  } else {
    dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
  }
  return _add_map(map);
}
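
// Cache note: misses below map_cache.cached_key_lower_bound() are counted
// separately (l_osd_map_cache_miss_low, plus the _avg counter recording
// the distance) because they ask for maps older than anything the LRU
// still holds -- the slow path, a full read and decode from the store.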
1627
1628 // ops
1629
1630
1631 void OSDService::reply_op_error(OpRequestRef op, int err)
1632 {
1633 reply_op_error(op, err, eversion_t(), 0, {});
1634 }
1635
1636 void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
1637 version_t uv,
1638 vector<pg_log_op_return_item_t> op_returns)
1639 {
1640 auto m = op->get_req<MOSDOp>();
1641 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
1642 int flags;
1643 flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
1644
1645 MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
1646 !m->has_flag(CEPH_OSD_FLAG_RETURNVEC));
1647 reply->set_reply_versions(v, uv);
1648 reply->set_op_returns(op_returns);
1649 m->get_connection()->send_message(reply);
1650 }
1651
1652 void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
1653 {
1654 if (!cct->_conf->osd_debug_misdirected_ops) {
1655 return;
1656 }
1657
1658 auto m = op->get_req<MOSDOp>();
1659 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
1660
1661 ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);
1662
1663 if (pg->is_ec_pg()) {
1664 /**
1665 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1666 * can get this result:
1667 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1668 * [CRUSH_ITEM_NONE, 2, 3]/3
1669 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1670 * [3, 2, 3]/3
1671 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1672 * -- misdirected op
1673 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1674 * it and fulfils it
1675 *
1676 * We can't compute the op target based on the sending map epoch due to
1677 * splitting. The simplest thing is to detect such cases here and drop
1678 * them without an error (the client will resend anyway).
1679 */
1680 ceph_assert(m->get_map_epoch() <= superblock.newest_map);
1681 OSDMapRef opmap = try_get_map(m->get_map_epoch());
1682 if (!opmap) {
1683 dout(7) << __func__ << ": " << *pg << " no longer have map for "
1684 << m->get_map_epoch() << ", dropping" << dendl;
1685 return;
1686 }
1687 pg_t _pgid = m->get_raw_pg();
1688 spg_t pgid;
1689 if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
1690 _pgid = opmap->raw_pg_to_pg(_pgid);
1691 if (opmap->get_primary_shard(_pgid, &pgid) &&
1692 pgid.shard != pg->pg_id.shard) {
1693 dout(7) << __func__ << ": " << *pg << " primary changed since "
1694 << m->get_map_epoch() << ", dropping" << dendl;
1695 return;
1696 }
1697 }
1698
1699 dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
1700 clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
1701 << " pg " << m->get_raw_pg()
1702 << " to osd." << whoami
1703 << " not " << pg->get_acting()
1704 << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
1705 }
1706
1707 void OSDService::enqueue_back(OpSchedulerItem&& qi)
1708 {
1709 osd->op_shardedwq.queue(std::move(qi));
1710 }
1711
1712 void OSDService::enqueue_front(OpSchedulerItem&& qi)
1713 {
1714 osd->op_shardedwq.queue_front(std::move(qi));
1715 }
1716
1717 void OSDService::queue_recovery_context(
1718 PG *pg,
1719 GenContext<ThreadPool::TPHandle&> *c)
1720 {
1721 epoch_t e = get_osdmap_epoch();
1722 enqueue_back(
1723 OpSchedulerItem(
1724 unique_ptr<OpSchedulerItem::OpQueueable>(
1725 new PGRecoveryContext(pg->get_pgid(), c, e)),
1726 cct->_conf->osd_recovery_cost,
1727 cct->_conf->osd_recovery_priority,
1728 ceph_clock_now(),
1729 0,
1730 e));
1731 }
1732
1733 void OSDService::queue_for_snap_trim(PG *pg)
1734 {
1735 dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
1736 enqueue_back(
1737 OpSchedulerItem(
1738 unique_ptr<OpSchedulerItem::OpQueueable>(
1739 new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
1740 cct->_conf->osd_snap_trim_cost,
1741 cct->_conf->osd_snap_trim_priority,
1742 ceph_clock_now(),
1743 0,
1744 pg->get_osdmap_epoch()));
1745 }
1746
1747 template <class MSG_TYPE>
1748 void OSDService::queue_scrub_event_msg(PG* pg,
1749 Scrub::scrub_prio_t with_priority,
1750 unsigned int qu_priority)
1751 {
1752 const auto epoch = pg->get_osdmap_epoch();
1753 auto msg = new MSG_TYPE(pg->get_pgid(), epoch);
1754 dout(15) << "queue a scrub event (" << *msg << ") for " << *pg << ". Epoch: " << epoch << dendl;
1755
1756 enqueue_back(OpSchedulerItem(
1757 unique_ptr<OpSchedulerItem::OpQueueable>(msg), cct->_conf->osd_scrub_cost,
1758 pg->scrub_requeue_priority(with_priority, qu_priority), ceph_clock_now(), 0, epoch));
1759 }
1760
1761 template <class MSG_TYPE>
1762 void OSDService::queue_scrub_event_msg(PG* pg, Scrub::scrub_prio_t with_priority)
1763 {
1764 const auto epoch = pg->get_osdmap_epoch();
1765 auto msg = new MSG_TYPE(pg->get_pgid(), epoch);
1766 dout(15) << "queue a scrub event (" << *msg << ") for " << *pg << ". Epoch: " << epoch << dendl;
1767
1768 enqueue_back(OpSchedulerItem(
1769 unique_ptr<OpSchedulerItem::OpQueueable>(msg), cct->_conf->osd_scrub_cost,
1770 pg->scrub_requeue_priority(with_priority), ceph_clock_now(), 0, epoch));
1771 }
1772
1773 void OSDService::queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority)
1774 {
1775 queue_scrub_event_msg<PGScrub>(pg, with_priority);
1776 }
1777
1778 void OSDService::queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority)
1779 {
1780 queue_scrub_event_msg<PGScrubAfterRepair>(pg, with_priority);
1781 }
1782
1783 void OSDService::queue_for_rep_scrub(PG* pg,
1784 Scrub::scrub_prio_t with_priority,
1785 unsigned int qu_priority)
1786 {
1787 queue_scrub_event_msg<PGRepScrub>(pg, with_priority, qu_priority);
1788 }
1789
1790 void OSDService::queue_for_rep_scrub_resched(PG* pg,
1791 Scrub::scrub_prio_t with_priority,
1792 unsigned int qu_priority)
1793 {
1794 // Resulting scrub event: 'SchedReplica'
1795 queue_scrub_event_msg<PGRepScrubResched>(pg, with_priority, qu_priority);
1796 }
1797
1798 void OSDService::queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority)
1799 {
1800 // Resulting scrub event: 'RemotesReserved'
1801 queue_scrub_event_msg<PGScrubResourcesOK>(pg, with_priority);
1802 }
1803
1804 void OSDService::queue_for_scrub_denied(PG* pg, Scrub::scrub_prio_t with_priority)
1805 {
1806 // Resulting scrub event: 'ReservationFailure'
1807 queue_scrub_event_msg<PGScrubDenied>(pg, with_priority);
1808 }
1809
1810 void OSDService::queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority)
1811 {
1812 // Resulting scrub event: 'InternalSchedScrub'
1813 queue_scrub_event_msg<PGScrubResched>(pg, with_priority);
1814 }
1815
1816 void OSDService::queue_scrub_pushes_update(PG* pg, Scrub::scrub_prio_t with_priority)
1817 {
1818 // Resulting scrub event: 'ActivePushesUpd'
1819 queue_scrub_event_msg<PGScrubPushesUpdate>(pg, with_priority);
1820 }
1821
1822 void OSDService::queue_scrub_applied_update(PG* pg, Scrub::scrub_prio_t with_priority)
1823 {
1824 queue_scrub_event_msg<PGScrubAppliedUpdate>(pg, with_priority);
1825 }
1826
1827 void OSDService::queue_scrub_unblocking(PG* pg, Scrub::scrub_prio_t with_priority)
1828 {
1829 // Resulting scrub event: 'Unblocked'
1830 queue_scrub_event_msg<PGScrubUnblocked>(pg, with_priority);
1831 }
1832
1833 void OSDService::queue_scrub_digest_update(PG* pg, Scrub::scrub_prio_t with_priority)
1834 {
1835 // Resulting scrub event: 'DigestUpdate'
1836 queue_scrub_event_msg<PGScrubDigestUpdate>(pg, with_priority);
1837 }
1838
1839 void OSDService::queue_scrub_got_repl_maps(PG* pg, Scrub::scrub_prio_t with_priority)
1840 {
1841 // Resulting scrub event: 'GotReplicas'
1842 queue_scrub_event_msg<PGScrubGotReplMaps>(pg, with_priority);
1843 }
1844
1845 void OSDService::queue_scrub_replica_pushes(PG *pg, Scrub::scrub_prio_t with_priority)
1846 {
1847 // Resulting scrub event: 'ReplicaPushesUpd'
1848 queue_scrub_event_msg<PGScrubReplicaPushes>(pg, with_priority);
1849 }
1850
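// Queue asynchronous deletion of a PG at the configured PG-delete cost and
// priority.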
1851 void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
1852 {
1853 dout(10) << __func__ << " on " << pgid << " e " << e << dendl;
1854 enqueue_back(
1855 OpSchedulerItem(
1856 unique_ptr<OpSchedulerItem::OpQueueable>(
1857 new PGDelete(pgid, e)),
1858 cct->_conf->osd_pg_delete_cost,
1859 cct->_conf->osd_pg_delete_priority,
1860 ceph_clock_now(),
1861 0,
1862 e));
1863 }
1864
1865 bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
1866 {
1867 return osd->try_finish_pg_delete(pg, old_pg_num);
1868 }
1869
1870 // ---
1871
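// PG merge readiness bookkeeping: source and target PGs register (or
// retract) readiness under merge_lock, and every state change may trigger a
// MOSDPGReadyToMerge message to the monitors via _send_ready_to_merge().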
1872 void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
1873 {
1874 std::lock_guard l(merge_lock);
1875 dout(10) << __func__ << " " << pg->pg_id << dendl;
1876 ready_to_merge_source[pg->pg_id.pgid] = version;
1877 assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
1878 _send_ready_to_merge();
1879 }
1880
1881 void OSDService::set_ready_to_merge_target(PG *pg,
1882 eversion_t version,
1883 epoch_t last_epoch_started,
1884 epoch_t last_epoch_clean)
1885 {
1886 std::lock_guard l(merge_lock);
1887 dout(10) << __func__ << " " << pg->pg_id << dendl;
1888 ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
1889 make_tuple(version,
1890 last_epoch_started,
1891 last_epoch_clean)));
1892 assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
1893 _send_ready_to_merge();
1894 }
1895
1896 void OSDService::set_not_ready_to_merge_source(pg_t source)
1897 {
1898 std::lock_guard l(merge_lock);
1899 dout(10) << __func__ << " " << source << dendl;
1900 not_ready_to_merge_source.insert(source);
1901 assert(ready_to_merge_source.count(source) == 0);
1902 _send_ready_to_merge();
1903 }
1904
1905 void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
1906 {
1907 std::lock_guard l(merge_lock);
1908 dout(10) << __func__ << " " << target << " source " << source << dendl;
1909 not_ready_to_merge_target[target] = source;
1910 assert(ready_to_merge_target.count(target) == 0);
1911 _send_ready_to_merge();
1912 }
1913
1914 void OSDService::send_ready_to_merge()
1915 {
1916 std::lock_guard l(merge_lock);
1917 _send_ready_to_merge();
1918 }
1919
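// Expects merge_lock to be held. Sends at most one MOSDPGReadyToMerge per
// source pgid: a negative report for any source (or source of a target)
// that is not ready, and a positive one, carrying the source/target
// versions and epochs, once both halves of a merge have reported ready.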
1920 void OSDService::_send_ready_to_merge()
1921 {
1922 dout(20) << __func__
1923 << " ready_to_merge_source " << ready_to_merge_source
1924 << " not_ready_to_merge_source " << not_ready_to_merge_source
1925 << " ready_to_merge_target " << ready_to_merge_target
1926 << " not_ready_to_merge_target " << not_ready_to_merge_target
1927 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1928 << dendl;
1929 for (auto src : not_ready_to_merge_source) {
1930 if (sent_ready_to_merge_source.count(src) == 0) {
1931 monc->send_mon_message(new MOSDPGReadyToMerge(
1932 src,
1933 {}, {}, 0, 0,
1934 false,
1935 osdmap->get_epoch()));
1936 sent_ready_to_merge_source.insert(src);
1937 }
1938 }
1939 for (auto p : not_ready_to_merge_target) {
1940 if (sent_ready_to_merge_source.count(p.second) == 0) {
1941 monc->send_mon_message(new MOSDPGReadyToMerge(
1942 p.second,
1943 {}, {}, 0, 0,
1944 false,
1945 osdmap->get_epoch()));
1946 sent_ready_to_merge_source.insert(p.second);
1947 }
1948 }
1949 for (auto src : ready_to_merge_source) {
1950 if (not_ready_to_merge_source.count(src.first) ||
1951 not_ready_to_merge_target.count(src.first.get_parent())) {
1952 continue;
1953 }
1954 auto p = ready_to_merge_target.find(src.first.get_parent());
1955 if (p != ready_to_merge_target.end() &&
1956 sent_ready_to_merge_source.count(src.first) == 0) {
1957 monc->send_mon_message(new MOSDPGReadyToMerge(
1958 src.first, // source pgid
1959 src.second, // src version
1960 std::get<0>(p->second), // target version
1961 std::get<1>(p->second), // PG's last_epoch_started
1962 std::get<2>(p->second), // PG's last_epoch_clean
1963 true,
1964 osdmap->get_epoch()));
1965 sent_ready_to_merge_source.insert(src.first);
1966 }
1967 }
1968 }
1969
1970 void OSDService::clear_ready_to_merge(PG *pg)
1971 {
1972 std::lock_guard l(merge_lock);
1973 dout(10) << __func__ << " " << pg->pg_id << dendl;
1974 ready_to_merge_source.erase(pg->pg_id.pgid);
1975 ready_to_merge_target.erase(pg->pg_id.pgid);
1976 not_ready_to_merge_source.erase(pg->pg_id.pgid);
1977 not_ready_to_merge_target.erase(pg->pg_id.pgid);
1978 sent_ready_to_merge_source.erase(pg->pg_id.pgid);
1979 }
1980
1981 void OSDService::clear_sent_ready_to_merge()
1982 {
1983 std::lock_guard l(merge_lock);
1984 sent_ready_to_merge_source.clear();
1985 }
1986
1987 void OSDService::prune_sent_ready_to_merge(const OSDMapRef& osdmap)
1988 {
1989 std::lock_guard l(merge_lock);
1990 auto i = sent_ready_to_merge_source.begin();
1991 while (i != sent_ready_to_merge_source.end()) {
1992 if (!osdmap->pg_exists(*i)) {
1993 dout(10) << __func__ << " " << *i << dendl;
1994 i = sent_ready_to_merge_source.erase(i);
1995 } else {
1996 ++i;
1997 }
1998 }
1999 }
2000
2001 // ---
2002
2003 void OSDService::_queue_for_recovery(
2004 std::pair<epoch_t, PGRef> p,
2005 uint64_t reserved_pushes)
2006 {
2007 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
2008 enqueue_back(
2009 OpSchedulerItem(
2010 unique_ptr<OpSchedulerItem::OpQueueable>(
2011 new PGRecovery(
2012 p.second->get_pgid(), p.first, reserved_pushes)),
2013 cct->_conf->osd_recovery_cost,
2014 cct->_conf->osd_recovery_priority,
2015 ceph_clock_now(),
2016 0,
2017 p.first));
2018 }
2019
2020 // ====================================================================
2021 // OSD
2022
2023 #undef dout_prefix
2024 #define dout_prefix *_dout
2025
2026 // Commands shared between OSD's console and admin console:
2027 namespace ceph::osd_cmds {
2028
2029 int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f, std::ostream& os);
2030
2031 } // namespace ceph::osd_cmds
2032
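// One-time store initialization: mkfs and mount the ObjectStore, create or
// validate the OSD superblock, then persist the identifying metadata via
// write_meta(). Consumes (deletes) 'store' on every path out.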
2033 int OSD::mkfs(CephContext *cct, ObjectStore *store, uuid_d fsid, int whoami, string osdspec_affinity)
2034 {
2035 int ret;
2036
2037 OSDSuperblock sb;
2038 bufferlist sbbl;
2039 ObjectStore::CollectionHandle ch;
2040
2041 // if we are fed a uuid for this osd, use it.
2042 store->set_fsid(cct->_conf->osd_uuid);
2043
2044 ret = store->mkfs();
2045 if (ret) {
2046 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
2047 << cpp_strerror(ret) << dendl;
2048 goto free_store;
2049 }
2050
2051 store->set_cache_shards(1); // doesn't matter for mkfs!
2052
2053 ret = store->mount();
2054 if (ret) {
2055 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
2056 << cpp_strerror(ret) << dendl;
2057 goto free_store;
2058 }
2059
2060 ch = store->open_collection(coll_t::meta());
2061 if (ch) {
2062 ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
2063 if (ret < 0) {
2064 derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
2065 goto free_store;
2066 }
2067 /* if we already have superblock, check content of superblock */
2068 dout(0) << " have superblock" << dendl;
2069 auto p = sbbl.cbegin();
2070 decode(sb, p);
2071 if (whoami != sb.whoami) {
2072 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
2073 << dendl;
2074 ret = -EINVAL;
2075 goto umount_store;
2076 }
2077 if (fsid != sb.cluster_fsid) {
2078 derr << "provided cluster fsid " << fsid
2079 << " != superblock's " << sb.cluster_fsid << dendl;
2080 ret = -EINVAL;
2081 goto umount_store;
2082 }
2083 } else {
2084 // create superblock
2085 sb.cluster_fsid = fsid;
2086 sb.osd_fsid = store->get_fsid();
2087 sb.whoami = whoami;
2088 sb.compat_features = get_osd_initial_compat_set();
2089
2090 bufferlist bl;
2091 encode(sb, bl);
2092
2093 ObjectStore::CollectionHandle ch = store->create_new_collection(
2094 coll_t::meta());
2095 ObjectStore::Transaction t;
2096 t.create_collection(coll_t::meta(), 0);
2097 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
2098 ret = store->queue_transaction(ch, std::move(t));
2099 if (ret) {
2100 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
2101 << "queue_transaction returned " << cpp_strerror(ret) << dendl;
2102 goto umount_store;
2103 }
2104 }
2105
2106 ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami, osdspec_affinity);
2107 if (ret) {
2108 derr << "OSD::mkfs: failed to write fsid file: error "
2109 << cpp_strerror(ret) << dendl;
2110 goto umount_store;
2111 }
2112
2113 umount_store:
2114 if (ch) {
2115 ch.reset();
2116 }
2117 store->umount();
2118 free_store:
2119 delete store;
2120 return ret;
2121 }
2122
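// Persist the small key/value items that identify this OSD (magic, whoami,
// ceph_fsid, optional key material, osdspec affinity). "ready" is written
// only after everything else succeeds, so its presence marks a complete set.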
2123 int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami, string& osdspec_affinity)
2124 {
2125 char val[80];
2126 int r;
2127
2128 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
2129 r = store->write_meta("magic", val);
2130 if (r < 0)
2131 return r;
2132
2133 snprintf(val, sizeof(val), "%d", whoami);
2134 r = store->write_meta("whoami", val);
2135 if (r < 0)
2136 return r;
2137
2138 cluster_fsid.print(val);
2139 r = store->write_meta("ceph_fsid", val);
2140 if (r < 0)
2141 return r;
2142
2143 string key = cct->_conf.get_val<string>("key");
2144 if (key.size()) {
2145 r = store->write_meta("osd_key", key);
2146 if (r < 0)
2147 return r;
2148 } else {
2149 string keyfile = cct->_conf.get_val<string>("keyfile");
2150 if (!keyfile.empty()) {
2151 bufferlist keybl;
2152 string err;
2153 r = keybl.read_file(keyfile.c_str(), &err);
2154 if (r < 0) {
2155 derr << __func__ << " failed to read keyfile " << keyfile << ": "
2156 << err << ": " << cpp_strerror(r) << dendl;
2157 return r;
2158 }
2159 r = store->write_meta("osd_key", keybl.to_str());
2160 if (r < 0)
2161 return r;
2162 }
2163 }
2164 if (!osdspec_affinity.empty()) {
2165 r = store->write_meta("osdspec_affinity", osdspec_affinity.c_str());
2166 if (r < 0)
2167 return r;
2168 }
2169
2170 r = store->write_meta("ready", "ready");
2171 if (r < 0)
2172 return r;
2173
2174 return 0;
2175 }
2176
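// Read back the metadata written by write_meta() without mounting the
// store; a missing "fsid" key yields a zeroed osd_fsid rather than an
// error.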
2177 int OSD::peek_meta(ObjectStore *store,
2178 std::string *magic,
2179 uuid_d *cluster_fsid,
2180 uuid_d *osd_fsid,
2181 int *whoami,
2182 ceph_release_t *require_osd_release)
2183 {
2184 string val;
2185
2186 int r = store->read_meta("magic", &val);
2187 if (r < 0)
2188 return r;
2189 *magic = val;
2190
2191 r = store->read_meta("whoami", &val);
2192 if (r < 0)
2193 return r;
2194 *whoami = atoi(val.c_str());
2195
2196 r = store->read_meta("ceph_fsid", &val);
2197 if (r < 0)
2198 return r;
2199 r = cluster_fsid->parse(val.c_str());
2200 if (!r)
2201 return -EINVAL;
2202
2203 r = store->read_meta("fsid", &val);
2204 if (r < 0) {
2205 *osd_fsid = uuid_d();
2206 } else {
2207 r = osd_fsid->parse(val.c_str());
2208 if (!r)
2209 return -EINVAL;
2210 }
2211
2212 r = store->read_meta("require_osd_release", &val);
2213 if (r >= 0) {
2214 *require_osd_release = ceph_release_from_name(val);
2215 }
2216
2217 return 0;
2218 }
2219
2220
2221 #undef dout_prefix
2222 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2223
2224 // cons/des
2225
2226 OSD::OSD(CephContext *cct_, ObjectStore *store_,
2227 int id,
2228 Messenger *internal_messenger,
2229 Messenger *external_messenger,
2230 Messenger *hb_client_front,
2231 Messenger *hb_client_back,
2232 Messenger *hb_front_serverm,
2233 Messenger *hb_back_serverm,
2234 Messenger *osdc_messenger,
2235 MonClient *mc,
2236 const std::string &dev, const std::string &jdev,
2237 ceph::async::io_context_pool& poolctx) :
2238 Dispatcher(cct_),
2239 tick_timer(cct, osd_lock),
2240 tick_timer_without_osd_lock(cct, tick_timer_lock),
2241 gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
2242 cluster_messenger(internal_messenger),
2243 client_messenger(external_messenger),
2244 objecter_messenger(osdc_messenger),
2245 monc(mc),
2246 mgrc(cct_, client_messenger, &mc->monmap),
2247 logger(create_logger()),
2248 recoverystate_perf(create_recoverystate_perf()),
2249 store(store_),
2250 log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
2251 clog(log_client.create_channel()),
2252 whoami(id),
2253 dev_path(dev), journal_path(jdev),
2254 store_is_rotational(store->is_rotational()),
2255 trace_endpoint("0.0.0.0", 0, "osd"),
2256 asok_hook(NULL),
2257 m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
2258 "osd_pg_epoch_max_lag_factor")),
2259 osd_compat(get_osd_compat_set()),
2260 osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
2261 get_num_op_threads()),
2262 heartbeat_stop(false),
2263 heartbeat_need_update(true),
2264 hb_front_client_messenger(hb_client_front),
2265 hb_back_client_messenger(hb_client_back),
2266 hb_front_server_messenger(hb_front_serverm),
2267 hb_back_server_messenger(hb_back_serverm),
2268 daily_loadavg(0.0),
2269 heartbeat_thread(this),
2270 heartbeat_dispatcher(this),
2271 op_tracker(cct, cct->_conf->osd_enable_op_tracker,
2272 cct->_conf->osd_num_op_tracker_shard),
2273 test_ops_hook(NULL),
2274 op_shardedwq(
2275 this,
2276 ceph::make_timespan(cct->_conf->osd_op_thread_timeout),
2277 ceph::make_timespan(cct->_conf->osd_op_thread_suicide_timeout),
2278 &osd_op_tp),
2279 last_pg_create_epoch(0),
2280 boot_finisher(cct),
2281 up_thru_wanted(0),
2282 requested_full_first(0),
2283 requested_full_last(0),
2284 service(this, poolctx)
2285 {
2286
2287 if (!gss_ktfile_client.empty()) {
2288 // Assert we can export environment variable
2289 /*
2290 The default client keytab is used, if it is present and readable,
2291 to automatically obtain initial credentials for GSSAPI client
2292 applications. The principal name of the first entry in the client
2293 keytab is used by default when obtaining initial credentials.
2294 1. The KRB5_CLIENT_KTNAME environment variable.
2295 2. The default_client_keytab_name profile variable in [libdefaults].
2296 3. The hardcoded default, DEFCKTNAME.
2297 */
2298 const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
2299 gss_ktfile_client.c_str(), 1));
2300 ceph_assert(set_result == 0);
2301 }
2302
2303 monc->set_messenger(client_messenger);
2304 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
2305 cct->_conf->osd_op_log_threshold);
2306 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
2307 cct->_conf->osd_op_history_duration);
2308 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
2309 cct->_conf->osd_op_history_slow_op_threshold);
2310 ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
2311 #ifdef WITH_BLKIN
2312 std::stringstream ss;
2313 ss << "osd." << whoami;
2314 trace_endpoint.copy_name(ss.str());
2315 #endif
2316
2317 // initialize shards
2318 num_shards = get_num_op_shards();
2319 for (uint32_t i = 0; i < num_shards; i++) {
2320 OSDShard *one_shard = new OSDShard(
2321 i,
2322 cct,
2323 this);
2324 shards.push_back(one_shard);
2325 }
2326
2327 // override some config options if mclock is enabled on all the shards
2328 maybe_override_options_for_qos();
2329 }
2330
2331 OSD::~OSD()
2332 {
2333 while (!shards.empty()) {
2334 delete shards.back();
2335 shards.pop_back();
2336 }
2337 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2338 cct->get_perfcounters_collection()->remove(logger);
2339 delete recoverystate_perf;
2340 delete logger;
2341 delete store;
2342 }
2343
2344 double OSD::get_tick_interval() const
2345 {
2346 // vary +/- 5% to avoid scrub scheduling livelocks
2347 constexpr auto delta = 0.05;
2348 return (OSD_TICK_INTERVAL *
2349 ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
2350 }
2351
2352 void OSD::handle_signal(int signum)
2353 {
2354 ceph_assert(signum == SIGINT || signum == SIGTERM);
2355 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
2356 shutdown();
2357 }
2358
2359 int OSD::pre_init()
2360 {
2361 std::lock_guard lock(osd_lock);
2362 if (is_stopping())
2363 return 0;
2364
2365 if (store->test_mount_in_use()) {
2366 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2367 << "currently in use. (Is ceph-osd already running?)" << dendl;
2368 return -EBUSY;
2369 }
2370
2371 cct->_conf.add_observer(this);
2372 return 0;
2373 }
2374
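// Best-effort NUMA pinning: if the storage device and both network
// interfaces agree on one node (or osd_numa_node is set explicitly), pin
// all OSD threads to that node's CPUs. Any failure merely disables the
// pinning; this function always returns 0.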
2375 int OSD::set_numa_affinity()
2376 {
2377 // storage numa node
2378 int store_node = -1;
2379 store->get_numa_node(&store_node, nullptr, nullptr);
2380 if (store_node >= 0) {
2381 dout(1) << __func__ << " storage numa node " << store_node << dendl;
2382 }
2383
2384 // check network numa node(s)
2385 int front_node = -1, back_node = -1;
2386 string front_iface = pick_iface(
2387 cct,
2388 client_messenger->get_myaddrs().front().get_sockaddr_storage());
2389 string back_iface = pick_iface(
2390 cct,
2391 cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
2392 int r = get_iface_numa_node(front_iface, &front_node);
2393 if (r >= 0 && front_node >= 0) {
2394 dout(1) << __func__ << " public network " << front_iface << " numa node "
2395 << front_node << dendl;
2396 r = get_iface_numa_node(back_iface, &back_node);
2397 if (r >= 0 && back_node >= 0) {
2398 dout(1) << __func__ << " cluster network " << back_iface << " numa node "
2399 << back_node << dendl;
2400 if (front_node == back_node &&
2401 front_node == store_node) {
2402 dout(1) << " objectstore and network numa nodes all match" << dendl;
2403 if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
2404 numa_node = front_node;
2405 }
2406 } else if (front_node != back_node) {
2407 dout(1) << __func__ << " public and cluster network numa nodes do not match"
2408 << dendl;
2409 } else {
2410 dout(1) << __func__ << " objectstore and network numa nodes do not match"
2411 << dendl;
2412 }
2413 } else if (back_node == -2) {
2414 dout(1) << __func__ << " cluster network " << back_iface
2415 << " ports numa nodes do not match" << dendl;
2416 } else {
2417 derr << __func__ << " unable to identify cluster interface '" << back_iface
2418 << "' numa node: " << cpp_strerror(r) << dendl;
2419 }
2420 } else if (front_node == -2) {
2421 dout(1) << __func__ << " public network " << front_iface
2422 << " ports numa nodes do not match" << dendl;
2423 } else {
2424 derr << __func__ << " unable to identify public interface '" << front_iface
2425 << "' numa node: " << cpp_strerror(r) << dendl;
2426 }
2427 if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
2428 // this takes precedence over the automagic logic above
2429 numa_node = node;
2430 }
2431 if (numa_node >= 0) {
2432 int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
2433 if (r < 0) {
2434 dout(1) << __func__ << " unable to determine numa node " << numa_node
2435 << " CPUs" << dendl;
2436 numa_node = -1;
2437 } else {
2438 dout(1) << __func__ << " setting numa affinity to node " << numa_node
2439 << " cpus "
2440 << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
2441 << dendl;
2442 r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set);
2443 if (r < 0) {
2444 r = -errno;
2445 derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
2446 << dendl;
2447 numa_node = -1;
2448 }
2449 }
2450 } else {
2451 dout(1) << __func__ << " not setting numa affinity" << dendl;
2452 }
2453 return 0;
2454 }
2455
2456 // asok
2457
2458 class OSDSocketHook : public AdminSocketHook {
2459 OSD *osd;
2460 public:
2461 explicit OSDSocketHook(OSD *o) : osd(o) {}
2462 int call(std::string_view prefix, const cmdmap_t& cmdmap,
2463 Formatter *f,
2464 std::ostream& ss,
2465 bufferlist& out) override {
2466 ceph_abort("should use async hook");
2467 }
2468 void call_async(
2469 std::string_view prefix,
2470 const cmdmap_t& cmdmap,
2471 Formatter *f,
2472 const bufferlist& inbl,
2473 std::function<void(int,const std::string&,bufferlist&)> on_finish) override {
2474 try {
2475 osd->asok_command(prefix, cmdmap, f, inbl, on_finish);
2476 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2477 bufferlist empty;
2478 on_finish(-EINVAL, e.what(), empty);
2479 }
2480 }
2481 };
2482
2483 std::set<int64_t> OSD::get_mapped_pools()
2484 {
2485 std::set<int64_t> pools;
2486 std::vector<spg_t> pgids;
2487 _get_pgids(&pgids);
2488 for (const auto &pgid : pgids) {
2489 pools.insert(pgid.pool());
2490 }
2491 return pools;
2492 }
2493
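// Admin-socket command dispatcher. PG-scoped commands are forwarded to the
// owning primary PG; everything else is handled inline and answered through
// on_finish(ret, error string, output).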
2494 void OSD::asok_command(
2495 std::string_view prefix, const cmdmap_t& cmdmap,
2496 Formatter *f,
2497 const bufferlist& inbl,
2498 std::function<void(int,const std::string&,bufferlist&)> on_finish)
2499 {
2500 int ret = 0;
2501 stringstream ss; // stderr error message stream
2502 bufferlist outbl; // if empty at end, we'll dump formatter as output
2503
2504 // --- PG commands are routed here to PG::do_command ---
2505 if (prefix == "pg" ||
2506 prefix == "query" ||
2507 prefix == "mark_unfound_lost" ||
2508 prefix == "list_unfound" ||
2509 prefix == "scrub" ||
2510 prefix == "deep_scrub"
2511 ) {
2512 string pgidstr;
2513 pg_t pgid;
2514 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
2515 ss << "no pgid specified";
2516 ret = -EINVAL;
2517 goto out;
2518 }
2519 if (!pgid.parse(pgidstr.c_str())) {
2520 ss << "couldn't parse pgid '" << pgidstr << "'";
2521 ret = -EINVAL;
2522 goto out;
2523 }
2524 spg_t pcand;
2525 PGRef pg;
2526 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
2527 (pg = _lookup_lock_pg(pcand))) {
2528 if (pg->is_primary()) {
2529 cmdmap_t new_cmdmap = cmdmap;
2530 try {
2531 pg->do_command(prefix, new_cmdmap, inbl, on_finish);
2532 pg->unlock();
2533 return; // the pg handler calls on_finish directly
2534 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2535 pg->unlock();
2536 ss << e.what();
2537 ret = -EINVAL;
2538 goto out;
2539 }
2540 } else {
2541 ss << "not primary for pgid " << pgid;
2542 // do not reply; they will get newer maps and realize they
2543 // need to resend.
2544 pg->unlock();
2545 ret = -EAGAIN;
2546 goto out;
2547 }
2548 } else {
2549 ss << "i don't have pgid " << pgid;
2550 ret = -ENOENT;
2551 }
2552 }
2553
2554 // --- OSD commands follow ---
2555
2556 else if (prefix == "status") {
2557 lock_guard l(osd_lock);
2558 f->open_object_section("status");
2559 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2560 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2561 f->dump_unsigned("whoami", superblock.whoami);
2562 f->dump_string("state", get_state_name(get_state()));
2563 f->dump_unsigned("oldest_map", superblock.oldest_map);
2564 f->dump_unsigned("newest_map", superblock.newest_map);
2565 f->dump_unsigned("num_pgs", num_pgs);
2566 f->close_section();
2567 } else if (prefix == "flush_journal") {
2568 store->flush_journal();
2569 } else if (prefix == "dump_ops_in_flight" ||
2570 prefix == "ops" ||
2571 prefix == "dump_blocked_ops" ||
2572 prefix == "dump_historic_ops" ||
2573 prefix == "dump_historic_ops_by_duration" ||
2574 prefix == "dump_historic_slow_ops") {
2575
2576 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2577 not even those that get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2578 will start to track new ops received afterwards.";
2579
2580 set<string> filters;
2581 vector<string> filter_str;
2582 if (cmd_getval(cmdmap, "filterstr", filter_str)) {
2583 copy(filter_str.begin(), filter_str.end(),
2584 inserter(filters, filters.end()));
2585 }
2586
2587 if (prefix == "dump_ops_in_flight" ||
2588 prefix == "ops") {
2589 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2590 ss << error_str;
2591 ret = -EINVAL;
2592 goto out;
2593 }
2594 }
2595 if (prefix == "dump_blocked_ops") {
2596 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2597 ss << error_str;
2598 ret = -EINVAL;
2599 goto out;
2600 }
2601 }
2602 if (prefix == "dump_historic_ops") {
2603 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2604 ss << error_str;
2605 ret = -EINVAL;
2606 goto out;
2607 }
2608 }
2609 if (prefix == "dump_historic_ops_by_duration") {
2610 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2611 ss << error_str;
2612 ret = -EINVAL;
2613 goto out;
2614 }
2615 }
2616 if (prefix == "dump_historic_slow_ops") {
2617 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2618 ss << error_str;
2619 ret = -EINVAL;
2620 goto out;
2621 }
2622 }
2623 } else if (prefix == "dump_op_pq_state") {
2624 f->open_object_section("pq");
2625 op_shardedwq.dump(f);
2626 f->close_section();
2627 } else if (prefix == "dump_blocklist") {
2628 list<pair<entity_addr_t,utime_t> > bl;
2629 OSDMapRef curmap = service.get_osdmap();
2630
2631 f->open_array_section("blocklist");
2632 curmap->get_blocklist(&bl);
2633 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2634 it != bl.end(); ++it) {
2635 f->open_object_section("entry");
2636 f->open_object_section("entity_addr_t");
2637 it->first.dump(f);
2638 f->close_section(); //entity_addr_t
2639 it->second.localtime(f->dump_stream("expire_time"));
2640 f->close_section(); //entry
2641 }
2642 f->close_section(); //blocklist
2643 } else if (prefix == "dump_watchers") {
2644 list<obj_watch_item_t> watchers;
2645 // scan pg's
2646 vector<PGRef> pgs;
2647 _get_pgs(&pgs);
2648 for (auto& pg : pgs) {
2649 list<obj_watch_item_t> pg_watchers;
2650 pg->get_watchers(&pg_watchers);
2651 watchers.splice(watchers.end(), pg_watchers);
2652 }
2653
2654 f->open_array_section("watchers");
2655 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2656 it != watchers.end(); ++it) {
2657
2658 f->open_object_section("watch");
2659
2660 f->dump_string("namespace", it->obj.nspace);
2661 f->dump_string("object", it->obj.oid.name);
2662
2663 f->open_object_section("entity_name");
2664 it->wi.name.dump(f);
2665 f->close_section(); //entity_name_t
2666
2667 f->dump_unsigned("cookie", it->wi.cookie);
2668 f->dump_unsigned("timeout", it->wi.timeout_seconds);
2669
2670 f->open_object_section("entity_addr_t");
2671 it->wi.addr.dump(f);
2672 f->close_section(); //entity_addr_t
2673
2674 f->close_section(); //watch
2675 }
2676
2677 f->close_section(); //watchers
2678 } else if (prefix == "dump_recovery_reservations") {
2679 f->open_object_section("reservations");
2680 f->open_object_section("local_reservations");
2681 service.local_reserver.dump(f);
2682 f->close_section();
2683 f->open_object_section("remote_reservations");
2684 service.remote_reserver.dump(f);
2685 f->close_section();
2686 f->close_section();
2687 } else if (prefix == "dump_scrub_reservations") {
2688 f->open_object_section("scrub_reservations");
2689 service.dump_scrub_reservations(f);
2690 f->close_section();
2691 } else if (prefix == "get_latest_osdmap") {
2692 get_latest_osdmap();
2693 } else if (prefix == "set_heap_property") {
2694 string property;
2695 int64_t value = 0;
2696 string error;
2697 bool success = false;
2698 if (!cmd_getval(cmdmap, "property", property)) {
2699 error = "unable to get property";
2700 success = false;
2701 } else if (!cmd_getval(cmdmap, "value", value)) {
2702 error = "unable to get value";
2703 success = false;
2704 } else if (value < 0) {
2705 error = "negative value not allowed";
2706 success = false;
2707 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2708 error = "invalid property";
2709 success = false;
2710 } else {
2711 success = true;
2712 }
2713 f->open_object_section("result");
2714 f->dump_string("error", error);
2715 f->dump_bool("success", success);
2716 f->close_section();
2717 } else if (prefix == "get_heap_property") {
2718 string property;
2719 size_t value = 0;
2720 string error;
2721 bool success = false;
2722 if (!cmd_getval(cmdmap, "property", property)) {
2723 error = "unable to get property";
2724 success = false;
2725 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2726 error = "invalid property";
2727 success = false;
2728 } else {
2729 success = true;
2730 }
2731 f->open_object_section("result");
2732 f->dump_string("error", error);
2733 f->dump_bool("success", success);
2734 f->dump_int("value", value);
2735 f->close_section();
2736 } else if (prefix == "dump_objectstore_kv_stats") {
2737 store->get_db_statistics(f);
2738 } else if (prefix == "dump_scrubs") {
2739 service.dumps_scrub(f);
2740 } else if (prefix == "calc_objectstore_db_histogram") {
2741 store->generate_db_histogram(f);
2742 } else if (prefix == "flush_store_cache") {
2743 store->flush_cache(&ss);
2744 } else if (prefix == "dump_pgstate_history") {
2745 f->open_object_section("pgstate_history");
2746 f->open_array_section("pgs");
2747 vector<PGRef> pgs;
2748 _get_pgs(&pgs);
2749 for (auto& pg : pgs) {
2750 f->open_object_section("pg");
2751 f->dump_stream("pg") << pg->pg_id;
2752 f->dump_string("currently", pg->get_current_state());
2753 pg->dump_pgstate_history(f);
2754 f->close_section();
2755 }
2756 f->close_section();
2757 f->close_section();
2758 } else if (prefix == "compact") {
2759 dout(1) << "triggering manual compaction" << dendl;
2760 auto start = ceph::coarse_mono_clock::now();
2761 store->compact();
2762 auto end = ceph::coarse_mono_clock::now();
2763 double duration = std::chrono::duration<double>(end-start).count();
2764 dout(1) << "finished manual compaction in "
2765 << duration
2766 << " seconds" << dendl;
2767 f->open_object_section("compact_result");
2768 f->dump_float("elapsed_time", duration);
2769 f->close_section();
2770 } else if (prefix == "get_mapped_pools") {
2771 f->open_array_section("mapped_pools");
2772 set<int64_t> poollist = get_mapped_pools();
2773 for (auto pool : poollist) {
2774 f->dump_int("pool_id", pool);
2775 }
2776 f->close_section();
2777 } else if (prefix == "smart") {
2778 string devid;
2779 cmd_getval(cmdmap, "devid", devid);
2780 ostringstream out;
2781 probe_smart(devid, out);
2782 outbl.append(out.str());
2783 } else if (prefix == "list_devices") {
2784 set<string> devnames;
2785 store->get_devices(&devnames);
2786 f->open_array_section("list_devices");
2787 for (auto dev : devnames) {
2788 if (dev.find("dm-") == 0) {
2789 continue;
2790 }
2791 string err;
2792 f->open_object_section("device");
2793 f->dump_string("device", "/dev/" + dev);
2794 f->dump_string("device_id", get_device_id(dev, &err));
2795 f->close_section();
2796 }
2797 f->close_section();
2798 } else if (prefix == "send_beacon") {
2799 lock_guard l(osd_lock);
2800 if (is_active()) {
2801 send_beacon(ceph::coarse_mono_clock::now());
2802 }
2803 }
2804
2805 else if (prefix == "cluster_log") {
2806 vector<string> msg;
2807 cmd_getval(cmdmap, "message", msg);
2808 if (msg.empty()) {
2809 ret = -EINVAL;
2810 ss << "ignoring empty log message";
2811 goto out;
2812 }
2813 string message = msg.front();
2814 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
2815 message += " " + *a;
2816 string lvl;
2817 cmd_getval(cmdmap, "level", lvl);
2818 clog_type level = string_to_clog_type(lvl);
2819 if (level < 0) {
2820 ret = -EINVAL;
2821 ss << "unknown level '" << lvl << "'";
2822 goto out;
2823 }
2824 clog->do_log(level, message);
2825 }
2826
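// 'bench' writes 'count' bytes in 'bsize' blocks through the meta
// collection to estimate raw ObjectStore throughput (typically invoked as
// "ceph tell osd.<id> bench"). The checks below cap 'count' so that a slow
// device cannot keep the OSD busy long enough to trip its internal
// timeouts.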
2827 else if (prefix == "bench") {
2828 int64_t count;
2829 int64_t bsize;
2830 int64_t osize, onum;
2831 // default count 1G, size 4MB
2832 cmd_getval(cmdmap, "count", count, (int64_t)1 << 30);
2833 cmd_getval(cmdmap, "size", bsize, (int64_t)4 << 20);
2834 cmd_getval(cmdmap, "object_size", osize, (int64_t)0);
2835 cmd_getval(cmdmap, "object_num", onum, (int64_t)0);
2836
2837 uint32_t duration = cct->_conf->osd_bench_duration;
2838
2839 if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
2840 // let us limit the block size because the next checks rely on it
2841 // having a sane value. If we allow any block size to be set things
2842 // can still go sideways.
2843 ss << "block 'size' values are capped at "
2844 << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
2845 << " a higher value, please adjust 'osd_bench_max_block_size'";
2846 ret = -EINVAL;
2847 goto out;
2848 } else if (bsize < (int64_t) (1 << 20)) {
2849 // entering the realm of small block sizes.
2850 // limit the count to a sane value, assuming a configurable amount of
2851 // IOPS and duration, so that the OSD doesn't get hung up on this,
2852 // preventing timeouts from going off
2853 int64_t max_count =
2854 bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
2855 if (count > max_count) {
2856 ss << "'count' values greater than " << max_count
2857 << " for a block size of " << byte_u_t(bsize) << ", assuming "
2858 << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
2859 << " for " << duration << " seconds,"
2860 << " can cause ill effects on osd. "
2861 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
2862 << " value if you wish to use a higher 'count'.";
2863 ret = -EINVAL;
2864 goto out;
2865 }
2866 } else {
2867 // 1MB block sizes are big enough so that we get more stuff done.
2868 // However, to keep the osd from getting hung up on this and having
2869 // timers triggered, we are going to limit the count assuming
2870 // a configurable throughput and duration.
2871 // NOTE: max_count is the total amount of bytes that we believe we
2872 // will be able to write during 'duration' for the given
2873 // throughput. The block size hardly impacts this unless it's
2874 // way too big. Given we already check how big the block size
2875 // is, it's safe to assume everything will check out.
2876 int64_t max_count =
2877 cct->_conf->osd_bench_large_size_max_throughput * duration;
2878 if (count > max_count) {
2879 ss << "'count' values greater than " << max_count
2880 << " for a block size of " << byte_u_t(bsize) << ", assuming "
2881 << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
2882 << " for " << duration << " seconds,"
2883 << " can cause ill effects on osd. "
2884 << " Please adjust 'osd_bench_large_size_max_throughput'"
2885 << " with a higher value if you wish to use a higher 'count'.";
2886 ret = -EINVAL;
2887 goto out;
2888 }
2889 }
2890
2891 if (osize && bsize > osize)
2892 bsize = osize;
2893
2894 dout(1) << " bench count " << count
2895 << " bsize " << byte_u_t(bsize) << dendl;
2896
2897 ObjectStore::Transaction cleanupt;
2898
2899 if (osize && onum) {
2900 bufferlist bl;
2901 bufferptr bp(osize);
2902 bp.zero();
2903 bl.push_back(std::move(bp));
2904 bl.rebuild_page_aligned();
2905 for (int i=0; i<onum; ++i) {
2906 char nm[30];
2907 snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
2908 object_t oid(nm);
2909 hobject_t soid(sobject_t(oid, 0));
2910 ObjectStore::Transaction t;
2911 t.write(coll_t::meta(), ghobject_t(soid), 0, osize, bl);
2912 store->queue_transaction(service.meta_ch, std::move(t), NULL);
2913 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
2914 }
2915 }
2916
2917 bufferlist bl;
2918 bufferptr bp(bsize);
2919 bp.zero();
2920 bl.push_back(std::move(bp));
2921 bl.rebuild_page_aligned();
2922
2923 {
2924 C_SaferCond waiter;
2925 if (!service.meta_ch->flush_commit(&waiter)) {
2926 waiter.wait();
2927 }
2928 }
2929
2930 utime_t start = ceph_clock_now();
2931 for (int64_t pos = 0; pos < count; pos += bsize) {
2932 char nm[30];
2933 unsigned offset = 0;
2934 if (onum && osize) {
2935 snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
2936 offset = rand() % (osize / bsize) * bsize;
2937 } else {
2938 snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
2939 }
2940 object_t oid(nm);
2941 hobject_t soid(sobject_t(oid, 0));
2942 ObjectStore::Transaction t;
2943 t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
2944 store->queue_transaction(service.meta_ch, std::move(t), NULL);
2945 if (!onum || !osize)
2946 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
2947 }
2948
2949 {
2950 C_SaferCond waiter;
2951 if (!service.meta_ch->flush_commit(&waiter)) {
2952 waiter.wait();
2953 }
2954 }
2955 utime_t end = ceph_clock_now();
2956
2957 // clean up
2958 store->queue_transaction(service.meta_ch, std::move(cleanupt), NULL);
2959 {
2960 C_SaferCond waiter;
2961 if (!service.meta_ch->flush_commit(&waiter)) {
2962 waiter.wait();
2963 }
2964 }
2965
2966 double elapsed = end - start;
2967 double rate = count / elapsed;
2968 double iops = rate / bsize;
2969 f->open_object_section("osd_bench_results");
2970 f->dump_int("bytes_written", count);
2971 f->dump_int("blocksize", bsize);
2972 f->dump_float("elapsed_sec", elapsed);
2973 f->dump_float("bytes_per_sec", rate);
2974 f->dump_float("iops", iops);
2975 f->close_section();
2976 }
2977
2978 else if (prefix == "flush_pg_stats") {
2979 mgrc.send_pgstats();
2980 f->dump_unsigned("stat_seq", service.get_osd_stat_seq());
2981 }
2982
2983 else if (prefix == "heap") {
2984 ret = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2985 }
2986
2987 else if (prefix == "debug dump_missing") {
2988 f->open_array_section("pgs");
2989 vector<PGRef> pgs;
2990 _get_pgs(&pgs);
2991 for (auto& pg : pgs) {
2992 string s = stringify(pg->pg_id);
2993 f->open_array_section(s.c_str());
2994 pg->lock();
2995 pg->dump_missing(f);
2996 pg->unlock();
2997 f->close_section();
2998 }
2999 f->close_section();
3000 }
3001
3002 else if (prefix == "debug kick_recovery_wq") {
3003 int64_t delay;
3004 cmd_getval(cmdmap, "delay", delay);
3005 ostringstream oss;
3006 oss << delay;
3007 ret = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
3008 if (ret != 0) {
3009 ss << "kick_recovery_wq: error setting "
3010 << "osd_recovery_delay_start to '" << delay << "': error "
3011 << ret;
3012 goto out;
3013 }
3014 cct->_conf.apply_changes(nullptr);
3015 ss << "kicking recovery queue. set osd_recovery_delay_start "
3016 << "to " << cct->_conf->osd_recovery_delay_start;
3017 }
3018
3019 else if (prefix == "cpu_profiler") {
3020 ostringstream ds;
3021 string arg;
3022 cmd_getval(cmdmap, "arg", arg);
3023 vector<string> argvec;
3024 get_str_vec(arg, argvec);
3025 cpu_profiler_handle_command(argvec, ds);
3026 outbl.append(ds.str());
3027 }
3028
3029 else if (prefix == "dump_pg_recovery_stats") {
3030 lock_guard l(osd_lock);
3031 pg_recovery_stats.dump_formatted(f);
3032 }
3033
3034 else if (prefix == "reset_pg_recovery_stats") {
3035 lock_guard l(osd_lock);
3036 pg_recovery_stats.reset();
3037 }
3038
3039 else if (prefix == "perf histogram dump") {
3040 std::string logger;
3041 std::string counter;
3042 cmd_getval(cmdmap, "logger", logger);
3043 cmd_getval(cmdmap, "counter", counter);
3044 cct->get_perfcounters_collection()->dump_formatted_histograms(
3045 f, false, logger, counter);
3046 }
3047
3048 else if (prefix == "cache drop") {
3049 lock_guard l(osd_lock);
3050 dout(20) << "clearing all caches" << dendl;
3051 // Clear the objectstore's cache - onode and buffer for Bluestore,
3052 // system's pagecache for Filestore
3053 ret = store->flush_cache(&ss);
3054 if (ret < 0) {
3055 ss << "Error flushing objectstore cache: " << cpp_strerror(ret);
3056 goto out;
3057 }
3058 // Clear the objectcontext cache (per PG)
3059 vector<PGRef> pgs;
3060 _get_pgs(&pgs);
3061 for (auto& pg: pgs) {
3062 pg->clear_cache();
3063 }
3064 }
3065
3066 else if (prefix == "cache status") {
3067 lock_guard l(osd_lock);
3068 int obj_ctx_count = 0;
3069 vector<PGRef> pgs;
3070 _get_pgs(&pgs);
3071 for (auto& pg: pgs) {
3072 obj_ctx_count += pg->get_cache_obj_count();
3073 }
3074 f->open_object_section("cache_status");
3075 f->dump_int("object_ctx", obj_ctx_count);
3076 store->dump_cache_stats(f);
3077 f->close_section();
3078 }
3079
3080 else if (prefix == "scrub_purged_snaps") {
3081 lock_guard l(osd_lock);
3082 scrub_purged_snaps();
3083 }
3084
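// Dump heartbeat ping times at or above a threshold. The threshold comes
// from the command argument (milliseconds) or, by default, is derived from
// mon_warn_on_slow_ping_time (falling back to osd_heartbeat_grace scaled by
// mon_warn_on_slow_ping_ratio); entries are emitted worst-first for both
// the back and front interfaces.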
3085 else if (prefix == "dump_osd_network") {
3086 lock_guard l(osd_lock);
3087 int64_t value = 0;
3088 if (!(cmd_getval(cmdmap, "value", value))) {
3089 // Convert milliseconds to microseconds
3090 value = static_cast<double>(g_conf().get_val<double>(
3091 "mon_warn_on_slow_ping_time")) * 1000;
3092 if (value == 0) {
3093 double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
3094 value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
3095 value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
3096 }
3097 } else {
3098 // Convert user input to microseconds
3099 value *= 1000;
3100 }
3101 if (value < 0) value = 0;
3102
3103 struct osd_ping_time_t {
3104 uint32_t pingtime;
3105 int to;
3106 bool back;
3107 std::array<uint32_t,3> times;
3108 std::array<uint32_t,3> min;
3109 std::array<uint32_t,3> max;
3110 uint32_t last;
3111 uint32_t last_update;
3112
3113 bool operator<(const osd_ping_time_t& rhs) const {
3114 if (pingtime < rhs.pingtime)
3115 return true;
3116 if (pingtime > rhs.pingtime)
3117 return false;
3118 if (to < rhs.to)
3119 return true;
3120 if (to > rhs.to)
3121 return false;
3122 return back;
3123 }
3124 };
3125
3126 set<osd_ping_time_t> sorted;
3127 // Get pingtimes under lock and not on the stack
3128 map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
3129 service.get_hb_pingtime(pingtimes);
3130 for (auto j : *pingtimes) {
3131 if (j.second.last_update == 0)
3132 continue;
3133 osd_ping_time_t item;
3134 item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
3135 item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
3136 if (item.pingtime >= value) {
3137 item.to = j.first;
3138 item.times[0] = j.second.back_pingtime[0];
3139 item.times[1] = j.second.back_pingtime[1];
3140 item.times[2] = j.second.back_pingtime[2];
3141 item.min[0] = j.second.back_min[0];
3142 item.min[1] = j.second.back_min[1];
3143 item.min[2] = j.second.back_min[2];
3144 item.max[0] = j.second.back_max[0];
3145 item.max[1] = j.second.back_max[1];
3146 item.max[2] = j.second.back_max[2];
3147 item.last = j.second.back_last;
3148 item.back = true;
3149 item.last_update = j.second.last_update;
3150 sorted.emplace(item);
3151 }
3152 if (j.second.front_last == 0)
3153 continue;
3154 item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
3155 item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
3156 if (item.pingtime >= value) {
3157 item.to = j.first;
3158 item.times[0] = j.second.front_pingtime[0];
3159 item.times[1] = j.second.front_pingtime[1];
3160 item.times[2] = j.second.front_pingtime[2];
3161 item.min[0] = j.second.front_min[0];
3162 item.min[1] = j.second.front_min[1];
3163 item.min[2] = j.second.front_min[2];
3164 item.max[0] = j.second.front_max[0];
3165 item.max[1] = j.second.front_max[1];
3166 item.max[2] = j.second.front_max[2];
3167 item.last = j.second.front_last;
3168 item.last_update = j.second.last_update;
3169 item.back = false;
3170 sorted.emplace(item);
3171 }
3172 }
3173 delete pingtimes;
3174 //
3175 // Network ping times (1min 5min 15min)
3176 f->open_object_section("network_ping_times");
3177 f->dump_int("threshold", value / 1000);
3178 f->open_array_section("entries");
3179 for (auto &sitem : boost::adaptors::reverse(sorted)) {
3180 ceph_assert(sitem.pingtime >= value);
3181 f->open_object_section("entry");
3182
3183 const time_t lu(sitem.last_update);
3184 char buffer[26];
3185 string lustr(ctime_r(&lu, buffer));
3186 lustr.pop_back(); // Remove trailing \n
3187 auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3188 f->dump_string("last update", lustr);
3189 f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale);
3190 f->dump_int("from osd", whoami);
3191 f->dump_int("to osd", sitem.to);
3192 f->dump_string("interface", (sitem.back ? "back" : "front"));
3193 f->open_object_section("average");
3194 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
3195 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
3196 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
3197 f->close_section(); // average
3198 f->open_object_section("min");
3199 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3200 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3201 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3202 f->close_section(); // min
3203 f->open_object_section("max");
3204 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3205 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3206 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3207 f->close_section(); // max
3208 f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
3209 f->close_section(); // entry
3210 }
3211 f->close_section(); // entries
3212 f->close_section(); // network_ping_times
3213 } else {
3214 ceph_abort_msg("broken asok registration");
3215 }
3216
3217 out:
3218 on_finish(ret, ss.str(), outbl);
3219 }
3220
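// Synchronous admin-socket hook for debug/test operations; it forwards the
// command to test_ops() and maps bad_cmd_get to -EINVAL.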
3221 class TestOpsSocketHook : public AdminSocketHook {
3222 OSDService *service;
3223 ObjectStore *store;
3224 public:
3225 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
3226 int call(std::string_view command, const cmdmap_t& cmdmap,
3227 Formatter *f,
3228 std::ostream& errss,
3229 bufferlist& out) override {
3230 int r = 0;
3231 stringstream outss;
3232 try {
3233 test_ops(service, store, command, cmdmap, outss);
3234 out.append(outss);
3235 } catch (const TOPNSPC::common::bad_cmd_get& e) {
3236 errss << e.what();
3237 r = -EINVAL;
3238 }
3239 return r;
3240 }
3241 void test_ops(OSDService *service, ObjectStore *store,
3242 std::string_view command, const cmdmap_t& cmdmap, ostream &ss);
3243
3244 };
3245
3246 class OSD::C_Tick : public Context {
3247 OSD *osd;
3248 public:
3249 explicit C_Tick(OSD *o) : osd(o) {}
3250 void finish(int r) override {
3251 osd->tick();
3252 }
3253 };
3254
3255 class OSD::C_Tick_WithoutOSDLock : public Context {
3256 OSD *osd;
3257 public:
3258 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
3259 void finish(int r) override {
3260 osd->tick_without_osd_lock();
3261 }
3262 };
3263
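// Mount or unmount a FUSE view of the ObjectStore under <osd_data>/fuse,
// driven by the osd_objectstore_fuse option ('stop' forces teardown). The
// body is a no-op when built without libfuse.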
3264 int OSD::enable_disable_fuse(bool stop)
3265 {
3266 #ifdef HAVE_LIBFUSE
3267 int r;
3268 string mntpath = cct->_conf->osd_data + "/fuse";
3269 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
3270 dout(1) << __func__ << " disabling" << dendl;
3271 fuse_store->stop();
3272 delete fuse_store;
3273 fuse_store = NULL;
3274 r = ::rmdir(mntpath.c_str());
3275 if (r < 0) {
3276 r = -errno;
3277 derr << __func__ << " failed to rmdir " << mntpath << ": "
3278 << cpp_strerror(r) << dendl;
3279 return r;
3280 }
3281 return 0;
3282 }
3283 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
3284 dout(1) << __func__ << " enabling" << dendl;
3285 r = ::mkdir(mntpath.c_str(), 0700);
3286 if (r < 0)
3287 r = -errno;
3288 if (r < 0 && r != -EEXIST) {
3289 derr << __func__ << " unable to create " << mntpath << ": "
3290 << cpp_strerror(r) << dendl;
3291 return r;
3292 }
3293 fuse_store = new FuseStore(store, mntpath);
3294 r = fuse_store->start();
3295 if (r < 0) {
3296 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
3297 delete fuse_store;
3298 fuse_store = NULL;
3299 return r;
3300 }
3301 }
3302 #endif // HAVE_LIBFUSE
3303 return 0;
3304 }
3305
3306 size_t OSD::get_num_cache_shards()
3307 {
3308 return cct->_conf.get_val<Option::size_t>("osd_num_cache_shards");
3309 }
3310
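// Shard and thread sizing: an explicit osd_op_num_* setting wins;
// otherwise the hdd or ssd variant is chosen by whether the store is
// rotational.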
3311 int OSD::get_num_op_shards()
3312 {
3313 if (cct->_conf->osd_op_num_shards)
3314 return cct->_conf->osd_op_num_shards;
3315 if (store_is_rotational)
3316 return cct->_conf->osd_op_num_shards_hdd;
3317 else
3318 return cct->_conf->osd_op_num_shards_ssd;
3319 }
3320
3321 int OSD::get_num_op_threads()
3322 {
3323 if (cct->_conf->osd_op_num_threads_per_shard)
3324 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
3325 if (store_is_rotational)
3326 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
3327 else
3328 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
3329 }
3330
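// The *_sleep throttle getters below choose between the ssd, hybrid, and
// hdd tunables based on whether the main store and the journal are
// rotational.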
3331 float OSD::get_osd_recovery_sleep()
3332 {
3333 if (cct->_conf->osd_recovery_sleep)
3334 return cct->_conf->osd_recovery_sleep;
3335 if (!store_is_rotational && !journal_is_rotational)
3336 return cct->_conf->osd_recovery_sleep_ssd;
3337 else if (store_is_rotational && !journal_is_rotational)
3338 return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
3339 else
3340 return cct->_conf->osd_recovery_sleep_hdd;
3341 }
3342
3343 float OSD::get_osd_delete_sleep()
3344 {
3345 float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
3346 if (osd_delete_sleep > 0)
3347 return osd_delete_sleep;
3348 if (!store_is_rotational && !journal_is_rotational)
3349 return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
3350 if (store_is_rotational && !journal_is_rotational)
3351 return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
3352 return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
3353 }
3354
3355 int OSD::get_recovery_max_active()
3356 {
3357 if (cct->_conf->osd_recovery_max_active)
3358 return cct->_conf->osd_recovery_max_active;
3359 if (store_is_rotational)
3360 return cct->_conf->osd_recovery_max_active_hdd;
3361 else
3362 return cct->_conf->osd_recovery_max_active_ssd;
3363 }
3364
3365 float OSD::get_osd_snap_trim_sleep()
3366 {
3367 float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep");
3368 if (osd_snap_trim_sleep > 0)
3369 return osd_snap_trim_sleep;
3370 if (!store_is_rotational && !journal_is_rotational)
3371 return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd");
3372 if (store_is_rotational && !journal_is_rotational)
3373 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid");
3374 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd");
3375 }
3376
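// Bring the OSD up to the point of booting: mount the store, validate the
// superblock and compat features, load PGs and the current OSDMap, and
// initialize the messengers, mon/mgr clients, and service state.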
3377 int OSD::init()
3378 {
3379 OSDMapRef osdmap;
3380 CompatSet initial, diff;
3381 std::lock_guard lock(osd_lock);
3382 if (is_stopping())
3383 return 0;
3384
3385 tick_timer.init();
3386 tick_timer_without_osd_lock.init();
3387 service.recovery_request_timer.init();
3388 service.sleep_timer.init();
3389
3390 boot_finisher.start();
3391
3392 {
3393 string val;
3394 store->read_meta("require_osd_release", &val);
3395 last_require_osd_release = ceph_release_from_name(val);
3396 }
3397
3398 // mount.
3399 dout(2) << "init " << dev_path
3400 << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
3401 << dendl;
3402 dout(2) << "journal " << journal_path << dendl;
3403 ceph_assert(store); // call pre_init() first!
3404
3405 store->set_cache_shards(get_num_cache_shards());
3406
3407 int r = store->mount();
3408 if (r < 0) {
3409 derr << "OSD:init: unable to mount object store" << dendl;
3410 return r;
3411 }
3412 journal_is_rotational = store->is_journal_rotational();
3413 dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
3414 << dendl;
3415
3416 enable_disable_fuse(false);
3417
3418 dout(2) << "boot" << dendl;
3419
3420 service.meta_ch = store->open_collection(coll_t::meta());
3421
3422 // initialize the daily loadavg with current 15min loadavg
3423 double loadavgs[3];
3424 if (getloadavg(loadavgs, 3) == 3) {
3425 daily_loadavg = loadavgs[2];
3426 } else {
3427 derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
3428 daily_loadavg = 1.0;
3429 }
3430
3431 int rotating_auth_attempts = 0;
3432 auto rotating_auth_timeout =
3433 g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");
3434
3435 // sanity check long object name handling
3436 {
3437 hobject_t l;
3438 l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
3439 l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
3440 l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
3441 r = store->validate_hobject_key(l);
3442 if (r < 0) {
3443 derr << "backend (" << store->get_type() << ") is unable to support max "
3444 << "object name[space] len" << dendl;
3445 derr << " osd max object name len = "
3446 << cct->_conf->osd_max_object_name_len << dendl;
3447 derr << " osd max object namespace len = "
3448 << cct->_conf->osd_max_object_namespace_len << dendl;
3449 derr << cpp_strerror(r) << dendl;
3450 if (cct->_conf->osd_check_max_object_name_len_on_startup) {
3451 goto out;
3452 }
3453 derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
3454 << dendl;
3455 } else {
3456 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
3457 }
3458 }
3459
3460 // read superblock
3461 r = read_superblock();
3462 if (r < 0) {
3463 derr << "OSD::init() : unable to read osd superblock" << dendl;
3464 r = -EINVAL;
3465 goto out;
3466 }
3467
3468 if (osd_compat.compare(superblock.compat_features) < 0) {
3469 derr << "The disk uses features unsupported by the executable." << dendl;
3470 derr << " ondisk features " << superblock.compat_features << dendl;
3471 derr << " daemon features " << osd_compat << dendl;
3472
3473 if (osd_compat.writeable(superblock.compat_features)) {
3474 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
3475 derr << "it is still writeable, though. Missing features: " << diff << dendl;
3476 r = -EOPNOTSUPP;
3477 goto out;
3478 }
3479 else {
3480 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
3481 derr << "Cannot write to disk! Missing features: " << diff << dendl;
3482 r = -EOPNOTSUPP;
3483 goto out;
3484 }
3485 }
3486
3487 assert_warn(whoami == superblock.whoami);
3488 if (whoami != superblock.whoami) {
3489 derr << "OSD::init: superblock says osd"
3490 << superblock.whoami << " but I am osd." << whoami << dendl;
3491 r = -EINVAL;
3492 goto out;
3493 }
3494
3495 startup_time = ceph::mono_clock::now();
3496
3497 // load up "current" osdmap
3498 assert_warn(!get_osdmap());
3499 if (get_osdmap()) {
3500 derr << "OSD::init: unable to read current osdmap" << dendl;
3501 r = -EINVAL;
3502 goto out;
3503 }
3504 osdmap = get_map(superblock.current_epoch);
3505 set_osdmap(osdmap);
3506
3507 // make sure we don't have legacy pgs deleting
3508 {
3509 vector<coll_t> ls;
3510 int r = store->list_collections(ls);
3511 ceph_assert(r >= 0);
3512 for (auto c : ls) {
3513 spg_t pgid;
3514 if (c.is_pg(&pgid) &&
3515 !osdmap->have_pg_pool(pgid.pool())) {
3516 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
3517 if (!store->exists(service.meta_ch, oid)) {
3518 derr << __func__ << " missing pg_pool_t for deleted pool "
3519 << pgid.pool() << " for pg " << pgid
3520 << "; please downgrade to luminous and allow "
3521 << "pg deletion to complete before upgrading" << dendl;
3522 ceph_abort();
3523 }
3524 }
3525 }
3526 }
3527
3528 initial = get_osd_initial_compat_set();
3529 diff = superblock.compat_features.unsupported(initial);
3530 if (superblock.compat_features.merge(initial)) {
3531 // Are we adding SNAPMAPPER2?
3532 if (diff.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2)) {
3533 dout(1) << __func__ << " upgrade snap_mapper (first start as octopus)"
3534 << dendl;
3535 auto ch = service.meta_ch;
3536 auto hoid = make_snapmapper_oid();
3537 unsigned max = cct->_conf->osd_target_transaction_size;
3538 r = SnapMapper::convert_legacy(cct, store, ch, hoid, max);
3539 if (r < 0)
3540 goto out;
3541 }
3542 // We need to persist the new compat_set before we
3543 // do anything else
3544 dout(5) << "Upgrading superblock adding: " << diff << dendl;
3545 ObjectStore::Transaction t;
3546 write_superblock(t);
3547 r = store->queue_transaction(service.meta_ch, std::move(t));
3548 if (r < 0)
3549 goto out;
3550 }
3551
3552 // make sure snap mapper object exists
3553 if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
3554 dout(10) << "init creating/touching snapmapper object" << dendl;
3555 ObjectStore::Transaction t;
3556 t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
3557 r = store->queue_transaction(service.meta_ch, std::move(t));
3558 if (r < 0)
3559 goto out;
3560 }
3561 if (!store->exists(service.meta_ch, OSD::make_purged_snaps_oid())) {
3562 dout(10) << "init creating/touching purged_snaps object" << dendl;
3563 ObjectStore::Transaction t;
3564 t.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
3565 r = store->queue_transaction(service.meta_ch, std::move(t));
3566 if (r < 0)
3567 goto out;
3568 }
3569
3570 if (cct->_conf->osd_open_classes_on_start) {
3571 int r = ClassHandler::get_instance().open_all_classes();
3572 if (r)
3573 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
3574 }
3575
3576 check_osdmap_features();
3577
3578 {
3579 epoch_t bind_epoch = osdmap->get_epoch();
3580 service.set_epochs(NULL, NULL, &bind_epoch);
3581 }
3582
3583 clear_temp_objects();
3584
3585 // initialize osdmap references in sharded wq
3586 for (auto& shard : shards) {
3587 std::lock_guard l(shard->osdmap_lock);
3588 shard->shard_osdmap = osdmap;
3589 }
3590
3591 // load up pgs (as they previously existed)
3592 load_pgs();
3593
3594 dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
3595
3596 if (cct->_conf.get_val<bool>("osd_compact_on_start")) {
3597 dout(2) << "compacting object store's omap" << dendl;
3598 store->compact();
3599 }
3600
3601 // prime osd stats
3602 {
3603 struct store_statfs_t stbuf;
3604 osd_alert_list_t alerts;
3605 int r = store->statfs(&stbuf, &alerts);
3606 ceph_assert(r == 0);
3607 service.set_statfs(stbuf, alerts);
3608 }
3609
3610 // client_messenger's auth_client will be set up by monc->init() later.
3611 for (auto m : { cluster_messenger,
3612 objecter_messenger,
3613 hb_front_client_messenger,
3614 hb_back_client_messenger,
3615 hb_front_server_messenger,
3616 hb_back_server_messenger } ) {
3617 m->set_auth_client(monc);
3618 }
3619 for (auto m : { client_messenger,
3620 cluster_messenger,
3621 hb_front_server_messenger,
3622 hb_back_server_messenger }) {
3623 m->set_auth_server(monc);
3624 }
3625 monc->set_handle_authentication_dispatcher(this);
3626
3627 monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
3628 | CEPH_ENTITY_TYPE_MGR);
3629 r = monc->init();
3630 if (r < 0)
3631 goto out;
3632
3633 mgrc.set_pgstats_cb([this]() { return collect_pg_stats(); });
3634 mgrc.set_perf_metric_query_cb(
3635 [this](const ConfigPayload &config_payload) {
3636 set_perf_queries(config_payload);
3637 },
3638 [this] {
3639 return get_perf_reports();
3640 });
3641 mgrc.init();
3642
3643 // tell monc about log_client so it will know about mon session resets
3644 monc->set_log_client(&log_client);
3645 update_log_config();
3646
3647 // i'm ready!
3648 client_messenger->add_dispatcher_tail(&mgrc);
3649 client_messenger->add_dispatcher_tail(this);
3650 cluster_messenger->add_dispatcher_head(this);
3651
3652 hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3653 hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3654 hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3655 hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3656
3657 objecter_messenger->add_dispatcher_head(service.objecter.get());
3658
3659 service.init();
3660 service.publish_map(osdmap);
3661 service.publish_superblock(superblock);
3662 service.max_oldest_map = superblock.oldest_map;
3663
3664 for (auto& shard : shards) {
3665 // put PGs in a temporary set because we may modify pg_slots
3666 // unordered_map below.
3667 set<PGRef> pgs;
3668 for (auto& i : shard->pg_slots) {
3669 PGRef pg = i.second->pg;
3670 if (!pg) {
3671 continue;
3672 }
3673 pgs.insert(pg);
3674 }
3675 for (auto pg : pgs) {
3676 std::scoped_lock l{*pg};
3677 set<pair<spg_t,epoch_t>> new_children;
3678 set<pair<spg_t,epoch_t>> merge_pgs;
3679 service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
3680 &new_children, &merge_pgs);
3681 if (!new_children.empty()) {
3682 for (auto shard : shards) {
3683 shard->prime_splits(osdmap, &new_children);
3684 }
3685 assert(new_children.empty());
3686 }
3687 if (!merge_pgs.empty()) {
3688 for (auto shard : shards) {
3689 shard->prime_merges(osdmap, &merge_pgs);
3690 }
3691 assert(merge_pgs.empty());
3692 }
3693 }
3694 }
3695
3696 osd_op_tp.start();
3697
3698 // start the heartbeat
3699 heartbeat_thread.create("osd_srv_heartbt");
3700
3701 // tick
3702 tick_timer.add_event_after(get_tick_interval(),
3703 new C_Tick(this));
3704 {
3705 std::lock_guard l(tick_timer_lock);
3706 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
3707 new C_Tick_WithoutOSDLock(this));
3708 }
3709
3710 osd_lock.unlock();
3711
3712 r = monc->authenticate();
3713 if (r < 0) {
3714 derr << __func__ << " authentication failed: " << cpp_strerror(r)
3715 << dendl;
3716 exit(1);
3717 }
3718
3719 while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
3720 derr << "unable to obtain rotating service keys; retrying" << dendl;
3721 ++rotating_auth_attempts;
3722 if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
3723 derr << __func__ << " wait_auth_rotating timed out" << dendl;
3724 exit(1);
3725 }
3726 }
3727
3728 r = update_crush_device_class();
3729 if (r < 0) {
3730 derr << __func__ << " unable to update_crush_device_class: "
3731 << cpp_strerror(r) << dendl;
3732 exit(1);
3733 }
3734
3735 r = update_crush_location();
3736 if (r < 0) {
3737 derr << __func__ << " unable to update_crush_location: "
3738 << cpp_strerror(r) << dendl;
3739 exit(1);
3740 }
3741
3742 osd_lock.lock();
3743 if (is_stopping())
3744 return 0;
3745
3746 // start objecter *after* we have authenticated, so that we don't ignore
3747 // the OSDMaps it requests.
3748 service.final_init();
3749
3750 check_config();
3751
3752 dout(10) << "ensuring pgs have consumed prior maps" << dendl;
3753 consume_map();
3754
3755 dout(0) << "done with init, starting boot process" << dendl;
3756
3757 // subscribe to any pg creations
3758 monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
3759
3760 // MgrClient needs this (it doesn't have MonClient reference itself)
3761 monc->sub_want("mgrmap", 0, 0);
3762
3763 // we don't need to ask for an osdmap here; objecter will request one
3764 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
3765
3766 monc->renew_subs();
3767
3768 start_boot();
3769
3770 return 0;
3771
3772 out:
3773 enable_disable_fuse(true);
3774 store->umount();
3775 delete store;
3776 store = NULL;
3777 return r;
3778 }
3779
3780 void OSD::final_init()
3781 {
3782 AdminSocket *admin_socket = cct->get_admin_socket();
3783 asok_hook = new OSDSocketHook(this);
3784 int r = admin_socket->register_command("status", asok_hook,
3785 "high-level status of OSD");
3786 ceph_assert(r == 0);
3787 r = admin_socket->register_command("flush_journal",
3788 asok_hook,
3789 "flush the journal to permanent store");
3790 ceph_assert(r == 0);
3791 r = admin_socket->register_command("dump_ops_in_flight " \
3792 "name=filterstr,type=CephString,n=N,req=false",
3793 asok_hook,
3794 "show the ops currently in flight");
3795 ceph_assert(r == 0);
3796 r = admin_socket->register_command("ops " \
3797 "name=filterstr,type=CephString,n=N,req=false",
3798 asok_hook,
3799 "show the ops currently in flight");
3800 ceph_assert(r == 0);
3801 r = admin_socket->register_command("dump_blocked_ops " \
3802 "name=filterstr,type=CephString,n=N,req=false",
3803 asok_hook,
3804 "show the blocked ops currently in flight");
3805 ceph_assert(r == 0);
3806 r = admin_socket->register_command("dump_historic_ops " \
3807 "name=filterstr,type=CephString,n=N,req=false",
3808 asok_hook,
3809 "show recent ops");
3810 ceph_assert(r == 0);
3811 r = admin_socket->register_command("dump_historic_slow_ops " \
3812 "name=filterstr,type=CephString,n=N,req=false",
3813 asok_hook,
3814 "show slowest recent ops");
3815 ceph_assert(r == 0);
3816 r = admin_socket->register_command("dump_historic_ops_by_duration " \
3817 "name=filterstr,type=CephString,n=N,req=false",
3818 asok_hook,
3819 "show slowest recent ops, sorted by duration");
3820 ceph_assert(r == 0);
3821 r = admin_socket->register_command("dump_op_pq_state",
3822 asok_hook,
3823 "dump op priority queue state");
3824 ceph_assert(r == 0);
3825 r = admin_socket->register_command("dump_blocklist",
3826 asok_hook,
3827 "dump blocklisted clients and times");
3828 ceph_assert(r == 0);
3829 r = admin_socket->register_command("dump_watchers",
3830 asok_hook,
3831 "show clients which have active watches,"
3832 " and on which objects");
3833 ceph_assert(r == 0);
3834 r = admin_socket->register_command("dump_recovery_reservations",
3835 asok_hook,
3836 "show recovery reservations");
3837 ceph_assert(r == 0);
3838 r = admin_socket->register_command("dump_scrub_reservations",
3839 asok_hook,
3840 "show scrub reservations");
3841 ceph_assert(r == 0);
3842 r = admin_socket->register_command("get_latest_osdmap",
3843 asok_hook,
3844 "force osd to update the latest map from "
3845 "the mon");
3846 ceph_assert(r == 0);
3847
3848 r = admin_socket->register_command("set_heap_property " \
3849 "name=property,type=CephString " \
3850 "name=value,type=CephInt",
3851 asok_hook,
3852 "update malloc extension heap property");
3853 ceph_assert(r == 0);
3854
3855 r = admin_socket->register_command("get_heap_property " \
3856 "name=property,type=CephString",
3857 asok_hook,
3858 "get malloc extension heap property");
3859 ceph_assert(r == 0);
3860
3861 r = admin_socket->register_command("dump_objectstore_kv_stats",
3862 asok_hook,
3863 "print statistics of the kvdb used by bluestore");
3864 ceph_assert(r == 0);
3865
3866 r = admin_socket->register_command("dump_scrubs",
3867 asok_hook,
3868 "print scheduled scrubs");
3869 ceph_assert(r == 0);
3870
3871 r = admin_socket->register_command("calc_objectstore_db_histogram",
3872 asok_hook,
3873 "Generate key value histogram of the kvdb (rocksdb) used by bluestore");
3874 ceph_assert(r == 0);
3875
3876 r = admin_socket->register_command("flush_store_cache",
3877 asok_hook,
3878 "Flush bluestore internal cache");
3879 ceph_assert(r == 0);
3880 r = admin_socket->register_command("dump_pgstate_history",
3881 asok_hook,
3882 "show recent state history");
3883 ceph_assert(r == 0);
3884
3885 r = admin_socket->register_command("compact",
3886 asok_hook,
3887 "Compact object store's omap."
3888 " WARNING: Compaction probably slows your requests");
3889 ceph_assert(r == 0);
3890
3891 r = admin_socket->register_command("get_mapped_pools",
3892 asok_hook,
3893 "dump pools whose PG(s) are mapped to this OSD.");
3894
3895 ceph_assert(r == 0);
3896
3897 r = admin_socket->register_command("smart name=devid,type=CephString,req=false",
3898 asok_hook,
3899 "probe OSD devices for SMART data.");
3900
3901 ceph_assert(r == 0);
3902
3903 r = admin_socket->register_command("list_devices",
3904 asok_hook,
3905 "list OSD devices.");
ceph_assert(r == 0);
3906 r = admin_socket->register_command("send_beacon",
3907 asok_hook,
3908 "send OSD beacon to mon immediately");
ceph_assert(r == 0);
3909
3910 r = admin_socket->register_command(
3911 "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
3912 "Dump osd heartbeat network ping times");
3913 ceph_assert(r == 0);
3914
3915 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
3916 // Note: pools are CephString instead of CephPoolname because
3917 // these commands traditionally support both pool names and numbers
3918 r = admin_socket->register_command(
3919 "setomapval " \
3920 "name=pool,type=CephString " \
3921 "name=objname,type=CephObjectname " \
3922 "name=key,type=CephString "\
3923 "name=val,type=CephString",
3924 test_ops_hook,
3925 "set omap key");
3926 ceph_assert(r == 0);
3927 r = admin_socket->register_command(
3928 "rmomapkey " \
3929 "name=pool,type=CephString " \
3930 "name=objname,type=CephObjectname " \
3931 "name=key,type=CephString",
3932 test_ops_hook,
3933 "remove omap key");
3934 ceph_assert(r == 0);
3935 r = admin_socket->register_command(
3936 "setomapheader " \
3937 "name=pool,type=CephString " \
3938 "name=objname,type=CephObjectname " \
3939 "name=header,type=CephString",
3940 test_ops_hook,
3941 "set omap header");
3942 ceph_assert(r == 0);
3943
3944 r = admin_socket->register_command(
3945 "getomap " \
3946 "name=pool,type=CephString " \
3947 "name=objname,type=CephObjectname",
3948 test_ops_hook,
3949 "output entire object map");
3950 ceph_assert(r == 0);
3951
3952 r = admin_socket->register_command(
3953 "truncobj " \
3954 "name=pool,type=CephString " \
3955 "name=objname,type=CephObjectname " \
3956 "name=len,type=CephInt",
3957 test_ops_hook,
3958 "truncate object to length");
3959 ceph_assert(r == 0);
3960
3961 r = admin_socket->register_command(
3962 "injectdataerr " \
3963 "name=pool,type=CephString " \
3964 "name=objname,type=CephObjectname " \
3965 "name=shardid,type=CephInt,req=false,range=0|255",
3966 test_ops_hook,
3967 "inject data error to an object");
3968 ceph_assert(r == 0);
3969
3970 r = admin_socket->register_command(
3971 "injectmdataerr " \
3972 "name=pool,type=CephString " \
3973 "name=objname,type=CephObjectname " \
3974 "name=shardid,type=CephInt,req=false,range=0|255",
3975 test_ops_hook,
3976 "inject metadata error to an object");
3977 ceph_assert(r == 0);
3978 r = admin_socket->register_command(
3979 "set_recovery_delay " \
3980 "name=utime,type=CephInt,req=false",
3981 test_ops_hook,
3982 "Delay osd recovery by specified seconds");
3983 ceph_assert(r == 0);
3984 r = admin_socket->register_command(
3985 "injectfull " \
3986 "name=type,type=CephString,req=false " \
3987 "name=count,type=CephInt,req=false ",
3988 test_ops_hook,
3989 "Inject a full disk (optional count times)");
3990 ceph_assert(r == 0);
3991 r = admin_socket->register_command(
3992 "bench " \
3993 "name=count,type=CephInt,req=false " \
3994 "name=size,type=CephInt,req=false " \
3995 "name=object_size,type=CephInt,req=false " \
3996 "name=object_num,type=CephInt,req=false ",
3997 asok_hook,
3998 "OSD benchmark: write <count> <size>-byte objects (with <object_size> <object_num>), " \
3999 "(default count=1G, default size=4MB). Results in log.");
4000 ceph_assert(r == 0);
4001 r = admin_socket->register_command(
4002 "cluster_log " \
4003 "name=level,type=CephChoices,strings=error,warning,info,debug " \
4004 "name=message,type=CephString,n=N",
4005 asok_hook,
4006 "log a message to the cluster log");
4007 ceph_assert(r == 0);
4008 r = admin_socket->register_command(
4009 "flush_pg_stats",
4010 asok_hook,
4011 "flush pg stats");
4012 ceph_assert(r == 0);
4013 r = admin_socket->register_command(
4014 "heap " \
4015 "name=heapcmd,type=CephChoices,strings=" \
4016 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
4017 "name=value,type=CephString,req=false",
4018 asok_hook,
4019 "show heap usage info (available only if compiled with tcmalloc)");
4020 ceph_assert(r == 0);
4021 r = admin_socket->register_command(
4022 "debug dump_missing " \
4023 "name=filename,type=CephFilepath",
4024 asok_hook,
4025 "dump missing objects to a named file");
4026 ceph_assert(r == 0);
4027 r = admin_socket->register_command(
4028 "debug kick_recovery_wq " \
4029 "name=delay,type=CephInt,range=0",
4030 asok_hook,
4031 "set osd_recovery_delay_start to <val>");
4032 ceph_assert(r == 0);
4033 r = admin_socket->register_command(
4034 "cpu_profiler " \
4035 "name=arg,type=CephChoices,strings=status|flush",
4036 asok_hook,
4037 "run cpu profiling on daemon");
4038 ceph_assert(r == 0);
4039 r = admin_socket->register_command(
4040 "dump_pg_recovery_stats",
4041 asok_hook,
4042 "dump pg recovery statistics");
4043 ceph_assert(r == 0);
4044 r = admin_socket->register_command(
4045 "reset_pg_recovery_stats",
4046 asok_hook,
4047 "reset pg recovery statistics");
4048 ceph_assert(r == 0);
4049 r = admin_socket->register_command(
4050 "cache drop",
4051 asok_hook,
4052 "Drop all OSD caches");
4053 ceph_assert(r == 0);
4054 r = admin_socket->register_command(
4055 "cache status",
4056 asok_hook,
4057 "Get OSD caches statistics");
4058 ceph_assert(r == 0);
4059 r = admin_socket->register_command(
4060 "scrub_purged_snaps",
4061 asok_hook,
4062 "Scrub purged_snaps vs snapmapper index");
4063 ceph_assert(r == 0);
4064
4065 // -- pg commands --
4066 // old form: ceph pg <pgid> command ...
4067 r = admin_socket->register_command(
4068 "pg " \
4069 "name=pgid,type=CephPgid " \
4070 "name=cmd,type=CephChoices,strings=query",
4071 asok_hook,
4072 "");
4073 ceph_assert(r == 0);
4074 r = admin_socket->register_command(
4075 "pg " \
4076 "name=pgid,type=CephPgid " \
4077 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
4078 "name=mulcmd,type=CephChoices,strings=revert|delete",
4079 asok_hook,
4080 "");
4081 ceph_assert(r == 0);
4082 r = admin_socket->register_command(
4083 "pg " \
4084 "name=pgid,type=CephPgid " \
4085 "name=cmd,type=CephChoices,strings=list_unfound " \
4086 "name=offset,type=CephString,req=false",
4087 asok_hook,
4088 "");
4089 ceph_assert(r == 0);
4090 r = admin_socket->register_command(
4091 "pg " \
4092 "name=pgid,type=CephPgid " \
4093 "name=cmd,type=CephChoices,strings=scrub " \
4094 "name=time,type=CephInt,req=false",
4095 asok_hook,
4096 "");
4097 ceph_assert(r == 0);
4098 r = admin_socket->register_command(
4099 "pg " \
4100 "name=pgid,type=CephPgid " \
4101 "name=cmd,type=CephChoices,strings=deep_scrub " \
4102 "name=time,type=CephInt,req=false",
4103 asok_hook,
4104 "");
4105 ceph_assert(r == 0);
4106 // new form: tell <pgid> <cmd> for both cli and rest
4107 r = admin_socket->register_command(
4108 "query",
4109 asok_hook,
4110 "show details of a specific pg");
4111 ceph_assert(r == 0);
4112 r = admin_socket->register_command(
4113 "mark_unfound_lost " \
4114 "name=pgid,type=CephPgid,req=false " \
4115 "name=mulcmd,type=CephChoices,strings=revert|delete",
4116 asok_hook,
4117 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
4118 ceph_assert(r == 0);
4119 r = admin_socket->register_command(
4120 "list_unfound " \
4121 "name=pgid,type=CephPgid,req=false " \
4122 "name=offset,type=CephString,req=false",
4123 asok_hook,
4124 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4125 ceph_assert(r == 0);
4126 r = admin_socket->register_command(
4127 "scrub " \
4128 "name=pgid,type=CephPgid,req=false " \
4129 "name=time,type=CephInt,req=false",
4130 asok_hook,
4131 "Trigger a scheduled scrub");
4132 ceph_assert(r == 0);
4133 r = admin_socket->register_command(
4134 "deep_scrub " \
4135 "name=pgid,type=CephPgid,req=false " \
4136 "name=time,type=CephInt,req=false",
4137 asok_hook,
4138 "Trigger a scheduled deep scrub");
4139 ceph_assert(r == 0);
4140 }
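// Editor's note: everything registered above is reachable through the
// daemon's admin socket. Illustrative invocations (assuming the default
// socket path and an OSD id of 0):
//
//   ceph daemon osd.0 status
//   ceph daemon osd.0 dump_ops_in_flight
//   ceph tell <pgid> query        # the "new form" pg commands noted above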
4141
4142 PerfCounters* OSD::create_logger()
4143 {
4144 PerfCounters* logger = build_osd_logger(cct);
4145 cct->get_perfcounters_collection()->add(logger);
4146 return logger;
4147 }
4148
4149 PerfCounters* OSD::create_recoverystate_perf()
4150 {
4151 PerfCounters* recoverystate_perf = build_recoverystate_perf(cct);
4152 cct->get_perfcounters_collection()->add(recoverystate_perf);
4153 return recoverystate_perf;
4154 }
4155
4156 int OSD::shutdown()
4157 {
4158 if (cct->_conf->osd_fast_shutdown) {
4159 derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
4160 if (cct->_conf->osd_fast_shutdown_notify_mon)
4161 service.prepare_to_stop();
4162 cct->_log->flush();
4163 _exit(0);
4164 }
4165
4166 if (!service.prepare_to_stop())
4167 return 0; // already shutting down
4168 osd_lock.lock();
4169 if (is_stopping()) {
4170 osd_lock.unlock();
4171 return 0;
4172 }
4173 dout(0) << "shutdown" << dendl;
4174
4175 set_state(STATE_STOPPING);
4176
4177 // Debugging
4178 if (cct->_conf.get_val<bool>("osd_debug_shutdown")) {
4179 cct->_conf.set_val("debug_osd", "100");
4180 cct->_conf.set_val("debug_journal", "100");
4181 cct->_conf.set_val("debug_filestore", "100");
4182 cct->_conf.set_val("debug_bluestore", "100");
4183 cct->_conf.set_val("debug_ms", "100");
4184 cct->_conf.apply_changes(nullptr);
4185 }
4186
4187 // stop MgrClient earlier, as it's effectively an internal consumer of the OSD
4188 mgrc.shutdown();
4189
4190 service.start_shutdown();
4191
4192 // stop sending work to pgs. this just prevents any new work in _process
4193 // from racing with on_shutdown and entering the pg after it has shut down.
4194 op_shardedwq.drain();
4195
4196 // Shutdown PGs
4197 {
4198 vector<PGRef> pgs;
4199 _get_pgs(&pgs);
4200 for (auto pg : pgs) {
4201 pg->shutdown();
4202 }
4203 }
4204
4205 // drain op queue again (in case PGs requeued something)
4206 op_shardedwq.drain();
4207 {
4208 finished.clear(); // zap waiters (bleh, this is messy)
4209 waiting_for_osdmap.clear();
4210 }
4211
4212 // unregister commands
4213 cct->get_admin_socket()->unregister_commands(asok_hook);
4214 delete asok_hook;
4215 asok_hook = NULL;
4216
4217 cct->get_admin_socket()->unregister_commands(test_ops_hook);
4218 delete test_ops_hook;
4219 test_ops_hook = NULL;
4220
4221 osd_lock.unlock();
4222
4223 {
4224 std::lock_guard l{heartbeat_lock};
4225 heartbeat_stop = true;
4226 heartbeat_cond.notify_all();
4227 heartbeat_peers.clear();
4228 }
4229 heartbeat_thread.join();
4230
4231 hb_back_server_messenger->mark_down_all();
4232 hb_front_server_messenger->mark_down_all();
4233 hb_front_client_messenger->mark_down_all();
4234 hb_back_client_messenger->mark_down_all();
4235
4236 osd_op_tp.drain();
4237 osd_op_tp.stop();
4238 dout(10) << "op sharded tp stopped" << dendl;
4239
4240 dout(10) << "stopping agent" << dendl;
4241 service.agent_stop();
4242
4243 boot_finisher.wait_for_empty();
4244
4245 osd_lock.lock();
4246
4247 boot_finisher.stop();
4248 reset_heartbeat_peers(true);
4249
4250 tick_timer.shutdown();
4251
4252 {
4253 std::lock_guard l(tick_timer_lock);
4254 tick_timer_without_osd_lock.shutdown();
4255 }
4256
4257 // note unmount epoch
4258 dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl;
4259 superblock.mounted = service.get_boot_epoch();
4260 superblock.clean_thru = get_osdmap_epoch();
4261 ObjectStore::Transaction t;
4262 write_superblock(t);
4263 int r = store->queue_transaction(service.meta_ch, std::move(t));
4264 if (r) {
4265 derr << "OSD::shutdown: error writing superblock: "
4266 << cpp_strerror(r) << dendl;
4267 }
4268
4269
4270 service.shutdown_reserver();
4271
4272 // Remove PGs
4273 #ifdef PG_DEBUG_REFS
4274 service.dump_live_pgids();
4275 #endif
4276 while (true) {
4277 vector<PGRef> pgs;
4278 _get_pgs(&pgs, true);
4279 if (pgs.empty()) {
4280 break;
4281 }
4282 for (auto& pg : pgs) {
4283 if (pg->is_deleted()) {
4284 continue;
4285 }
4286 dout(20) << " kicking pg " << pg << dendl;
4287 pg->lock();
4288 if (pg->get_num_ref() != 1) {
4289 derr << "pgid " << pg->get_pgid() << " has ref count of "
4290 << pg->get_num_ref() << dendl;
4291 #ifdef PG_DEBUG_REFS
4292 pg->dump_live_ids();
4293 #endif
4294 if (cct->_conf->osd_shutdown_pgref_assert) {
4295 ceph_abort();
4296 }
4297 }
4298 pg->ch.reset();
4299 pg->unlock();
4300 }
4301 }
4302 #ifdef PG_DEBUG_REFS
4303 service.dump_live_pgids();
4304 #endif
4305
4306 osd_lock.unlock();
4307 cct->_conf.remove_observer(this);
4308 osd_lock.lock();
4309
4310 service.meta_ch.reset();
4311
4312 dout(10) << "syncing store" << dendl;
4313 enable_disable_fuse(true);
4314
4315 if (cct->_conf->osd_journal_flush_on_shutdown) {
4316 dout(10) << "flushing journal" << dendl;
4317 store->flush_journal();
4318 }
4319
4320 monc->shutdown();
4321 osd_lock.unlock();
4322 {
4323 std::unique_lock l{map_lock};
4324 set_osdmap(OSDMapRef());
4325 }
4326 for (auto s : shards) {
4327 std::lock_guard l(s->osdmap_lock);
4328 s->shard_osdmap = OSDMapRef();
4329 }
4330 service.shutdown();
4331
4332 std::lock_guard lock(osd_lock);
4333 store->umount();
4334 delete store;
4335 store = nullptr;
4336 dout(10) << "Store synced" << dendl;
4337
4338 op_tracker.on_shutdown();
4339
4340 ClassHandler::get_instance().shutdown();
4341 client_messenger->shutdown();
4342 cluster_messenger->shutdown();
4343 hb_front_client_messenger->shutdown();
4344 hb_back_client_messenger->shutdown();
4345 objecter_messenger->shutdown();
4346 hb_front_server_messenger->shutdown();
4347 hb_back_server_messenger->shutdown();
4348
4349 return r;
4350 }
4351
4352 int OSD::mon_cmd_maybe_osd_create(string &cmd)
4353 {
4354 bool created = false;
4355 while (true) {
4356 dout(10) << __func__ << " cmd: " << cmd << dendl;
4357 vector<string> vcmd{cmd};
4358 bufferlist inbl;
4359 C_SaferCond w;
4360 string outs;
4361 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
4362 int r = w.wait();
4363 if (r < 0) {
4364 if (r == -ENOENT && !created) {
4365 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
4366 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
4367 vector<string> vnewcmd{newcmd};
4368 bufferlist inbl;
4369 C_SaferCond w;
4370 string outs;
4371 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
4372 int r = w.wait();
4373 if (r < 0) {
4374 derr << __func__ << " fail: osd does not exist and create failed: "
4375 << cpp_strerror(r) << dendl;
4376 return r;
4377 }
4378 created = true;
4379 continue;
4380 }
4381 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
4382 return r;
4383 }
4384 break;
4385 }
4386
4387 return 0;
4388 }
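// Editor's sketch (excluded from the build): the synchronous mon-command
// idiom used above, reduced to its essentials; all calls appear verbatim
// in the function body.
#if 0
{
  vector<string> vcmd{cmd};    // JSON command string, as built by callers
  bufferlist inbl;             // no input payload
  C_SaferCond w;               // completion object we can block on
  string outs;
  monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
  int r = w.wait();            // blocks until the mon replies; <0 on error
}
#endif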
4389
4390 int OSD::update_crush_location()
4391 {
4392 if (!cct->_conf->osd_crush_update_on_start) {
4393 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
4394 return 0;
4395 }
4396
4397 char weight[32];
4398 if (cct->_conf->osd_crush_initial_weight >= 0) {
4399 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
4400 } else {
4401 struct store_statfs_t st;
4402 osd_alert_list_t alerts;
4403 int r = store->statfs(&st, &alerts);
4404 if (r < 0) {
4405 derr << "statfs: " << cpp_strerror(r) << dendl;
4406 return r;
4407 }
4408 snprintf(weight, sizeof(weight), "%.4lf",
4409 std::max(.00001,
4410 double(st.total) /
4411 double(1ull << 40 /* TiB */)));
4412 }
4413
4414 dout(10) << __func__ << " crush location is " << cct->crush_location << dendl;
4415
4416 string cmd =
4417 string("{\"prefix\": \"osd crush create-or-move\", ") +
4418 string("\"id\": ") + stringify(whoami) + ", " +
4419 string("\"weight\":") + weight + ", " +
4420 string("\"args\": [") + stringify(cct->crush_location) + "]}";
4421 return mon_cmd_maybe_osd_create(cmd);
4422 }
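// Editor's note: a worked example of the default-weight computation above.
// With osd_crush_initial_weight unset (<0), a device whose statfs st.total
// is 4398046511104 bytes (4 TiB) yields 4398046511104 / 2^40 = 4.0, so the
// command carries "weight": 4.0000; the std::max(.00001, ...) floor keeps
// very small devices from receiving a zero CRUSH weight.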
4423
4424 int OSD::update_crush_device_class()
4425 {
4426 if (!cct->_conf->osd_class_update_on_start) {
4427 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
4428 return 0;
4429 }
4430
4431 string device_class;
4432 int r = store->read_meta("crush_device_class", &device_class);
4433 if (r < 0 || device_class.empty()) {
4434 device_class = store->get_default_device_class();
4435 }
4436
4437 if (device_class.empty()) {
4438 dout(20) << __func__ << " no device class stored locally" << dendl;
4439 return 0;
4440 }
4441
4442 string cmd =
4443 string("{\"prefix\": \"osd crush set-device-class\", ") +
4444 string("\"class\": \"") + device_class + string("\", ") +
4445 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
4446
4447 r = mon_cmd_maybe_osd_create(cmd);
4448 if (r == -EBUSY) {
4449 // good, already bound to a device-class
4450 return 0;
4451 } else {
4452 return r;
4453 }
4454 }
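// Editor's note: for an OSD with whoami == 0 and a stored class of "ssd"
// (hypothetical values), the command built above is
//   {"prefix": "osd crush set-device-class", "class": "ssd", "ids": ["0"]}
// and an -EBUSY reply simply means the id is already bound to a class,
// which is why it is treated as success.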
4455
4456 void OSD::write_superblock(ObjectStore::Transaction& t)
4457 {
4458 dout(10) << "write_superblock " << superblock << dendl;
4459
4460 // hack: at minimum it's using the baseline feature set
4461 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
4462 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4463
4464 bufferlist bl;
4465 encode(superblock, bl);
4466 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
4467 }
4468
4469 int OSD::read_superblock()
4470 {
4471 bufferlist bl;
4472 int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
4473 if (r < 0)
4474 return r;
4475
4476 auto p = bl.cbegin();
4477 decode(superblock, p);
4478
4479 dout(10) << "read_superblock " << superblock << dendl;
4480
4481 return 0;
4482 }
4483
4484 void OSD::clear_temp_objects()
4485 {
4486 dout(10) << __func__ << dendl;
4487 vector<coll_t> ls;
4488 store->list_collections(ls);
4489 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
4490 spg_t pgid;
4491 if (!p->is_pg(&pgid))
4492 continue;
4493
4494 // list temp objects
4495 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
4496
4497 vector<ghobject_t> temps;
4498 ghobject_t next;
4499 while (1) {
4500 vector<ghobject_t> objects;
4501 auto ch = store->open_collection(*p);
4502 ceph_assert(ch);
4503 store->collection_list(ch, next, ghobject_t::get_max(),
4504 store->get_ideal_list_max(),
4505 &objects, &next);
4506 if (objects.empty())
4507 break;
4508 vector<ghobject_t>::iterator q;
4509 for (q = objects.begin(); q != objects.end(); ++q) {
4510 // Hammer set pool for temps to -1, so check for clean-up
4511 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
4512 temps.push_back(*q);
4513 } else {
4514 break;
4515 }
4516 }
4517 // If we saw a non-temp object and hit the break above we can
4518 // break out of the while loop too.
4519 if (q != objects.end())
4520 break;
4521 }
4522 if (!temps.empty()) {
4523 ObjectStore::Transaction t;
4524 int removed = 0;
4525 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
4526 dout(20) << " removing " << *p << " object " << *q << dendl;
4527 t.remove(*p, *q);
4528 if (++removed > cct->_conf->osd_target_transaction_size) {
4529 store->queue_transaction(service.meta_ch, std::move(t));
4530 t = ObjectStore::Transaction();
4531 removed = 0;
4532 }
4533 }
4534 if (removed) {
4535 store->queue_transaction(service.meta_ch, std::move(t));
4536 }
4537 }
4538 }
4539 }
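// Editor's sketch (excluded from the build): the transaction-batching idiom
// used by clear_temp_objects() above, and again by
// recursive_remove_collection() below, isolated for clarity. "doomed" and
// "cid" are hypothetical stand-ins for the work list and collection.
#if 0
ObjectStore::Transaction t;
int batched = 0;
for (auto& obj : doomed) {
  t.remove(cid, obj);
  // flush once the transaction grows past osd_target_transaction_size
  if (++batched > cct->_conf->osd_target_transaction_size) {
    store->queue_transaction(service.meta_ch, std::move(t));
    t = ObjectStore::Transaction();   // start a fresh transaction
    batched = 0;
  }
}
if (batched)                          // flush the final partial batch
  store->queue_transaction(service.meta_ch, std::move(t));
#endif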
4540
4541 void OSD::recursive_remove_collection(CephContext* cct,
4542 ObjectStore *store, spg_t pgid,
4543 coll_t tmp)
4544 {
4545 OSDriver driver(
4546 store,
4547 coll_t(),
4548 make_snapmapper_oid());
4549
4550 ObjectStore::CollectionHandle ch = store->open_collection(tmp);
4551 ObjectStore::Transaction t;
4552 SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);
4553
4554 ghobject_t next;
4555 int max = cct->_conf->osd_target_transaction_size;
4556 vector<ghobject_t> objects;
4557 objects.reserve(max);
4558 while (true) {
4559 objects.clear();
4560 store->collection_list(ch, next, ghobject_t::get_max(),
4561 max, &objects, &next);
4562 generic_dout(10) << __func__ << " " << objects << dendl;
4563 if (objects.empty())
4564 break;
4565 for (auto& p: objects) {
4566 OSDriver::OSTransaction _t(driver.get_transaction(&t));
4567 int r = mapper.remove_oid(p.hobj, &_t);
4568 if (r != 0 && r != -ENOENT)
4569 ceph_abort();
4570 t.remove(tmp, p);
4571 }
4572 int r = store->queue_transaction(ch, std::move(t));
4573 ceph_assert(r == 0);
4574 t = ObjectStore::Transaction();
4575 }
4576 t.remove_collection(tmp);
4577 int r = store->queue_transaction(ch, std::move(t));
4578 ceph_assert(r == 0);
4579
4580 C_SaferCond waiter;
4581 if (!ch->flush_commit(&waiter)) {
4582 waiter.wait();
4583 }
4584 }
4585
4586
4587 // ======================================================
4588 // PG's
4589
4590 PG* OSD::_make_pg(
4591 OSDMapRef createmap,
4592 spg_t pgid)
4593 {
4594 dout(10) << __func__ << " " << pgid << dendl;
4595 pg_pool_t pi;
4596 map<string,string> ec_profile;
4597 string name;
4598 if (createmap->have_pg_pool(pgid.pool())) {
4599 pi = *createmap->get_pg_pool(pgid.pool());
4600 name = createmap->get_pool_name(pgid.pool());
4601 if (pi.is_erasure()) {
4602 ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
4603 }
4604 } else {
4605 // pool was deleted; grab final pg_pool_t off disk.
4606 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
4607 bufferlist bl;
4608 int r = store->read(service.meta_ch, oid, 0, 0, bl);
4609 if (r < 0) {
4610 derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
4611 << dendl;
4612 return nullptr;
4613 }
4614 ceph_assert(r >= 0);
4615 auto p = bl.cbegin();
4616 decode(pi, p);
4617 decode(name, p);
4618 if (p.end()) { // dev release v13.0.2 did not include ec_profile
4619 derr << __func__ << " missing ec_profile from pool " << pgid.pool()
4620 << " tombstone" << dendl;
4621 return nullptr;
4622 }
4623 decode(ec_profile, p);
4624 }
4625 PGPool pool(createmap, pgid.pool(), pi, name);
4626 PG *pg;
4627 if (pi.type == pg_pool_t::TYPE_REPLICATED ||
4628 pi.type == pg_pool_t::TYPE_ERASURE)
4629 pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
4630 else
4631 ceph_abort();
4632 return pg;
4633 }
4634
4635 void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
4636 {
4637 v->clear();
4638 v->reserve(get_num_pgs());
4639 for (auto& s : shards) {
4640 std::lock_guard l(s->shard_lock);
4641 for (auto& j : s->pg_slots) {
4642 if (j.second->pg &&
4643 !j.second->pg->is_deleted()) {
4644 v->push_back(j.second->pg);
4645 if (clear_too) {
4646 s->_detach_pg(j.second.get());
4647 }
4648 }
4649 }
4650 }
4651 }
4652
4653 void OSD::_get_pgids(vector<spg_t> *v)
4654 {
4655 v->clear();
4656 v->reserve(get_num_pgs());
4657 for (auto& s : shards) {
4658 std::lock_guard l(s->shard_lock);
4659 for (auto& j : s->pg_slots) {
4660 if (j.second->pg &&
4661 !j.second->pg->is_deleted()) {
4662 v->push_back(j.first);
4663 }
4664 }
4665 }
4666 }
4667
4668 void OSD::register_pg(PGRef pg)
4669 {
4670 spg_t pgid = pg->get_pgid();
4671 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4672 auto sdata = shards[shard_index];
4673 std::lock_guard l(sdata->shard_lock);
4674 auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
4675 ceph_assert(r.second);
4676 auto *slot = r.first->second.get();
4677 dout(20) << __func__ << " " << pgid << " " << pg << dendl;
4678 sdata->_attach_pg(slot, pg.get());
4679 }
4680
4681 bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
4682 {
4683 auto sdata = pg->osd_shard;
4684 ceph_assert(sdata);
4685 {
4686 std::lock_guard l(sdata->shard_lock);
4687 auto p = sdata->pg_slots.find(pg->pg_id);
4688 if (p == sdata->pg_slots.end() ||
4689 !p->second->pg) {
4690 dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
4691 return false;
4692 }
4693 if (p->second->waiting_for_merge_epoch) {
4694 dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
4695 return false;
4696 }
4697 dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
4698 sdata->_detach_pg(p->second.get());
4699 }
4700
4701 for (auto shard : shards) {
4702 shard->unprime_split_children(pg->pg_id, old_pg_num);
4703 }
4704
4705 // update pg count now since we might not get an osdmap any time soon.
4706 if (pg->is_primary())
4707 service.logger->dec(l_osd_pg_primary);
4708 else if (pg->is_nonprimary())
4709 service.logger->dec(l_osd_pg_replica); // misnomer
4710 else
4711 service.logger->dec(l_osd_pg_stray);
4712
4713 return true;
4714 }
4715
4716 PGRef OSD::_lookup_pg(spg_t pgid)
4717 {
4718 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4719 auto sdata = shards[shard_index];
4720 std::lock_guard l(sdata->shard_lock);
4721 auto p = sdata->pg_slots.find(pgid);
4722 if (p == sdata->pg_slots.end()) {
4723 return nullptr;
4724 }
4725 return p->second->pg;
4726 }
4727
4728 PGRef OSD::_lookup_lock_pg(spg_t pgid)
4729 {
4730 PGRef pg = _lookup_pg(pgid);
4731 if (!pg) {
4732 return nullptr;
4733 }
4734 pg->lock();
4735 if (!pg->is_deleted()) {
4736 return pg;
4737 }
4738 pg->unlock();
4739 return nullptr;
4740 }
4741
4742 PGRef OSD::lookup_lock_pg(spg_t pgid)
4743 {
4744 return _lookup_lock_pg(pgid);
4745 }
4746
4747 void OSD::load_pgs()
4748 {
4749 ceph_assert(ceph_mutex_is_locked(osd_lock));
4750 dout(0) << "load_pgs" << dendl;
4751
4752 {
4753 auto pghist = make_pg_num_history_oid();
4754 bufferlist bl;
4755 int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
4756 if (r >= 0 && bl.length() > 0) {
4757 auto p = bl.cbegin();
4758 decode(pg_num_history, p);
4759 }
4760 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
4761 }
4762
4763 vector<coll_t> ls;
4764 int r = store->list_collections(ls);
4765 if (r < 0) {
4766 derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
4767 }
4768
4769 int num = 0;
4770 for (vector<coll_t>::iterator it = ls.begin();
4771 it != ls.end();
4772 ++it) {
4773 spg_t pgid;
4774 if (it->is_temp(&pgid) ||
4775 (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
4776 dout(10) << "load_pgs " << *it
4777 << " removing: legacy or flagged-for-removal pg" << dendl;
4778 recursive_remove_collection(cct, store, pgid, *it);
4779 continue;
4780 }
4781
4782 if (!it->is_pg(&pgid)) {
4783 dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
4784 continue;
4785 }
4786
4787 dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
4788 epoch_t map_epoch = 0;
4789 int r = PG::peek_map_epoch(store, pgid, &map_epoch);
4790 if (r < 0) {
4791 derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
4792 << dendl;
4793 continue;
4794 }
4795
4796 PGRef pg;
4797 if (map_epoch > 0) {
4798 OSDMapRef pgosdmap = service.try_get_map(map_epoch);
4799 if (!pgosdmap) {
4800 if (!get_osdmap()->have_pg_pool(pgid.pool())) {
4801 derr << __func__ << ": could not find map for epoch " << map_epoch
4802 << " on pg " << pgid << ", but the pool is not present in the "
4803 << "current map, so this is probably a result of bug 10617. "
4804 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4805 << "to clean it up later." << dendl;
4806 continue;
4807 } else {
4808 derr << __func__ << ": have pgid " << pgid << " at epoch "
4809 << map_epoch << ", but missing map. Crashing."
4810 << dendl;
4811 ceph_abort_msg("Missing map in load_pgs");
4812 }
4813 }
4814 pg = _make_pg(pgosdmap, pgid);
4815 } else {
4816 pg = _make_pg(get_osdmap(), pgid);
4817 }
4818 if (!pg) {
4819 recursive_remove_collection(cct, store, pgid, *it);
4820 continue;
4821 }
4822
4823 // there can be no waiters here, so we don't call _wake_pg_slot
4824
4825 pg->lock();
4826 pg->ch = store->open_collection(pg->coll);
4827
4828 // read pg state, log
4829 pg->read_state(store);
4830
4831 if (pg->dne()) {
4832 dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
4833 pg->ch = nullptr;
4834 pg->unlock();
4835 recursive_remove_collection(cct, store, pgid, *it);
4836 continue;
4837 }
4838 {
4839 uint32_t shard_index = pgid.hash_to_shard(shards.size());
4840 assert(NULL != shards[shard_index]);
4841 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
4842 }
4843
4844 pg->reg_next_scrub();
4845
4846 dout(10) << __func__ << " loaded " << *pg << dendl;
4847 pg->unlock();
4848
4849 register_pg(pg);
4850 ++num;
4851 }
4852 dout(0) << __func__ << " opened " << num << " pgs" << dendl;
4853 }
4854
4855
4856 PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
4857 const PGCreateInfo *info)
4858 {
4859 spg_t pgid = info->pgid;
4860
4861 if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
4862 dout(10) << __func__ << " hit max pg, dropping" << dendl;
4863 return nullptr;
4864 }
4865
4866 PeeringCtx rctx = create_context();
4867
4868 OSDMapRef startmap = get_map(info->epoch);
4869
4870 if (info->by_mon) {
4871 int64_t pool_id = pgid.pgid.pool();
4872 const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
4873 if (!pool) {
4874 dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
4875 return nullptr;
4876 }
4877 if (osdmap->require_osd_release >= ceph_release_t::nautilus &&
4878 !pool->has_flag(pg_pool_t::FLAG_CREATING)) {
4879 // this ensures we do not process old creating messages after the
4880 // pool's initial pgs have been created (and pgs are subsequently
4881 // allowed to split or merge).
4882 dout(20) << __func__ << " dropping " << pgid
4883 << " create, pool does not have CREATING flag set" << dendl;
4884 return nullptr;
4885 }
4886 }
4887
4888 int up_primary, acting_primary;
4889 vector<int> up, acting;
4890 startmap->pg_to_up_acting_osds(
4891 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4892
4893 const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
4894 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
4895 store->get_type() != "bluestore") {
4896 clog->warn() << "pg " << pgid
4897 << " is at risk of silent data corruption: "
4898 << "the pool allows ec overwrites but is not stored in "
4899 << "bluestore, so deep scrubbing will not detect bitrot";
4900 }
4901 create_pg_collection(
4902 rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
4903 init_pg_ondisk(rctx.transaction, pgid, pp);
4904
4905 int role = startmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
4906
4907 PGRef pg = _make_pg(startmap, pgid);
4908 pg->ch = store->create_new_collection(pg->coll);
4909
4910 {
4911 uint32_t shard_index = pgid.hash_to_shard(shards.size());
4912 assert(NULL != shards[shard_index]);
4913 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
4914 }
4915
4916 pg->lock(true);
4917
4918 // we are holding the shard lock
4919 ceph_assert(!pg->is_deleted());
4920
4921 pg->init(
4922 role,
4923 up,
4924 up_primary,
4925 acting,
4926 acting_primary,
4927 info->history,
4928 info->past_intervals,
4929 false,
4930 rctx.transaction);
4931
4932 pg->init_collection_pool_opts();
4933
4934 if (pg->is_primary()) {
4935 std::lock_guard locker{m_perf_queries_lock};
4936 pg->set_dynamic_perf_stats_queries(m_perf_queries);
4937 }
4938
4939 pg->handle_initialize(rctx);
4940 pg->handle_activate_map(rctx);
4941
4942 dispatch_context(rctx, pg.get(), osdmap, nullptr);
4943
4944 dout(10) << __func__ << " new pg " << *pg << dendl;
4945 return pg;
4946 }
4947
4948 bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
4949 spg_t pgid,
4950 bool is_mon_create)
4951 {
4952 const auto max_pgs_per_osd =
4953 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
4954 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
4955
4956 if (num_pgs < max_pgs_per_osd) {
4957 return false;
4958 }
4959
4960 std::lock_guard l(pending_creates_lock);
4961 if (is_mon_create) {
4962 pending_creates_from_mon++;
4963 } else {
4964 bool is_primary = osdmap->get_pg_acting_role(pgid, whoami) == 0;
4965 pending_creates_from_osd.emplace(pgid, is_primary);
4966 }
4967 dout(1) << __func__ << " withhold creation of pg " << pgid
4968 << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
4969 return true;
4970 }
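// Editor's note: a worked example of the limit above, assuming the defaults
// shipped with this release are unchanged (mon_max_pg_per_osd = 250,
// osd_max_pg_per_osd_hard_ratio = 3.0): creation is withheld once this OSD
// already hosts 250 * 3.0 = 750 PGs; resume_creating_pg() retries the
// deferred creations once PGs are removed.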
4971
4972 // to re-trigger peering, we have to twiddle the pg mapping a little bit,
4973 // see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() falls
4974 // back to the up set if pg_temp is empty, so an empty pg_temp won't work.
4975 static vector<int32_t> twiddle(const vector<int>& acting) {
4976 if (acting.size() > 1) {
4977 return {acting[0]};
4978 } else {
4979 vector<int32_t> twiddled(acting.begin(), acting.end());
4980 twiddled.push_back(-1);
4981 return twiddled;
4982 }
4983 }
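// Editor's note: e.g. twiddle({3,7}) returns {3} (drops a member) and
// twiddle({5}) returns {5,-1} (pads with -1); either way the proposed
// pg_temp differs from the current mapping, which is what forces the
// re-peering described above.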
4984
4985 void OSD::resume_creating_pg()
4986 {
4987 bool do_sub_pg_creates = false;
4988 bool have_pending_creates = false;
4989 {
4990 const auto max_pgs_per_osd =
4991 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
4992 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
4993 if (max_pgs_per_osd <= num_pgs) {
4994 // this could happen if the admin decreases this setting before a PG is removed
4995 return;
4996 }
4997 unsigned spare_pgs = max_pgs_per_osd - num_pgs;
4998 std::lock_guard l(pending_creates_lock);
4999 if (pending_creates_from_mon > 0) {
5000 dout(20) << __func__ << " pending_creates_from_mon "
5001 << pending_creates_from_mon << dendl;
5002 do_sub_pg_creates = true;
5003 if (pending_creates_from_mon >= spare_pgs) {
5004 spare_pgs = pending_creates_from_mon = 0;
5005 } else {
5006 spare_pgs -= pending_creates_from_mon;
5007 pending_creates_from_mon = 0;
5008 }
5009 }
5010 auto pg = pending_creates_from_osd.cbegin();
5011 while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
5012 dout(20) << __func__ << " pg " << pg->first << dendl;
5013 vector<int> acting;
5014 get_osdmap()->pg_to_up_acting_osds(pg->first.pgid, nullptr, nullptr, &acting, nullptr);
5015 service.queue_want_pg_temp(pg->first.pgid, twiddle(acting), true);
5016 pg = pending_creates_from_osd.erase(pg);
5017 do_sub_pg_creates = true;
5018 spare_pgs--;
5019 }
5020 have_pending_creates = (pending_creates_from_mon > 0 ||
5021 !pending_creates_from_osd.empty());
5022 }
5023
5024 bool do_renew_subs = false;
5025 if (do_sub_pg_creates) {
5026 if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
5027 dout(4) << __func__ << ": resolicit pg creates from mon since "
5028 << last_pg_create_epoch << dendl;
5029 do_renew_subs = true;
5030 }
5031 }
5032 version_t start = get_osdmap_epoch() + 1;
5033 if (have_pending_creates) {
5034 // don't miss any new osdmap that deletes PGs
5035 if (monc->sub_want("osdmap", start, 0)) {
5036 dout(4) << __func__ << ": resolicit osdmap from mon since "
5037 << start << dendl;
5038 do_renew_subs = true;
5039 }
5040 } else if (do_sub_pg_creates) {
5041 // no need to subscribe to the osdmap continuously anymore
5042 // once the pg_temp and/or mon_subscribe(pg_creates) is sent
5043 if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
5044 dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
5045 << start << dendl;
5046 do_renew_subs = true;
5047 }
5048 }
5049
5050 if (do_renew_subs) {
5051 monc->renew_subs();
5052 }
5053
5054 service.send_pg_temp();
5055 }
5056
5057 void OSD::build_initial_pg_history(
5058 spg_t pgid,
5059 epoch_t created,
5060 utime_t created_stamp,
5061 pg_history_t *h,
5062 PastIntervals *pi)
5063 {
5064 dout(10) << __func__ << " " << pgid << " created " << created << dendl;
5065 *h = pg_history_t(created, created_stamp);
5066
5067 OSDMapRef lastmap = service.get_map(created);
5068 int up_primary, acting_primary;
5069 vector<int> up, acting;
5070 lastmap->pg_to_up_acting_osds(
5071 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
5072
5073 ostringstream debug;
5074 for (epoch_t e = created + 1; e <= get_osdmap_epoch(); ++e) {
5075 OSDMapRef osdmap = service.get_map(e);
5076 int new_up_primary, new_acting_primary;
5077 vector<int> new_up, new_acting;
5078 osdmap->pg_to_up_acting_osds(
5079 pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);
5080
5081 // this is a bit imprecise, but sufficient?
5082 struct min_size_predicate_t : public IsPGRecoverablePredicate {
5083 const pg_pool_t *pi;
5084 bool operator()(const set<pg_shard_t> &have) const {
5085 return have.size() >= pi->min_size;
5086 }
5087 explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
5088 } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));
5089
5090 bool new_interval = PastIntervals::check_new_interval(
5091 acting_primary,
5092 new_acting_primary,
5093 acting, new_acting,
5094 up_primary,
5095 new_up_primary,
5096 up, new_up,
5097 h->same_interval_since,
5098 h->last_epoch_clean,
5099 osdmap.get(),
5100 lastmap.get(),
5101 pgid.pgid,
5102 min_size_predicate,
5103 pi,
5104 &debug);
5105 if (new_interval) {
5106 h->same_interval_since = e;
5107 if (up != new_up) {
5108 h->same_up_since = e;
5109 }
5110 if (acting_primary != new_acting_primary) {
5111 h->same_primary_since = e;
5112 }
5113 if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
5114 osdmap->get_pg_num(pgid.pgid.pool()),
5115 nullptr)) {
5116 h->last_epoch_split = e;
5117 }
5118 up = new_up;
5119 acting = new_acting;
5120 up_primary = new_up_primary;
5121 acting_primary = new_acting_primary;
5122 }
5123 lastmap = osdmap;
5124 }
5125 dout(20) << __func__ << " " << debug.str() << dendl;
5126 dout(10) << __func__ << " " << *h << " " << *pi
5127 << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
5128 pi->get_bounds()) << ")"
5129 << dendl;
5130 }
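// Editor's note: a small example of the walk above. Suppose a pg is created
// at epoch 10 with up = acting = [0,1] and the mapping changes to [0,2] at
// epoch 15: check_new_interval() flags epoch 15, so same_interval_since and
// same_up_since advance to 15, same_primary_since stays at 10 (osd.0 is
// still primary), and the closed-out [10,15) interval lands in *pi.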
5131
5132 void OSD::_add_heartbeat_peer(int p)
5133 {
5134 if (p == whoami)
5135 return;
5136 HeartbeatInfo *hi;
5137
5138 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
5139 if (i == heartbeat_peers.end()) {
5140 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, get_osdmap_epoch());
5141 if (!cons.first)
5142 return;
5143 assert(cons.second);
5144
5145 hi = &heartbeat_peers[p];
5146 hi->peer = p;
5147
5148 auto stamps = service.get_hb_stamps(p);
5149
5150 auto sb = ceph::make_ref<Session>(cct, cons.first.get());
5151 sb->peer = p;
5152 sb->stamps = stamps;
5153 hi->hb_interval_start = ceph_clock_now();
5154 hi->con_back = cons.first.get();
5155 hi->con_back->set_priv(sb);
5156
5157 auto sf = ceph::make_ref<Session>(cct, cons.second.get());
5158 sf->peer = p;
5159 sf->stamps = stamps;
5160 hi->con_front = cons.second.get();
5161 hi->con_front->set_priv(sf);
5162
5163 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5164 << " " << hi->con_back->get_peer_addr()
5165 << " " << hi->con_front->get_peer_addr()
5166 << dendl;
5167 } else {
5168 hi = &i->second;
5169 }
5170 hi->epoch = get_osdmap_epoch();
5171 }
5172
5173 void OSD::_remove_heartbeat_peer(int n)
5174 {
5175 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
5176 ceph_assert(q != heartbeat_peers.end());
5177 dout(20) << " removing heartbeat peer osd." << n
5178 << " " << q->second.con_back->get_peer_addr()
5179 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
5180 << dendl;
5181 q->second.clear_mark_down();
5182 heartbeat_peers.erase(q);
5183 }
5184
5185 void OSD::need_heartbeat_peer_update()
5186 {
5187 if (is_stopping())
5188 return;
5189 dout(20) << "need_heartbeat_peer_update" << dendl;
5190 heartbeat_set_peers_need_update();
5191 }
5192
5193 void OSD::maybe_update_heartbeat_peers()
5194 {
5195 ceph_assert(ceph_mutex_is_locked(osd_lock));
5196
5197 if (is_waiting_for_healthy() || is_active()) {
5198 utime_t now = ceph_clock_now();
5199 if (last_heartbeat_resample == utime_t()) {
5200 last_heartbeat_resample = now;
5201 heartbeat_set_peers_need_update();
5202 } else if (!heartbeat_peers_need_update()) {
5203 utime_t dur = now - last_heartbeat_resample;
5204 if (dur > cct->_conf->osd_heartbeat_grace) {
5205 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
5206 heartbeat_set_peers_need_update();
5207 last_heartbeat_resample = now;
5208 // automatically clean up any stale heartbeat peers
5209 // if we are unhealthy, then clean all
5210 reset_heartbeat_peers(is_waiting_for_healthy());
5211 }
5212 }
5213 }
5214
5215 if (!heartbeat_peers_need_update())
5216 return;
5217 heartbeat_clear_peers_need_update();
5218
5219 std::lock_guard l(heartbeat_lock);
5220
5221 dout(10) << "maybe_update_heartbeat_peers updating" << dendl;
5222
5223
5224 // build heartbeat from set
5225 if (is_active()) {
5226 vector<PGRef> pgs;
5227 _get_pgs(&pgs);
5228 for (auto& pg : pgs) {
5229 pg->with_heartbeat_peers([&](int peer) {
5230 if (get_osdmap()->is_up(peer)) {
5231 _add_heartbeat_peer(peer);
5232 }
5233 });
5234 }
5235 }
5236
5237 // include next and previous up osds to ensure we have a fully-connected set
5238 set<int> want, extras;
5239 const int next = get_osdmap()->get_next_up_osd_after(whoami);
5240 if (next >= 0)
5241 want.insert(next);
5242 int prev = get_osdmap()->get_previous_up_osd_before(whoami);
5243 if (prev >= 0 && prev != next)
5244 want.insert(prev);
5245
5246 // make sure we have at least **min_down** osds coming from different
5247 // subtree-level buckets (e.g., hosts) for fast failure detection.
5248 auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
5249 auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
5250 auto limit = std::max(min_down, (uint64_t)cct->_conf->osd_heartbeat_min_peers);
5251 get_osdmap()->get_random_up_osds_by_subtree(
5252 whoami, subtree, limit, want, &want);
5253
5254 for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
5255 dout(10) << " adding neighbor peer osd." << *p << dendl;
5256 extras.insert(*p);
5257 _add_heartbeat_peer(*p);
5258 }
5259
5260 // remove down peers; enumerate extras
5261 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5262 while (p != heartbeat_peers.end()) {
5263 if (!get_osdmap()->is_up(p->first)) {
5264 int o = p->first;
5265 ++p;
5266 _remove_heartbeat_peer(o);
5267 continue;
5268 }
5269 if (p->second.epoch < get_osdmap_epoch()) {
5270 extras.insert(p->first);
5271 }
5272 ++p;
5273 }
5274
5275 // too few?
5276 for (int n = next; n >= 0; ) {
5277 if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
5278 break;
5279 if (!extras.count(n) && !want.count(n) && n != whoami) {
5280 dout(10) << " adding random peer osd." << n << dendl;
5281 extras.insert(n);
5282 _add_heartbeat_peer(n);
5283 }
5284 n = get_osdmap()->get_next_up_osd_after(n);
5285 if (n == next)
5286 break; // came full circle; stop
5287 }
5288
5289 // too many?
5290 for (set<int>::iterator p = extras.begin();
5291 (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
5292 ++p) {
5293 if (want.count(*p))
5294 continue;
5295 _remove_heartbeat_peer(*p);
5296 }
5297
5298 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
5299
5300 // clean up stale failure pending
5301 for (auto it = failure_pending.begin(); it != failure_pending.end();) {
5302 if (heartbeat_peers.count(it->first) == 0) {
5303 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
5304 failure_pending.erase(it++);
5305 } else {
5306 it++;
5307 }
5308 }
5309 }
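// Editor's note: with up OSDs {1,4,7,9} and whoami == 4 (hypothetical), the
// "fully-connected" seeds above are next = 7 and prev = 1; the subtree
// sampling then tops the set up to at least
// max(mon_osd_min_down_reporters, osd_heartbeat_min_peers) peers drawn
// from distinct buckets at mon_osd_reporter_subtree_level (host by default).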
5310
5311 void OSD::reset_heartbeat_peers(bool all)
5312 {
5313 ceph_assert(ceph_mutex_is_locked(osd_lock));
5314 dout(10) << "reset_heartbeat_peers" << dendl;
5315 utime_t stale = ceph_clock_now();
5316 stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
5317 std::lock_guard l(heartbeat_lock);
5318 for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) {
5319 auto& [peer, hi] = *it;
5320 if (all || hi.is_stale(stale)) {
5321 hi.clear_mark_down();
5322 // stop sending failure_report to mon too
5323 failure_queue.erase(peer);
5324 failure_pending.erase(peer);
5325 it = heartbeat_peers.erase(it);
5326 } else {
5327 ++it;
5328 }
5329 }
5330 }
5331
5332 void OSD::handle_osd_ping(MOSDPing *m)
5333 {
5334 if (superblock.cluster_fsid != m->fsid) {
5335 dout(20) << "handle_osd_ping from " << m->get_source_inst()
5336 << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid
5337 << dendl;
5338 m->put();
5339 return;
5340 }
5341
5342 int from = m->get_source().num();
5343
5344 heartbeat_lock.lock();
5345 if (is_stopping()) {
5346 heartbeat_lock.unlock();
5347 m->put();
5348 return;
5349 }
5350
5351 utime_t now = ceph_clock_now();
5352 auto mnow = service.get_mnow();
5353 ConnectionRef con(m->get_connection());
5354 OSDMapRef curmap = service.get_osdmap();
5355 if (!curmap) {
5356 heartbeat_lock.unlock();
5357 m->put();
5358 return;
5359 }
5360
5361 auto sref = con->get_priv();
5362 Session *s = static_cast<Session*>(sref.get());
5363 if (!s) {
5364 heartbeat_lock.unlock();
5365 m->put();
5366 return;
5367 }
5368 if (!s->stamps) {
5369 s->peer = from;
5370 s->stamps = service.get_hb_stamps(from);
5371 }
5372
5373 switch (m->op) {
5374
5375 case MOSDPing::PING:
5376 {
5377 if (cct->_conf->osd_debug_drop_ping_probability > 0) {
5378 auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
5379 if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
5380 if (heartbeat_drop->second == 0) {
5381 debug_heartbeat_drops_remaining.erase(heartbeat_drop);
5382 } else {
5383 --heartbeat_drop->second;
5384 dout(5) << "Dropping heartbeat from " << from
5385 << ", " << heartbeat_drop->second
5386 << " remaining to drop" << dendl;
5387 break;
5388 }
5389 } else if (cct->_conf->osd_debug_drop_ping_probability >
5390 ((((double)(rand()%100))/100.0))) {
5391 heartbeat_drop =
5392 debug_heartbeat_drops_remaining.insert(std::make_pair(from,
5393 cct->_conf->osd_debug_drop_ping_duration)).first;
5394 dout(5) << "Dropping heartbeat from " << from
5395 << ", " << heartbeat_drop->second
5396 << " remaining to drop" << dendl;
5397 break;
5398 }
5399 }
5400
5401 ceph::signedspan sender_delta_ub{};
5402 s->stamps->got_ping(
5403 m->up_from,
5404 mnow,
5405 m->mono_send_stamp,
5406 m->delta_ub,
5407 &sender_delta_ub);
5408 dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
5409
5410 if (!cct->get_heartbeat_map()->is_healthy()) {
5411 dout(10) << "internal heartbeat not healthy, dropping ping request"
5412 << dendl;
5413 break;
5414 }
5415
5416 Message *r = new MOSDPing(monc->get_fsid(),
5417 curmap->get_epoch(),
5418 MOSDPing::PING_REPLY,
5419 m->ping_stamp,
5420 m->mono_ping_stamp,
5421 mnow,
5422 service.get_up_epoch(),
5423 cct->_conf->osd_heartbeat_min_size,
5424 sender_delta_ub);
5425 con->send_message(r);
5426
5427 if (curmap->is_up(from)) {
5428 if (is_active()) {
5429 ConnectionRef cluster_con = service.get_con_osd_cluster(
5430 from, curmap->get_epoch());
5431 if (cluster_con) {
5432 service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
5433 }
5434 }
5435 } else if (!curmap->exists(from) ||
5436 curmap->get_down_at(from) > m->map_epoch) {
5437 // tell them they have died
5438 Message *r = new MOSDPing(monc->get_fsid(),
5439 curmap->get_epoch(),
5440 MOSDPing::YOU_DIED,
5441 m->ping_stamp,
5442 m->mono_ping_stamp,
5443 mnow,
5444 service.get_up_epoch(),
5445 cct->_conf->osd_heartbeat_min_size);
5446 con->send_message(r);
5447 }
5448 }
5449 break;
5450
5451 case MOSDPing::PING_REPLY:
5452 {
5453 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
5454 if (i != heartbeat_peers.end()) {
5455 auto acked = i->second.ping_history.find(m->ping_stamp);
5456 if (acked != i->second.ping_history.end()) {
5457 int &unacknowledged = acked->second.second;
5458 if (con == i->second.con_back) {
5459 dout(25) << "handle_osd_ping got reply from osd." << from
5460 << " first_tx " << i->second.first_tx
5461 << " last_tx " << i->second.last_tx
5462 << " last_rx_back " << i->second.last_rx_back
5463 << " -> " << now
5464 << " last_rx_front " << i->second.last_rx_front
5465 << dendl;
5466 i->second.last_rx_back = now;
5467 ceph_assert(unacknowledged > 0);
5468 --unacknowledged;
5469 // if there is no front con, set both stamps.
5470 if (i->second.con_front == NULL) {
5471 i->second.last_rx_front = now;
5472 ceph_assert(unacknowledged > 0);
5473 --unacknowledged;
5474 }
5475 } else if (con == i->second.con_front) {
5476 dout(25) << "handle_osd_ping got reply from osd." << from
5477 << " first_tx " << i->second.first_tx
5478 << " last_tx " << i->second.last_tx
5479 << " last_rx_back " << i->second.last_rx_back
5480 << " last_rx_front " << i->second.last_rx_front
5481 << " -> " << now
5482 << dendl;
5483 i->second.last_rx_front = now;
5484 ceph_assert(unacknowledged > 0);
5485 --unacknowledged;
5486 }
5487
5488 if (unacknowledged == 0) {
5489 // succeeded in getting all replies
5490 dout(25) << "handle_osd_ping got all replies from osd." << from
5491 << " , erase pending ping(sent at " << m->ping_stamp << ")"
5492 << " and older pending ping(s)"
5493 << dendl;
5494
5495 #define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
5496 ++i->second.hb_average_count;
5497 uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->ping_stamp);
5498 i->second.hb_total_back += back_pingtime;
5499 if (back_pingtime < i->second.hb_min_back)
5500 i->second.hb_min_back = back_pingtime;
5501 if (back_pingtime > i->second.hb_max_back)
5502 i->second.hb_max_back = back_pingtime;
5503 uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->ping_stamp);
5504 i->second.hb_total_front += front_pingtime;
5505 if (front_pingtime < i->second.hb_min_front)
5506 i->second.hb_min_front = front_pingtime;
5507 if (front_pingtime > i->second.hb_max_front)
5508 i->second.hb_max_front = front_pingtime;
5509
5510 // hb_interval_start is set when the ping is queued, so it must be
5511 // valid here (the old defensive re-initialization was unreachable).
5512 ceph_assert(i->second.hb_interval_start != utime_t());
5513 int64_t hb_avg_time_period = 60;
5514 if (cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span")) {
5515 hb_avg_time_period = cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span");
5516 }
5517 if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) {
5518 uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count;
5519 uint32_t back_min = i->second.hb_min_back;
5520 uint32_t back_max = i->second.hb_max_back;
5521 uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count;
5522 uint32_t front_min = i->second.hb_min_front;
5523 uint32_t front_max = i->second.hb_max_front;
5524
5525 // Reset for new interval
5526 i->second.hb_average_count = 0;
5527 i->second.hb_interval_start = now;
5528 i->second.hb_total_back = i->second.hb_max_back = 0;
5529 i->second.hb_min_back = UINT_MAX;
5530 i->second.hb_total_front = i->second.hb_max_front = 0;
5531 i->second.hb_min_front = UINT_MAX;
5532
5533 // Record per-OSD, per-interface ping times
5534 // Based on osd_heartbeat_interval, ignoring that it is randomly shorter than this interval
5535 if (i->second.hb_back_pingtime.size() == 0) {
5536 ceph_assert(i->second.hb_front_pingtime.size() == 0);
5537 for (unsigned k = 0 ; k < hb_vector_size; ++k) {
5538 i->second.hb_back_pingtime.push_back(back_avg);
5539 i->second.hb_back_min.push_back(back_min);
5540 i->second.hb_back_max.push_back(back_max);
5541 i->second.hb_front_pingtime.push_back(front_avg);
5542 i->second.hb_front_min.push_back(front_min);
5543 i->second.hb_front_max.push_back(front_max);
5544 ++i->second.hb_index;
5545 }
5546 } else {
5547 int index = i->second.hb_index & (hb_vector_size - 1);
5548 i->second.hb_back_pingtime[index] = back_avg;
5549 i->second.hb_back_min[index] = back_min;
5550 i->second.hb_back_max[index] = back_max;
5551 i->second.hb_front_pingtime[index] = front_avg;
5552 i->second.hb_front_min[index] = front_min;
5553 i->second.hb_front_max[index] = front_max;
5554 ++i->second.hb_index;
5555 }
5556
5557 {
5558 std::lock_guard l(service.stat_lock);
5559 service.osd_stat.hb_pingtime[from].last_update = now.sec();
5560 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5561
5562 uint32_t total = 0;
5563 uint32_t min = UINT_MAX;
5564 uint32_t max = 0;
5565 uint32_t count = 0;
5566 uint32_t which = 0;
5567 uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
5568 for (int32_t k = size - 1 ; k >= 0; --k) {
5569 ++count;
5570 int index = (i->second.hb_index + k) % size;
5571 total += i->second.hb_back_pingtime[index];
5572 if (i->second.hb_back_min[index] < min)
5573 min = i->second.hb_back_min[index];
5574 if (i->second.hb_back_max[index] > max)
5575 max = i->second.hb_back_max[index];
5576 if (count == 1 || count == 5 || count == 15) {
5577 service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count;
5578 service.osd_stat.hb_pingtime[from].back_min[which] = min;
5579 service.osd_stat.hb_pingtime[from].back_max[which] = max;
5580 which++;
5581 if (count == 15)
5582 break;
5583 }
5584 }
5585
5586 if (i->second.con_front != NULL) {
5587 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5588
5589 total = 0;
5590 min = UINT_MAX;
5591 max = 0;
5592 count = 0;
5593 which = 0;
5594 for (int32_t k = size - 1 ; k >= 0; --k) {
5595 ++count;
5596 int index = (i->second.hb_index + k) % size;
5597 total += i->second.hb_front_pingtime[index];
5598 if (i->second.hb_front_min[index] < min)
5599 min = i->second.hb_front_min[index];
5600 if (i->second.hb_front_max[index] > max)
5601 max = i->second.hb_front_max[index];
5602 if (count == 1 || count == 5 || count == 15) {
5603 service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count;
5604 service.osd_stat.hb_pingtime[from].front_min[which] = min;
5605 service.osd_stat.hb_pingtime[from].front_max[which] = max;
5606 which++;
5607 if (count == 15)
5608 break;
5609 }
5610 }
5611 }
5612 }
5613 } else {
5614 std::lock_guard l(service.stat_lock);
5615 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5616 if (i->second.con_front != NULL)
5617 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5618 }
5619 i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
5620 }
5621
5622 if (i->second.is_healthy(now)) {
5623 // Cancel false reports
5624 auto failure_queue_entry = failure_queue.find(from);
5625 if (failure_queue_entry != failure_queue.end()) {
5626 dout(10) << "handle_osd_ping canceling queued "
5627 << "failure report for osd." << from << dendl;
5628 failure_queue.erase(failure_queue_entry);
5629 }
5630
5631 auto failure_pending_entry = failure_pending.find(from);
5632 if (failure_pending_entry != failure_pending.end()) {
5633 dout(10) << "handle_osd_ping canceling in-flight "
5634 << "failure report for osd." << from << dendl;
5635 send_still_alive(curmap->get_epoch(),
5636 from,
5637 failure_pending_entry->second.second);
5638 failure_pending.erase(failure_pending_entry);
5639 }
5640 }
5641 } else {
5642 // old replies, deprecated by newly sent pings.
5643 dout(10) << "handle_osd_ping no pending ping(sent at " << m->ping_stamp
5644 << ") is found, treat as covered by newly sent pings "
5645 << "and ignore"
5646 << dendl;
5647 }
5648 }
5649
5650 if (m->map_epoch &&
5651 curmap->is_up(from)) {
5652 if (is_active()) {
5653 ConnectionRef cluster_con = service.get_con_osd_cluster(
5654 from, curmap->get_epoch());
5655 if (cluster_con) {
5656 service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
5657 }
5658 }
5659 }
5660
5661 s->stamps->got_ping_reply(
5662 mnow,
5663 m->mono_send_stamp,
5664 m->delta_ub);
5665 dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
5666 }
5667 break;
5668
5669 case MOSDPing::YOU_DIED:
5670 dout(10) << "handle_osd_ping " << m->get_source_inst()
5671 << " says i am down in " << m->map_epoch << dendl;
5672 osdmap_subscribe(curmap->get_epoch()+1, false);
5673 break;
5674 }
5675
5676 heartbeat_lock.unlock();
5677 m->put();
5678 }
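
// Editor's note: illustrative sketch, not part of the OSD build; the
// example_* name is hypothetical. The ROUND_S_TO_USEC macro used above
// converts a seconds-as-double delta to the nearest whole microsecond:
static inline uint32_t example_round_s_to_usec(double sec)
{
  // scale seconds to microseconds, rounding half up like the macro
  return static_cast<uint32_t>(sec * 1000 * 1000 + 0.5);
}
// e.g. example_round_s_to_usec(0.0123456) == 12346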
5679
5680 void OSD::heartbeat_entry()
5681 {
5682 std::unique_lock l(heartbeat_lock);
5683 if (is_stopping())
5684 return;
5685 while (!heartbeat_stop) {
5686 heartbeat();
5687
5688 double wait;
5689 if (cct->_conf.get_val<bool>("debug_disable_randomized_ping")) {
5690 wait = (float)cct->_conf->osd_heartbeat_interval;
5691 } else {
5692 wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
5693 }
5694 auto w = ceph::make_timespan(wait);
5695 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
5696 heartbeat_cond.wait_for(l, w);
5697 if (is_stopping())
5698 return;
5699 dout(30) << "heartbeat_entry woke up" << dendl;
5700 }
5701 }
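
// Editor's note: illustrative sketch, not part of the OSD build; the
// example_* name is hypothetical. The randomized sleep above samples
// wait = 0.5 + (rand() % 10)/10 * osd_heartbeat_interval, i.e. anywhere
// from 0.5s up to 0.5s + 0.9x the configured interval:
static inline double example_heartbeat_wait(double interval_s, bool randomize)
{
  if (!randomize)
    return interval_s;  // debug_disable_randomized_ping behavior
  return 0.5 + (rand() % 10) / 10.0 * interval_s;
}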
5702
5703 void OSD::heartbeat_check()
5704 {
5705 ceph_assert(ceph_mutex_is_locked(heartbeat_lock));
5706 utime_t now = ceph_clock_now();
5707
5708 // check for incoming heartbeats (move me elsewhere?)
5709 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5710 p != heartbeat_peers.end();
5711 ++p) {
5712
5713 if (p->second.first_tx == utime_t()) {
5714 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
5715 << " yet, skipping" << dendl;
5716 continue;
5717 }
5718
5719 dout(25) << "heartbeat_check osd." << p->first
5720 << " first_tx " << p->second.first_tx
5721 << " last_tx " << p->second.last_tx
5722 << " last_rx_back " << p->second.last_rx_back
5723 << " last_rx_front " << p->second.last_rx_front
5724 << dendl;
5725 if (p->second.is_unhealthy(now)) {
5726 utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
5727 if (p->second.last_rx_back == utime_t() ||
5728 p->second.last_rx_front == utime_t()) {
5729 derr << "heartbeat_check: no reply from "
5730 << p->second.con_front->get_peer_addr().get_sockaddr()
5731 << " osd." << p->first
5732 << " ever on either front or back, first ping sent "
5733 << p->second.first_tx
5734 << " (oldest deadline " << oldest_deadline << ")"
5735 << dendl;
5736 // fail
5737 failure_queue[p->first] = p->second.first_tx;
5738 } else {
5739 derr << "heartbeat_check: no reply from "
5740 << p->second.con_front->get_peer_addr().get_sockaddr()
5741 << " osd." << p->first << " since back " << p->second.last_rx_back
5742 << " front " << p->second.last_rx_front
5743 << " (oldest deadline " << oldest_deadline << ")"
5744 << dendl;
5745 // fail
5746 failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
5747 }
5748 }
5749 }
5750 }
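
// Editor's note: illustrative sketch, not part of the OSD build; the
// example_* name is hypothetical, and plain doubles stand in for utime_t.
// When a peer is unhealthy, heartbeat_check records *since when* it has
// been silent: the first ping if it never replied on either interface,
// otherwise the older of the two last-reply stamps:
static inline double example_failed_since(double first_tx,
                                          double last_rx_back,
                                          double last_rx_front)
{
  if (last_rx_back == 0 || last_rx_front == 0)
    return first_tx;  // never heard back on at least one interface
  return std::min(last_rx_back, last_rx_front);  // silent since the older reply
}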
5751
5752 void OSD::heartbeat()
5753 {
5754 ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock));
5755 dout(30) << "heartbeat" << dendl;
5756
5757 // get CPU load avg
5758 double loadavgs[1];
5759 int hb_interval = cct->_conf->osd_heartbeat_interval;
5760 int n_samples = 86400;
5761 if (hb_interval > 1) {
5762 n_samples /= hb_interval;
5763 if (n_samples < 1)
5764 n_samples = 1;
5765 }
5766
5767 if (getloadavg(loadavgs, 1) == 1) {
5768 logger->set(l_osd_loadavg, 100 * loadavgs[0]);
5769 daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
5770 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
5771 }
5772
5773 dout(30) << "heartbeat checking stats" << dendl;
5774
5775 // refresh peer list and osd stats
5776 vector<int> hb_peers;
5777 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5778 p != heartbeat_peers.end();
5779 ++p)
5780 hb_peers.push_back(p->first);
5781
5782 auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
5783 dout(5) << __func__ << " " << new_stat << dendl;
5784 ceph_assert(new_stat.statfs.total);
5785
5786 float pratio;
5787 float ratio = service.compute_adjusted_ratio(new_stat, &pratio);
5788
5789 service.check_full_status(ratio, pratio);
5790
5791 utime_t now = ceph_clock_now();
5792 auto mnow = service.get_mnow();
5793 utime_t deadline = now;
5794 deadline += cct->_conf->osd_heartbeat_grace;
5795
5796 // send heartbeats
5797 for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
5798 i != heartbeat_peers.end();
5799 ++i) {
5800 int peer = i->first;
5801 Session *s = static_cast<Session*>(i->second.con_back->get_priv().get());
5802 if (!s) {
5803 dout(30) << "heartbeat osd." << peer << " has no open con" << dendl;
5804 continue;
5805 }
5806 dout(30) << "heartbeat sending ping to osd." << peer << dendl;
5807
5808 i->second.last_tx = now;
5809 if (i->second.first_tx == utime_t())
5810 i->second.first_tx = now;
5811 i->second.ping_history[now] = make_pair(deadline,
5812 HeartbeatInfo::HEARTBEAT_MAX_CONN);
5813 if (i->second.hb_interval_start == utime_t())
5814 i->second.hb_interval_start = now;
5815
5816 std::optional<ceph::signedspan> delta_ub;
5817 s->stamps->sent_ping(&delta_ub);
5818
5819 i->second.con_back->send_message(
5820 new MOSDPing(monc->get_fsid(),
5821 service.get_osdmap_epoch(),
5822 MOSDPing::PING,
5823 now,
5824 mnow,
5825 mnow,
5826 service.get_up_epoch(),
5827 cct->_conf->osd_heartbeat_min_size,
5828 delta_ub));
5829
5830 if (i->second.con_front)
5831 i->second.con_front->send_message(
5832 new MOSDPing(monc->get_fsid(),
5833 service.get_osdmap_epoch(),
5834 MOSDPing::PING,
5835 now,
5836 mnow,
5837 mnow,
5838 service.get_up_epoch(),
5839 cct->_conf->osd_heartbeat_min_size,
5840 delta_ub));
5841 }
5842
5843 logger->set(l_osd_hb_to, heartbeat_peers.size());
5844
5845 // hmm.. am i all alone?
5846 dout(30) << "heartbeat lonely?" << dendl;
5847 if (heartbeat_peers.empty()) {
5848 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
5849 last_mon_heartbeat = now;
5850 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
5851 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5852 }
5853 }
5854
5855 dout(30) << "heartbeat done" << dendl;
5856 }
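
// Editor's note: illustrative sketch, not part of the OSD build; the
// example_* name is hypothetical. daily_loadavg above is an exponential
// moving average sized so that roughly one day of heartbeat samples
// (86400 / osd_heartbeat_interval) carries the weight:
static inline double example_daily_loadavg(double prev_avg, double sample,
                                           int n_samples)
{
  // each new sample contributes 1/n_samples to the running average
  return (prev_avg * (n_samples - 1) + sample) / n_samples;
}
// e.g. example_daily_loadavg(1.0, 2.0, 4) == 1.25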
5857
5858 bool OSD::heartbeat_reset(Connection *con)
5859 {
5860 std::lock_guard l(heartbeat_lock);
5861 auto s = con->get_priv();
5862 dout(20) << __func__ << " con " << con << " s " << s.get() << dendl;
5863 con->set_priv(nullptr);
5864 if (s) {
5865 if (is_stopping()) {
5866 return true;
5867 }
5868 auto session = static_cast<Session*>(s.get());
5869 auto p = heartbeat_peers.find(session->peer);
5870 if (p != heartbeat_peers.end() &&
5871 (p->second.con_back == con ||
5872 p->second.con_front == con)) {
5873 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5874 << ", reopening" << dendl;
5875 p->second.clear_mark_down(con);
5876 pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
5877 if (newcon.first) {
5878 p->second.con_back = newcon.first.get();
5879 p->second.con_back->set_priv(s);
5880 if (newcon.second) {
5881 p->second.con_front = newcon.second.get();
5882 p->second.con_front->set_priv(s);
5883 }
5884 p->second.ping_history.clear();
5885 } else {
5886 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5887 << ", raced with osdmap update, closing out peer" << dendl;
5888 heartbeat_peers.erase(p);
5889 }
5890 } else {
5891 dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
5892 }
5893 }
5894 return true;
5895 }
5896
5897
5898
5899 // =========================================
5900
5901 void OSD::tick()
5902 {
5903 ceph_assert(ceph_mutex_is_locked(osd_lock));
5904 dout(10) << "tick" << dendl;
5905
5906 utime_t now = ceph_clock_now();
5907 // throw out any obsolete markdown log
5908 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
5909 while (!osd_markdown_log.empty() &&
5910 osd_markdown_log.front() + grace < now)
5911 osd_markdown_log.pop_front();
5912
5913 if (is_active() || is_waiting_for_healthy()) {
5914 maybe_update_heartbeat_peers();
5915 }
5916
5917 if (is_waiting_for_healthy()) {
5918 start_boot();
5919 }
5920
5921 if (is_waiting_for_healthy() || is_booting()) {
5922 std::lock_guard l(heartbeat_lock);
5923 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
5924 last_mon_heartbeat = now;
5925 dout(1) << __func__ << " checking mon for new map" << dendl;
5926 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5927 }
5928 }
5929
5930 do_waiters();
5931
5932 // scrub purged_snaps periodically, based on osd_scrub_min_interval
5933 {
5934 const utime_t last = superblock.last_purged_snaps_scrub;
5935 utime_t next = last;
5936 next += cct->_conf->osd_scrub_min_interval;
5937 std::mt19937 rng;
5938 // use a seed that is stable for each scrub interval, but varies
5939 // by OSD to avoid any herds.
5940 rng.seed(whoami + superblock.last_purged_snaps_scrub.sec());
5941 double r = (rng() % 1024) / 1024.0;  // floating-point division, not integer
5942 next +=
5943 cct->_conf->osd_scrub_min_interval *
5944 cct->_conf->osd_scrub_interval_randomize_ratio * r;
5945 if (next < ceph_clock_now()) {
5946 dout(20) << __func__ << " last_purged_snaps_scrub " << last
5947 << " next " << next << " ... now" << dendl;
5948 scrub_purged_snaps();
5949 } else {
5950 dout(20) << __func__ << " last_purged_snaps_scrub " << last
5951 << " next " << next << dendl;
5952 }
5953 }
5954
5955 tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
5956 }
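
// Editor's note: illustrative sketch, not part of the OSD build; the
// example_* name is hypothetical, and plain doubles stand in for utime_t.
// The purged_snaps scrub above is due once last + min_interval + jitter
// has passed, where the jitter is a deterministic per-OSD fraction of
// min_interval * randomize_ratio so that OSDs don't scrub in lock-step:
static inline bool example_purged_snaps_scrub_due(double last, double now,
                                                  double min_interval,
                                                  double randomize_ratio,
                                                  int whoami)
{
  std::mt19937 rng;
  rng.seed(whoami + static_cast<unsigned>(last));  // stable per interval & OSD
  double r = (rng() % 1024) / 1024.0;              // uniform-ish in [0, 1)
  return last + min_interval + min_interval * randomize_ratio * r < now;
}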
5957
5958 void OSD::tick_without_osd_lock()
5959 {
5960 ceph_assert(ceph_mutex_is_locked(tick_timer_lock));
5961 dout(10) << "tick_without_osd_lock" << dendl;
5962
5963 logger->set(l_osd_cached_crc, ceph::buffer::get_cached_crc());
5964 logger->set(l_osd_cached_crc_adjusted, ceph::buffer::get_cached_crc_adjusted());
5965 logger->set(l_osd_missed_crc, ceph::buffer::get_missed_crc());
5966
5967 // refresh osd stats
5968 struct store_statfs_t stbuf;
5969 osd_alert_list_t alerts;
5970 int r = store->statfs(&stbuf, &alerts);
5971 ceph_assert(r == 0);
5972 service.set_statfs(stbuf, alerts);
5973
5974 // osd_lock is not being held, which means the OSD state
5975 // might change when doing the monitor report
5976 if (is_active() || is_waiting_for_healthy()) {
5977 {
5978 std::lock_guard l{heartbeat_lock};
5979 heartbeat_check();
5980 }
5981 map_lock.lock_shared();
5982 std::lock_guard l(mon_report_lock);
5983
5984 // mon report?
5985 utime_t now = ceph_clock_now();
5986 if (service.need_fullness_update() ||
5987 now - last_mon_report > cct->_conf->osd_mon_report_interval) {
5988 last_mon_report = now;
5989 send_full_update();
5990 send_failures();
5991 }
5992 map_lock.unlock_shared();
5993
5994 epoch_t max_waiting_epoch = 0;
5995 for (auto s : shards) {
5996 max_waiting_epoch = std::max(max_waiting_epoch,
5997 s->get_max_waiting_epoch());
5998 }
5999 if (max_waiting_epoch > get_osdmap()->get_epoch()) {
6000 dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
6001 << ", requesting new map" << dendl;
6002 osdmap_subscribe(superblock.newest_map + 1, false);
6003 }
6004 }
6005
6006 if (is_active()) {
6007 if (!scrub_random_backoff()) {
6008 sched_scrub();
6009 }
6010 service.promote_throttle_recalibrate();
6011 resume_creating_pg();
6012 bool need_send_beacon = false;
6013 const auto now = ceph::coarse_mono_clock::now();
6014 {
6015 // borrow the lec lock to protect last_sent_beacon from changing
6016 std::lock_guard l{min_last_epoch_clean_lock};
6017 const auto elapsed = now - last_sent_beacon;
6018 if (std::chrono::duration_cast<std::chrono::seconds>(elapsed).count() >
6019 cct->_conf->osd_beacon_report_interval) {
6020 need_send_beacon = true;
6021 }
6022 }
6023 if (need_send_beacon) {
6024 send_beacon(now);
6025 }
6026 }
6027
6028 mgrc.update_daemon_health(get_health_metrics());
6029 service.kick_recovery_queue();
6030 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
6031 new C_Tick_WithoutOSDLock(this));
6032 }
6033
6034 // Usage:
6035 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
6036 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
6037 // setomapheader <pool-id> [namespace/]<obj-name> <header>
6038 // getomap <pool> [namespace/]<obj-name>
6039 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
6040 // injectmdataerr [namespace/]<obj-name> [shardid]
6041 // injectdataerr [namespace/]<obj-name> [shardid]
6042 //
6043 // set_recovery_delay [utime]
6044 void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
6045 std::string_view command,
6046 const cmdmap_t& cmdmap, ostream &ss)
6047 {
6048 // Test support: change the omap on a single OSD by using the admin
6049 // socket to directly request that the OSD make the change.
6051 if (command == "setomapval" || command == "rmomapkey" ||
6052 command == "setomapheader" || command == "getomap" ||
6053 command == "truncobj" || command == "injectmdataerr" ||
6054 command == "injectdataerr"
6055 ) {
6056 pg_t rawpg;
6057 int64_t pool;
6058 OSDMapRef curmap = service->get_osdmap();
6059 int r = -1;
6060
6061 string poolstr;
6062
6063 cmd_getval(cmdmap, "pool", poolstr);
6064 pool = curmap->lookup_pg_pool_name(poolstr);
6065 // If we can't find it by name, then maybe an id was specified
6066 if (pool < 0 && isdigit(poolstr[0]))
6067 pool = atoll(poolstr.c_str());
6068 if (pool < 0) {
6069 ss << "Invalid pool '" << poolstr << "''";
6070 return;
6071 }
6072
6073 string objname, nspace;
6074 cmd_getval(cmdmap, "objname", objname);
6075 std::size_t found = objname.find_first_of('/');
6076 if (found != string::npos) {
6077 nspace = objname.substr(0, found);
6078 objname = objname.substr(found+1);
6079 }
6080 object_locator_t oloc(pool, nspace);
6081 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
6082
6083 if (r < 0) {
6084 ss << "Invalid namespace/objname";
6085 return;
6086 }
6087
6088 int64_t shardid;
6089 cmd_getval(cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
6090 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
6091 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
6092 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
6093 if (curmap->pg_is_ec(rawpg)) {
6094 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
6095 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
6096 return;
6097 }
6098 }
6099
6100 ObjectStore::Transaction t;
6101
6102 if (command == "setomapval") {
6103 map<string, bufferlist> newattrs;
6104 bufferlist val;
6105 string key, valstr;
6106 cmd_getval(cmdmap, "key", key);
6107 cmd_getval(cmdmap, "val", valstr);
6108
6109 val.append(valstr);
6110 newattrs[key] = val;
6111 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
6112 r = store->queue_transaction(service->meta_ch, std::move(t));
6113 if (r < 0)
6114 ss << "error=" << r;
6115 else
6116 ss << "ok";
6117 } else if (command == "rmomapkey") {
6118 string key;
6119 cmd_getval(cmdmap, "key", key);
6120
6121 t.omap_rmkey(coll_t(pgid), ghobject_t(obj), key);
6122 r = store->queue_transaction(service->meta_ch, std::move(t));
6123 if (r < 0)
6124 ss << "error=" << r;
6125 else
6126 ss << "ok";
6127 } else if (command == "setomapheader") {
6128 bufferlist newheader;
6129 string headerstr;
6130
6131 cmd_getval(cmdmap, "header", headerstr);
6132 newheader.append(headerstr);
6133 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
6134 r = store->queue_transaction(service->meta_ch, std::move(t));
6135 if (r < 0)
6136 ss << "error=" << r;
6137 else
6138 ss << "ok";
6139 } else if (command == "getomap") {
6140 // Debug: output the entire omap
6141 bufferlist hdrbl;
6142 map<string, bufferlist> keyvals;
6143 auto ch = store->open_collection(coll_t(pgid));
6144 if (!ch) {
6145 ss << "unable to open collection for " << pgid;
6146 r = -ENOENT;
6147 } else {
6148 r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
6149 if (r >= 0) {
6150 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
6151 for (map<string, bufferlist>::iterator it = keyvals.begin();
6152 it != keyvals.end(); ++it)
6153 ss << " key=" << (*it).first << " val="
6154 << string((*it).second.c_str(), (*it).second.length());
6155 } else {
6156 ss << "error=" << r;
6157 }
6158 }
6159 } else if (command == "truncobj") {
6160 int64_t trunclen;
6161 cmd_getval(cmdmap, "len", trunclen);
6162 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
6163 r = store->queue_transaction(service->meta_ch, std::move(t));
6164 if (r < 0)
6165 ss << "error=" << r;
6166 else
6167 ss << "ok";
6168 } else if (command == "injectdataerr") {
6169 store->inject_data_error(gobj);
6170 ss << "ok";
6171 } else if (command == "injectmdataerr") {
6172 store->inject_mdata_error(gobj);
6173 ss << "ok";
6174 }
6175 return;
6176 }
6177 if (command == "set_recovery_delay") {
6178 int64_t delay;
6179 cmd_getval(cmdmap, "utime", delay, (int64_t)0);
6180 ostringstream oss;
6181 oss << delay;
6182 int r = service->cct->_conf.set_val("osd_recovery_delay_start",
6183 oss.str().c_str());
6184 if (r != 0) {
6185 ss << "set_recovery_delay: error setting "
6186 << "osd_recovery_delay_start to '" << delay << "': error "
6187 << r;
6188 return;
6189 }
6190 service->cct->_conf.apply_changes(nullptr);
6191 ss << "set_recovery_delay: set osd_recovery_delay_start "
6192 << "to " << service->cct->_conf->osd_recovery_delay_start;
6193 return;
6194 }
6195 if (command == "injectfull") {
6196 int64_t count;
6197 string type;
6198 OSDService::s_names state;
6199 cmd_getval(cmdmap, "type", type, string("full"));
6200 cmd_getval(cmdmap, "count", count, (int64_t)-1);
6201 if (type == "none" || count == 0) {
6202 type = "none";
6203 count = 0;
6204 }
6205 state = service->get_full_state(type);
6206 if (state == OSDService::s_names::INVALID) {
6207 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6208 return;
6209 }
6210 service->set_injectfull(state, count);
6211 return;
6212 }
6213 ss << "Internal error - command=" << command;
6214 }
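
// Editor's note: illustrative sketch, not part of the OSD build; the
// example_* name is hypothetical. test_ops accepts objects in the form
// "[namespace/]<obj-name>" and splits on the first '/': everything before
// it is the namespace, the remainder is the object name:
static inline std::pair<std::string, std::string>
example_split_ns_obj(const std::string& arg)
{
  auto slash = arg.find_first_of('/');
  if (slash == std::string::npos)
    return {"", arg};  // no namespace given
  return {arg.substr(0, slash), arg.substr(slash + 1)};
}
// example_split_ns_obj("ns1/obj") == {"ns1", "obj"}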
6215
6216 // =========================================
6217
6218 void OSD::ms_handle_connect(Connection *con)
6219 {
6220 dout(10) << __func__ << " con " << con << dendl;
6221 if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
6222 std::lock_guard l(osd_lock);
6223 if (is_stopping())
6224 return;
6225 dout(10) << __func__ << " on mon" << dendl;
6226
6227 if (is_preboot()) {
6228 start_boot();
6229 } else if (is_booting()) {
6230 _send_boot(); // resend boot message
6231 } else {
6232 map_lock.lock_shared();
6233 std::lock_guard l2(mon_report_lock);
6234
6235 utime_t now = ceph_clock_now();
6236 last_mon_report = now;
6237
6238 // resend everything, it's a new session
6239 send_full_update();
6240 send_alive();
6241 service.requeue_pg_temp();
6242 service.clear_sent_ready_to_merge();
6243 service.send_pg_temp();
6244 service.send_ready_to_merge();
6245 service.send_pg_created();
6246 requeue_failures();
6247 send_failures();
6248
6249 map_lock.unlock_shared();
6250 if (is_active()) {
6251 send_beacon(ceph::coarse_mono_clock::now());
6252 }
6253 }
6254
6255 // full map requests may happen while active or pre-boot
6256 if (requested_full_first) {
6257 rerequest_full_maps();
6258 }
6259 }
6260 }
6261
6262 void OSD::ms_handle_fast_connect(Connection *con)
6263 {
6264 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6265 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6266 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6267 s = ceph::make_ref<Session>(cct, con);
6268 con->set_priv(s);
6269 dout(10) << " new session (outgoing) " << s << " con=" << s->con
6270 << " addr=" << s->con->get_peer_addr() << dendl;
6271 // we don't connect to clients
6272 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6273 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6274 }
6275 }
6276 }
6277
6278 void OSD::ms_handle_fast_accept(Connection *con)
6279 {
6280 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6281 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6282 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6283 s = ceph::make_ref<Session>(cct, con);
6284 con->set_priv(s);
6285 dout(10) << "new session (incoming)" << s << " con=" << con
6286 << " addr=" << con->get_peer_addr()
6287 << " must have raced with connect" << dendl;
6288 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6289 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6290 }
6291 }
6292 }
6293
6294 bool OSD::ms_handle_reset(Connection *con)
6295 {
6296 auto session = ceph::ref_cast<Session>(con->get_priv());
6297 dout(2) << "ms_handle_reset con " << con << " session " << session.get() << dendl;
6298 if (!session)
6299 return false;
6300 session->wstate.reset(con);
6301 session->con->set_priv(nullptr);
6302 session->con.reset(); // break con <-> session ref cycle
6303 // note that we break session->con *before* the session_handle_reset
6304 // cleanup below. this avoids a race between us and
6305 // PG::add_backoff, Session::check_backoff, etc.
6306 session_handle_reset(session);
6307 return true;
6308 }
6309
6310 bool OSD::ms_handle_refused(Connection *con)
6311 {
6312 if (!cct->_conf->osd_fast_fail_on_connection_refused)
6313 return false;
6314
6315 auto session = ceph::ref_cast<Session>(con->get_priv());
6316 dout(2) << "ms_handle_refused con " << con << " session " << session.get() << dendl;
6317 if (!session)
6318 return false;
6319 int type = con->get_peer_type();
6320 // handle only OSD failures here
6321 if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
6322 OSDMapRef osdmap = get_osdmap();
6323 if (osdmap) {
6324 int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
6325 if (id >= 0 && osdmap->is_up(id)) {
6326 // This cheats the mon's heartbeat grace logic, since we know the peer
6327 // is not going to respawn on its own. +1 so we won't hit any boundary case.
6328 monc->send_mon_message(
6329 new MOSDFailure(
6330 monc->get_fsid(),
6331 id,
6332 osdmap->get_addrs(id),
6333 cct->_conf->osd_heartbeat_grace + 1,
6334 osdmap->get_epoch(),
6335 MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
6336 ));
6337 }
6338 }
6339 }
6340 return true;
6341 }
6342
6343 struct CB_OSD_GetVersion {
6344 OSD *osd;
6345 explicit CB_OSD_GetVersion(OSD *o) : osd(o) {}
6346 void operator ()(boost::system::error_code ec, version_t newest,
6347 version_t oldest) {
6348 if (!ec)
6349 osd->_got_mon_epochs(oldest, newest);
6350 }
6351 };
6352
6353 void OSD::start_boot()
6354 {
6355 if (!_is_healthy()) {
6356 // if we are not healthy, do not mark ourselves up (yet)
6357 dout(1) << "not healthy; waiting to boot" << dendl;
6358 if (!is_waiting_for_healthy())
6359 start_waiting_for_healthy();
6360 // send pings sooner rather than later
6361 heartbeat_kick();
6362 return;
6363 }
6364 dout(1) << __func__ << dendl;
6365 set_state(STATE_PREBOOT);
6366 dout(10) << "start_boot - have maps " << superblock.oldest_map
6367 << ".." << superblock.newest_map << dendl;
6368 monc->get_version("osdmap", CB_OSD_GetVersion(this));
6369 }
6370
6371 void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
6372 {
6373 std::lock_guard l(osd_lock);
6374 if (is_preboot()) {
6375 _preboot(oldest, newest);
6376 }
6377 }
6378
6379 void OSD::_preboot(epoch_t oldest, epoch_t newest)
6380 {
6381 ceph_assert(is_preboot());
6382 dout(10) << __func__ << " _preboot mon has osdmaps "
6383 << oldest << ".." << newest << dendl;
6384
6385 // ensure our local fullness awareness is accurate
6386 {
6387 std::lock_guard l(heartbeat_lock);
6388 heartbeat();
6389 }
6390
6391 const auto& monmap = monc->monmap;
6392 const auto osdmap = get_osdmap();
6393 // if our map is within recent history, try to add ourselves to the osdmap.
6394 if (osdmap->get_epoch() == 0) {
6395 derr << "waiting for initial osdmap" << dendl;
6396 } else if (osdmap->is_destroyed(whoami)) {
6397 derr << "osdmap says I am destroyed" << dendl;
6398 // provide a small margin so we don't livelock seeing if we
6399 // un-destroyed ourselves.
6400 if (osdmap->get_epoch() > newest - 1) {
6401 exit(0);
6402 }
6403 } else if (osdmap->is_noup(whoami)) {
6404 derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
6405 } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
6406 derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
6407 << dendl;
6408 } else if (service.need_fullness_update()) {
6409 derr << "osdmap fullness state needs update" << dendl;
6410 send_full_update();
6411 } else if (monmap.min_mon_release >= ceph_release_t::octopus &&
6412 superblock.purged_snaps_last < superblock.current_epoch) {
6413 dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
6414 << " < newest_map " << superblock.current_epoch << dendl;
6415 _get_purged_snaps();
6416 } else if (osdmap->get_epoch() >= oldest - 1 &&
6417 osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {
6418
6419 // wait for pgs to fully catch up in a different thread, since
6420 // this thread might be required for splitting and merging PGs to
6421 // make progress.
6422 boot_finisher.queue(
6423 new LambdaContext(
6424 [this](int r) {
6425 std::unique_lock l(osd_lock);
6426 if (is_preboot()) {
6427 dout(10) << __func__ << " waiting for peering work to drain"
6428 << dendl;
6429 l.unlock();
6430 for (auto shard : shards) {
6431 shard->wait_min_pg_epoch(get_osdmap_epoch());
6432 }
6433 l.lock();
6434 }
6435 if (is_preboot()) {
6436 _send_boot();
6437 }
6438 }));
6439 return;
6440 }
6441
6442 // get all the latest maps
6443 if (osdmap->get_epoch() + 1 >= oldest)
6444 osdmap_subscribe(osdmap->get_epoch() + 1, false);
6445 else
6446 osdmap_subscribe(oldest - 1, true);
6447 }
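
// Editor's note: illustrative sketch, not part of the OSD build; the
// example_* names are hypothetical. The catch-up logic above requests
// incrementals from our next epoch while it still falls inside the mon's
// retained range; if we are too far behind it re-subscribes from
// oldest - 1 (force_request = true), which yields a full map because the
// older incrementals have been trimmed:
struct example_map_sub { epoch_t start_epoch; bool force_request; };
static inline example_map_sub
example_choose_map_subscription(epoch_t our_epoch, epoch_t oldest)
{
  if (our_epoch + 1 >= oldest)
    return {our_epoch + 1, false};  // incrementals can bridge the gap
  return {oldest - 1, true};        // too far behind; start from a full map
}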
6448
6449 void OSD::_get_purged_snaps()
6450 {
6451 // NOTE: this is a naive, stateless implementation. It may send multiple
6452 // overlapping requests to the mon, which will be somewhat inefficient, but
6453 // it should be reliable.
6454 dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
6455 << ", newest_map " << superblock.current_epoch << dendl;
6456 MMonGetPurgedSnaps *m = new MMonGetPurgedSnaps(
6457 superblock.purged_snaps_last + 1,
6458 superblock.current_epoch + 1);
6459 monc->send_mon_message(m);
6460 }
6461
6462 void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m)
6463 {
6464 dout(10) << __func__ << " " << *m << dendl;
6465 ObjectStore::Transaction t;
6466 if (!is_preboot() ||
6467 m->last < superblock.purged_snaps_last) {
6468 goto out;
6469 }
6470 SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
6471 make_purged_snaps_oid(), &t,
6472 m->purged_snaps);
6473 superblock.purged_snaps_last = m->last;
6474 write_superblock(t);
6475 store->queue_transaction(
6476 service.meta_ch,
6477 std::move(t));
6478 service.publish_superblock(superblock);
6479 if (m->last < superblock.current_epoch) {
6480 _get_purged_snaps();
6481 } else {
6482 start_boot();
6483 }
6484 out:
6485 m->put();
6486 }
6487
6488 void OSD::send_full_update()
6489 {
6490 if (!service.need_fullness_update())
6491 return;
6492 unsigned state = 0;
6493 if (service.is_full()) {
6494 state = CEPH_OSD_FULL;
6495 } else if (service.is_backfillfull()) {
6496 state = CEPH_OSD_BACKFILLFULL;
6497 } else if (service.is_nearfull()) {
6498 state = CEPH_OSD_NEARFULL;
6499 }
6500 set<string> s;
6501 OSDMap::calc_state_set(state, s);
6502 dout(10) << __func__ << " want state " << s << dendl;
6503 monc->send_mon_message(new MOSDFull(get_osdmap_epoch(), state));
6504 }
6505
6506 void OSD::start_waiting_for_healthy()
6507 {
6508 dout(1) << "start_waiting_for_healthy" << dendl;
6509 set_state(STATE_WAITING_FOR_HEALTHY);
6510 last_heartbeat_resample = utime_t();
6511
6512 // subscribe to osdmap updates, in case our peers really are known to be dead
6513 osdmap_subscribe(get_osdmap_epoch() + 1, false);
6514 }
6515
6516 bool OSD::_is_healthy()
6517 {
6518 if (!cct->get_heartbeat_map()->is_healthy()) {
6519 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
6520 return false;
6521 }
6522
6523 if (is_waiting_for_healthy()) {
6524 utime_t now = ceph_clock_now();
6525 if (osd_markdown_log.empty()) {
6526 dout(5) << __func__ << " force returning true since the last markdown"
6527 << " was more than " << cct->_conf->osd_max_markdown_period
6528 << "s ago" << dendl;
6529 return true;
6530 }
6531 std::lock_guard l(heartbeat_lock);
6532 int num = 0, up = 0;
6533 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
6534 p != heartbeat_peers.end();
6535 ++p) {
6536 if (p->second.is_healthy(now))
6537 ++up;
6538 ++num;
6539 }
6540 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
6541 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
6542 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
6543 return false;
6544 }
6545 }
6546
6547 return true;
6548 }
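
// Editor's note: illustrative sketch, not part of the OSD build; the
// example_* name is hypothetical. While waiting_for_healthy, the check
// above refuses to boot unless at least osd_heartbeat_min_healthy_ratio
// of the heartbeat peers look healthy:
static inline bool example_enough_healthy_peers(int up, int num,
                                                float min_healthy_ratio)
{
  // with no peers the test passes vacuously (0 >= 0)
  return (float)up >= (float)num * min_healthy_ratio;
}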
6549
6550 void OSD::_send_boot()
6551 {
6552 dout(10) << "_send_boot" << dendl;
6553 Connection *local_connection =
6554 cluster_messenger->get_loopback_connection().get();
6555 entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
6556 entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
6557 entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
6558 entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();
6559
6560 dout(20) << " initial client_addrs " << client_addrs
6561 << ", cluster_addrs " << cluster_addrs
6562 << ", hb_back_addrs " << hb_back_addrs
6563 << ", hb_front_addrs " << hb_front_addrs
6564 << dendl;
6565 if (cluster_messenger->set_addr_unknowns(client_addrs)) {
6566 dout(10) << " assuming cluster_addrs match client_addrs "
6567 << client_addrs << dendl;
6568 cluster_addrs = cluster_messenger->get_myaddrs();
6569 }
6570 if (auto session = local_connection->get_priv(); !session) {
6571 cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
6572 }
6573
6574 local_connection = hb_back_server_messenger->get_loopback_connection().get();
6575 if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
6576 dout(10) << " assuming hb_back_addrs match cluster_addrs "
6577 << cluster_addrs << dendl;
6578 hb_back_addrs = hb_back_server_messenger->get_myaddrs();
6579 }
6580 if (auto session = local_connection->get_priv(); !session) {
6581 hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
6582 }
6583
6584 local_connection = hb_front_server_messenger->get_loopback_connection().get();
6585 if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
6586 dout(10) << " assuming hb_front_addrs match client_addrs "
6587 << client_addrs << dendl;
6588 hb_front_addrs = hb_front_server_messenger->get_myaddrs();
6589 }
6590 if (auto session = local_connection->get_priv(); !session) {
6591 hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
6592 }
6593
6594 // we now know what our front and back addrs will be, and we are
6595 // about to tell the mon our metadata (including numa bindings),
6596 // so now is a good time!
6597 set_numa_affinity();
6598
6599 MOSDBoot *mboot = new MOSDBoot(
6600 superblock, get_osdmap_epoch(), service.get_boot_epoch(),
6601 hb_back_addrs, hb_front_addrs, cluster_addrs,
6602 CEPH_FEATURES_ALL);
6603 dout(10) << " final client_addrs " << client_addrs
6604 << ", cluster_addrs " << cluster_addrs
6605 << ", hb_back_addrs " << hb_back_addrs
6606 << ", hb_front_addrs " << hb_front_addrs
6607 << dendl;
6608 _collect_metadata(&mboot->metadata);
6609 monc->send_mon_message(mboot);
6610 set_state(STATE_BOOTING);
6611 }
6612
6613 void OSD::_collect_metadata(map<string,string> *pm)
6614 {
6615 // config info
6616 (*pm)["osd_data"] = dev_path;
6617 if (store->get_type() == "filestore") {
6618 // not applicable for bluestore
6619 (*pm)["osd_journal"] = journal_path;
6620 }
6621 (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
6622 (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
6623 (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
6624 (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());
6625
6626 // backend
6627 (*pm)["osd_objectstore"] = store->get_type();
6628 (*pm)["rotational"] = store_is_rotational ? "1" : "0";
6629 (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
6630 (*pm)["default_device_class"] = store->get_default_device_class();
6631 string osdspec_affinity;
6632 int r = store->read_meta("osdspec_affinity", &osdspec_affinity);
6633 if (r < 0 || osdspec_affinity.empty()) {
6634 osdspec_affinity = "";
6635 }
6636 (*pm)["osdspec_affinity"] = osdspec_affinity;
6637 store->collect_metadata(pm);
6638
6639 collect_sys_info(pm, cct);
6640
6641 (*pm)["front_iface"] = pick_iface(
6642 cct,
6643 client_messenger->get_myaddrs().front().get_sockaddr_storage());
6644 (*pm)["back_iface"] = pick_iface(
6645 cct,
6646 cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
6647
6648 // network numa
6649 {
6650 int node = -1;
6651 set<int> nodes;
6652 set<string> unknown;
6653 for (auto nm : { "front_iface", "back_iface" }) {
6654 if (!(*pm)[nm].size()) {
6655 unknown.insert(nm);
6656 continue;
6657 }
6658 int n = -1;
6659 int r = get_iface_numa_node((*pm)[nm], &n);
6660 if (r < 0) {
6661 unknown.insert((*pm)[nm]);
6662 continue;
6663 }
6664 nodes.insert(n);
6665 if (node < 0) {
6666 node = n;
6667 }
6668 }
6669 if (unknown.size()) {
6670 (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
6671 }
6672 if (!nodes.empty()) {
6673 (*pm)["network_numa_nodes"] = stringify(nodes);
6674 }
6675 if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
6676 (*pm)["network_numa_node"] = stringify(node);
6677 }
6678 }
6679
6680 if (numa_node >= 0) {
6681 (*pm)["numa_node"] = stringify(numa_node);
6682 (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
6683 &numa_cpu_set);
6684 }
6685
6686 set<string> devnames;
6687 store->get_devices(&devnames);
6688 map<string,string> errs;
6689 get_device_metadata(devnames, pm, &errs);
6690 for (auto& i : errs) {
6691 dout(1) << __func__ << " " << i.first << ": " << i.second << dendl;
6692 }
6693 dout(10) << __func__ << " " << *pm << dendl;
6694 }
6695
6696 void OSD::queue_want_up_thru(epoch_t want)
6697 {
6698 std::shared_lock map_locker{map_lock};
6699 epoch_t cur = get_osdmap()->get_up_thru(whoami);
6700 std::lock_guard report_locker(mon_report_lock);
6701 if (want > up_thru_wanted) {
6702 dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
6703 << ", currently " << cur
6704 << dendl;
6705 up_thru_wanted = want;
6706 send_alive();
6707 } else {
6708 dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
6709 << ", currently " << cur
6710 << dendl;
6711 }
6712 }
6713
6714 void OSD::send_alive()
6715 {
6716 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
6717 const auto osdmap = get_osdmap();
6718 if (!osdmap->exists(whoami))
6719 return;
6720 epoch_t up_thru = osdmap->get_up_thru(whoami);
6721 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6722 if (up_thru_wanted > up_thru) {
6723 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6724 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6725 }
6726 }
6727
6728 void OSD::request_full_map(epoch_t first, epoch_t last)
6729 {
6730 dout(10) << __func__ << " " << first << ".." << last
6731 << ", previously requested "
6732 << requested_full_first << ".." << requested_full_last << dendl;
6733 ceph_assert(ceph_mutex_is_locked(osd_lock));
6734 ceph_assert(first > 0 && last > 0);
6735 ceph_assert(first <= last);
6736 ceph_assert(first >= requested_full_first); // we shouldn't ever ask for older maps
6737 if (requested_full_first == 0) {
6738 // first request
6739 requested_full_first = first;
6740 requested_full_last = last;
6741 } else if (last <= requested_full_last) {
6742 // dup
6743 return;
6744 } else {
6745 // additional request
6746 first = requested_full_last + 1;
6747 requested_full_last = last;
6748 }
6749 MMonGetOSDMap *req = new MMonGetOSDMap;
6750 req->request_full(first, last);
6751 monc->send_mon_message(req);
6752 }
6753
6754 void OSD::got_full_map(epoch_t e)
6755 {
6756 ceph_assert(requested_full_first <= requested_full_last);
6757 ceph_assert(ceph_mutex_is_locked(osd_lock));
6758 if (requested_full_first == 0) {
6759 dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
6760 return;
6761 }
6762 if (e < requested_full_first) {
6763 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6764 << ".." << requested_full_last
6765 << ", ignoring" << dendl;
6766 return;
6767 }
6768 if (e >= requested_full_last) {
6769 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6770 << ".." << requested_full_last << ", resetting" << dendl;
6771 requested_full_first = requested_full_last = 0;
6772 return;
6773 }
6774
6775 requested_full_first = e + 1;
6776
6777 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6778 << ".." << requested_full_last
6779 << ", still need more" << dendl;
6780 }
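
// Editor's note: illustrative sketch, not part of the OSD build; the
// example_* name is hypothetical. request_full_map/got_full_map maintain a
// single in-flight window [requested_full_first, requested_full_last]; each
// arriving full map advances the left edge, and the window resets to (0, 0)
// once the right edge is reached:
static inline void example_advance_full_window(epoch_t got,
                                               epoch_t& first, epoch_t& last)
{
  if (first == 0 || got < first)
    return;            // nothing requested, or a stale epoch: ignore
  if (got >= last) {
    first = last = 0;  // window satisfied; reset
  } else {
    first = got + 1;   // still waiting for (got, last]
  }
}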
6781
6782 void OSD::requeue_failures()
6783 {
6784 std::lock_guard l(heartbeat_lock);
6785 unsigned old_queue = failure_queue.size();
6786 unsigned old_pending = failure_pending.size();
6787 for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
6788 failure_queue[p->first] = p->second.first;
6789 failure_pending.erase(p++);
6790 }
6791 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6792 << failure_queue.size() << dendl;
6793 }
6794
6795 void OSD::send_failures()
6796 {
6797 ceph_assert(ceph_mutex_is_locked(map_lock));
6798 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
6799 std::lock_guard l(heartbeat_lock);
6800 utime_t now = ceph_clock_now();
6801 const auto osdmap = get_osdmap();
6802 while (!failure_queue.empty()) {
6803 int osd = failure_queue.begin()->first;
6804 if (!failure_pending.count(osd)) {
6805 int failed_for = (int)(double)(now - failure_queue.begin()->second);
6806 monc->send_mon_message(
6807 new MOSDFailure(
6808 monc->get_fsid(),
6809 osd,
6810 osdmap->get_addrs(osd),
6811 failed_for,
6812 osdmap->get_epoch()));
6813 failure_pending[osd] = make_pair(failure_queue.begin()->second,
6814 osdmap->get_addrs(osd));
6815 }
6816 failure_queue.erase(osd);
6817 }
6818 }
6819
6820 void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
6821 {
6822 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
6823 MOSDFailure::FLAG_ALIVE);
6824 monc->send_mon_message(m);
6825 }
6826
6827 void OSD::cancel_pending_failures()
6828 {
6829 std::lock_guard l(heartbeat_lock);
6830 auto it = failure_pending.begin();
6831 while (it != failure_pending.end()) {
6832 dout(10) << __func__ << " canceling in-flight failure report for osd."
6833 << it->first << dendl;
6834 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
6835 failure_pending.erase(it++);
6836 }
6837 }
6838
6839 void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
6840 {
6841 const auto& monmap = monc->monmap;
6842 // we may be asked to send a beacon just after connecting, before the
6843 // monmap has been initialized; only send once we know the mons support it.
6844 if (monmap.epoch > 0 &&
6845 monmap.get_required_features().contains_all(
6846 ceph::features::mon::FEATURE_LUMINOUS)) {
6847 dout(20) << __func__ << " sending" << dendl;
6848 MOSDBeacon* beacon = nullptr;
6849 {
6850 std::lock_guard l{min_last_epoch_clean_lock};
6851 beacon = new MOSDBeacon(get_osdmap_epoch(),
6852 min_last_epoch_clean,
6853 superblock.last_purged_snaps_scrub,
6854 cct->_conf->osd_beacon_report_interval);
6855 beacon->pgs = min_last_epoch_clean_pgs;
6856 last_sent_beacon = now;
6857 }
6858 monc->send_mon_message(beacon);
6859 } else {
6860 dout(20) << __func__ << " not sending" << dendl;
6861 }
6862 }
6863
6864 void OSD::handle_command(MCommand *m)
6865 {
6866 ConnectionRef con = m->get_connection();
6867 auto session = ceph::ref_cast<Session>(con->get_priv());
6868 if (!session) {
6869 con->send_message(new MCommandReply(m, -EACCES));
6870 m->put();
6871 return;
6872 }
6873 if (!session->caps.allow_all()) {
6874 con->send_message(new MCommandReply(m, -EACCES));
6875 m->put();
6876 return;
6877 }
6878 cct->get_admin_socket()->queue_tell_command(m);
6879 m->put();
6880 }
6881
6882 namespace {
6883 class unlock_guard {
6884 ceph::mutex& m;
6885 public:
6886 explicit unlock_guard(ceph::mutex& mutex)
6887 : m(mutex)
6888 {
6889 m.unlock();
6890 }
6891 unlock_guard(unlock_guard&) = delete;
6892 ~unlock_guard() {
6893 m.lock();
6894 }
6895 };
6896 }
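
// Editor's note: illustrative sketch, not part of the OSD build; the
// example_* name is hypothetical. unlock_guard is the inverse of
// std::lock_guard: it *releases* an already-held mutex for the enclosing
// scope and re-acquires it on destruction, much as scrub_purged_snaps
// below does by hand with osd_lock:
static inline void example_drop_lock_for_scope(ceph::mutex& m)
{
  std::lock_guard l{m};
  // ... work that requires m ...
  {
    unlock_guard u{m};  // m released for this scope
    // ... call out to code that must not hold m ...
  }                     // m re-acquired here
}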
6897
6898 void OSD::scrub_purged_snaps()
6899 {
6900 dout(10) << __func__ << dendl;
6901 ceph_assert(ceph_mutex_is_locked(osd_lock));
6902 SnapMapper::Scrubber s(cct, store, service.meta_ch,
6903 make_snapmapper_oid(),
6904 make_purged_snaps_oid());
6905 clog->debug() << "purged_snaps scrub starts";
6906 osd_lock.unlock();
6907 s.run();
6908 if (s.stray.size()) {
6909 clog->debug() << "purged_snaps scrub found " << s.stray.size() << " strays";
6910 } else {
6911 clog->debug() << "purged_snaps scrub ok";
6912 }
6913 set<pair<spg_t,snapid_t>> queued;
6914 for (auto& [pool, snap, hash, shard] : s.stray) {
6915 const pg_pool_t *pi = get_osdmap()->get_pg_pool(pool);
6916 if (!pi) {
6917 dout(20) << __func__ << " pool " << pool << " dne" << dendl;
6918 continue;
6919 }
6920 pg_t pgid(pi->raw_hash_to_pg(hash), pool);
6921 spg_t spgid(pgid, shard);
6922 pair<spg_t,snapid_t> p(spgid, snap);
6923 if (queued.count(p)) {
6924 dout(20) << __func__ << " pg " << spgid << " snap " << snap
6925 << " already queued" << dendl;
6926 continue;
6927 }
6928 PGRef pg = lookup_lock_pg(spgid);
6929 if (!pg) {
6930 dout(20) << __func__ << " pg " << spgid << " not found" << dendl;
6931 continue;
6932 }
6933 queued.insert(p);
6934 dout(10) << __func__ << " requeue pg " << spgid << " " << pg << " snap "
6935 << snap << dendl;
6936 pg->queue_snap_retrim(snap);
6937 pg->unlock();
6938 }
6939 osd_lock.lock();
6940 if (is_stopping()) {
6941 return;
6942 }
6943 dout(10) << __func__ << " done queueing pgs, updating superblock" << dendl;
6944 ObjectStore::Transaction t;
6945 superblock.last_purged_snaps_scrub = ceph_clock_now();
6946 write_superblock(t);
6947 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
6948 ceph_assert(tr == 0);
6949 if (is_active()) {
6950 send_beacon(ceph::coarse_mono_clock::now());
6951 }
6952 dout(10) << __func__ << " done" << dendl;
6953 }
6954
6955 void OSD::probe_smart(const string& only_devid, ostream& ss)
6956 {
6957 set<string> devnames;
6958 store->get_devices(&devnames);
6959 uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
6960 "osd_smart_report_timeout");
6961
6962 // == typedef std::map<std::string, mValue> mObject;
6963 json_spirit::mObject json_map;
6964
6965 for (auto dev : devnames) {
6966 // smartctl works only on physical devices; filter out any logical device
6967 if (dev.find("dm-") == 0) {
6968 continue;
6969 }
6970
6971 string err;
6972 string devid = get_device_id(dev, &err);
6973 if (devid.size() == 0) {
6974 dout(10) << __func__ << " no unique id for dev " << dev << " ("
6975 << err << "), skipping" << dendl;
6976 continue;
6977 }
6978 if (only_devid.size() && devid != only_devid) {
6979 continue;
6980 }
6981
6982 json_spirit::mValue smart_json;
6983 if (block_device_get_metrics(dev, smart_timeout,
6984 &smart_json)) {
6985 dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
6986 continue;
6987 }
6988 json_map[devid] = smart_json;
6989 }
6990 json_spirit::write(json_map, ss, json_spirit::pretty_print);
6991 }
6992
6993 bool OSD::heartbeat_dispatch(Message *m)
6994 {
6995 dout(30) << "heartbeat_dispatch " << m << dendl;
6996 switch (m->get_type()) {
6997
6998 case CEPH_MSG_PING:
6999 dout(10) << "ping from " << m->get_source_inst() << dendl;
7000 m->put();
7001 break;
7002
7003 case MSG_OSD_PING:
7004 handle_osd_ping(static_cast<MOSDPing*>(m));
7005 break;
7006
7007 default:
7008 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
7009 m->put();
7010 }
7011
7012 return true;
7013 }
7014
7015 bool OSD::ms_dispatch(Message *m)
7016 {
7017 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
7018 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
7019 service.got_stop_ack();
7020 m->put();
7021 return true;
7022 }
7023
7024 // lock!
7025
7026 osd_lock.lock();
7027 if (is_stopping()) {
7028 osd_lock.unlock();
7029 m->put();
7030 return true;
7031 }
7032
7033 do_waiters();
7034 _dispatch(m);
7035
7036 osd_lock.unlock();
7037
7038 return true;
7039 }
7040
7041 void OSDService::maybe_share_map(
7042 Connection *con,
7043 const OSDMapRef& osdmap,
7044 epoch_t peer_epoch_lb)
7045 {
7046 // NOTE: we assume the caller holds something that keeps the Connection itself
7047 // pinned (e.g., an OpRequest's MessageRef).
7048 auto session = ceph::ref_cast<Session>(con->get_priv());
7049 if (!session) {
7050 return;
7051 }
7052
7053 // assume the peer has the newer of the op's sent_epoch and what
7054 // we think we sent them.
7055 session->sent_epoch_lock.lock();
7056 if (peer_epoch_lb > session->last_sent_epoch) {
7057 dout(10) << __func__ << " con " << con
7058 << " " << con->get_peer_addr()
7059 << " map epoch " << session->last_sent_epoch
7060 << " -> " << peer_epoch_lb << " (as per caller)" << dendl;
7061 session->last_sent_epoch = peer_epoch_lb;
7062 }
7063 epoch_t last_sent_epoch = session->last_sent_epoch;
7064 session->sent_epoch_lock.unlock();
7065
7066 if (osdmap->get_epoch() <= last_sent_epoch) {
7067 return;
7068 }
7069
7070 send_incremental_map(last_sent_epoch, con, osdmap);
7071 last_sent_epoch = osdmap->get_epoch();
7072
7073 session->sent_epoch_lock.lock();
7074 if (session->last_sent_epoch < last_sent_epoch) {
7075 dout(10) << __func__ << " con " << con
7076 << " " << con->get_peer_addr()
7077 << " map epoch " << session->last_sent_epoch
7078 << " -> " << last_sent_epoch << " (shared)" << dendl;
7079 session->last_sent_epoch = last_sent_epoch;
7080 }
7081 session->sent_epoch_lock.unlock();
7082 }
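
// Editor's note: illustrative sketch, not part of the OSD build; the
// example_* name is hypothetical. maybe_share_map keeps a per-session
// high-water mark (last_sent_epoch) under a small lock, drops the lock for
// the expensive send, then raises the mark only if no racer already did:
static inline void example_raise_watermark(ceph::mutex& lock, epoch_t& mark,
                                           epoch_t candidate)
{
  std::lock_guard l{lock};
  if (mark < candidate)
    mark = candidate;  // only ever move forward
}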
7083
7084 void OSD::dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRef osdmap)
7085 {
7086 ceph_assert(ceph_mutex_is_locked(session->session_dispatch_lock));
7087
7088 auto i = session->waiting_on_map.begin();
7089 while (i != session->waiting_on_map.end()) {
7090 OpRequestRef op = &(*i);
7091 ceph_assert(ms_can_fast_dispatch(op->get_req()));
7092 auto m = op->get_req<MOSDFastDispatchOp>();
7093 if (m->get_min_epoch() > osdmap->get_epoch()) {
7094 break;
7095 }
7096 session->waiting_on_map.erase(i++);
7097 op->put();
7098
7099 spg_t pgid;
7100 if (m->get_type() == CEPH_MSG_OSD_OP) {
7101 pg_t actual_pgid = osdmap->raw_pg_to_pg(
7102 static_cast<const MOSDOp*>(m)->get_pg());
7103 if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
7104 continue;
7105 }
7106 } else {
7107 pgid = m->get_spg();
7108 }
7109 enqueue_op(pgid, std::move(op), m->get_map_epoch());
7110 }
7111
7112 if (session->waiting_on_map.empty()) {
7113 clear_session_waiting_on_map(session);
7114 } else {
7115 register_session_waiting_on_map(session);
7116 }
7117 }
7118
7119 void OSD::ms_fast_dispatch(Message *m)
7120 {
7121
7122 #ifdef HAVE_JAEGER
7123 jaeger_tracing::init_tracer("osd-services-reinit");
7124 dout(10) << "jaeger tracer after " << opentracing::Tracer::Global() << dendl;
7125 auto dispatch_span = jaeger_tracing::new_span(__func__);
7126 #endif
7127 FUNCTRACE(cct);
7128 if (service.is_stopping()) {
7129 m->put();
7130 return;
7131 }
7132
7133 // peering event?
7134 switch (m->get_type()) {
7135 case CEPH_MSG_PING:
7136 dout(10) << "ping from " << m->get_source() << dendl;
7137 m->put();
7138 return;
7139 case MSG_OSD_FORCE_RECOVERY:
7140 handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
7141 return;
7142 case MSG_OSD_SCRUB2:
7143 handle_fast_scrub(static_cast<MOSDScrub2*>(m));
7144 return;
7145
7146 case MSG_OSD_PG_CREATE2:
7147 return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
7148 case MSG_OSD_PG_QUERY:
7149 return handle_fast_pg_query(static_cast<MOSDPGQuery*>(m));
7150 case MSG_OSD_PG_NOTIFY:
7151 return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
7152 case MSG_OSD_PG_INFO:
7153 return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
7154 case MSG_OSD_PG_REMOVE:
7155 return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));
7156
7157 // these are single-pg messages that handle themselves
7158 case MSG_OSD_PG_LOG:
7159 case MSG_OSD_PG_TRIM:
7160 case MSG_OSD_PG_NOTIFY2:
7161 case MSG_OSD_PG_QUERY2:
7162 case MSG_OSD_PG_INFO2:
7163 case MSG_OSD_BACKFILL_RESERVE:
7164 case MSG_OSD_RECOVERY_RESERVE:
7165 case MSG_OSD_PG_LEASE:
7166 case MSG_OSD_PG_LEASE_ACK:
7167 {
7168 MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
7169 if (require_osd_peer(pm)) {
7170 enqueue_peering_evt(
7171 pm->get_spg(),
7172 PGPeeringEventRef(pm->get_event()));
7173 }
7174 pm->put();
7175 return;
7176 }
7177 }
7178
7179 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7180 {
7181 #ifdef WITH_LTTNG
7182 osd_reqid_t reqid = op->get_reqid();
7183 #endif
7184 tracepoint(osd, ms_fast_dispatch, reqid.name._type,
7185 reqid.name._num, reqid.tid, reqid.inc);
7186 }
7187 #ifdef HAVE_JAEGER
7188 op->set_osd_parent_span(dispatch_span);
7189 if (op->osd_parent_span) {
7190 auto op_req_span = jaeger_tracing::child_span("op-request-created", op->osd_parent_span);
7191 op->set_osd_parent_span(op_req_span);
7192 }
7193 #endif
7194 if (m->trace)
7195 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7196
7197 // note sender epoch, min req's epoch
7198 op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
7199 op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
7200 ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!
7201
7202 service.maybe_inject_dispatch_delay();
7203
7204 if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
7205 m->get_type() != CEPH_MSG_OSD_OP) {
7206 // queue it directly
7207 enqueue_op(
7208 static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
7209 std::move(op),
7210 static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
7211 } else {
7212 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7213 // message that didn't have an explicit spg_t); we need to map
7214 // it to an spg_t while preserving delivery order.
7215 auto priv = m->get_connection()->get_priv();
7216 if (auto session = static_cast<Session*>(priv.get()); session) {
7217 std::lock_guard l{session->session_dispatch_lock};
7218 op->get();
7219 session->waiting_on_map.push_back(*op);
7220 OSDMapRef nextmap = service.get_nextmap_reserved();
7221 dispatch_session_waiting(session, nextmap);
7222 service.release_map(nextmap);
7223 }
7224 }
7225 OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
7226 }
7227
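// Called once a connection completes authentication: attach (or reuse)
// a Session and populate its caps from the peer's AuthCapsInfo.
// Returns 1 when a caps string decodes and parses, -EACCES on a
// decode or parse failure, and 0 otherwise (e.g. allow_all).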
7228 int OSD::ms_handle_authentication(Connection *con)
7229 {
7230 int ret = 0;
7231 auto s = ceph::ref_cast<Session>(con->get_priv());
7232 if (!s) {
7233 s = ceph::make_ref<Session>(cct, con);
7234 con->set_priv(s);
7235 s->entity_name = con->get_peer_entity_name();
7236 dout(10) << __func__ << " new session " << s << " con " << s->con
7237 << " entity " << s->entity_name
7238 << " addr " << con->get_peer_addrs() << dendl;
7239 } else {
7240 dout(10) << __func__ << " existing session " << s << " con " << s->con
7241 << " entity " << s->entity_name
7242 << " addr " << con->get_peer_addrs() << dendl;
7243 }
7244
7245 AuthCapsInfo &caps_info = con->get_peer_caps_info();
7246 if (caps_info.allow_all) {
7247 s->caps.set_allow_all();
7248 } else if (caps_info.caps.length() > 0) {
7249 bufferlist::const_iterator p = caps_info.caps.cbegin();
7250 string str;
7251 try {
7252 decode(str, p);
7253 }
7254 catch (ceph::buffer::error& e) {
7255 dout(10) << __func__ << " session " << s << " " << s->entity_name
7256 << " failed to decode caps string" << dendl;
7257 ret = -EACCES;
7258 }
7259 if (!ret) {
7260 bool success = s->caps.parse(str);
7261 if (success) {
7262 dout(10) << __func__ << " session " << s
7263 << " " << s->entity_name
7264 << " has caps " << s->caps << " '" << str << "'" << dendl;
7265 ret = 1;
7266 } else {
7267 dout(10) << __func__ << " session " << s << " " << s->entity_name
7268 << " failed to parse caps '" << str << "'" << dendl;
7269 ret = -EACCES;
7270 }
7271 }
7272 }
7273 return ret;
7274 }
7275
7276 void OSD::do_waiters()
7277 {
7278 ceph_assert(ceph_mutex_is_locked(osd_lock));
7279
7280 dout(10) << "do_waiters -- start" << dendl;
7281 while (!finished.empty()) {
7282 OpRequestRef next = finished.front();
7283 finished.pop_front();
7284 dispatch_op(next);
7285 }
7286 dout(10) << "do_waiters -- finish" << dendl;
7287 }
7288
7289 void OSD::dispatch_op(OpRequestRef op)
7290 {
7291 switch (op->get_req()->get_type()) {
7292
7293 case MSG_OSD_PG_CREATE:
7294 handle_pg_create(op);
7295 break;
7296 }
7297 }
7298
7299 void OSD::_dispatch(Message *m)
7300 {
7301 ceph_assert(ceph_mutex_is_locked(osd_lock));
7302 dout(20) << "_dispatch " << m << " " << *m << dendl;
7303
7304 switch (m->get_type()) {
7305 // -- don't need OSDMap --
7306
7307 // map and replication
7308 case CEPH_MSG_OSD_MAP:
7309 handle_osd_map(static_cast<MOSDMap*>(m));
7310 break;
7311 case MSG_MON_GET_PURGED_SNAPS_REPLY:
7312 handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply*>(m));
7313 break;
7314
7315 // osd
7316 case MSG_OSD_SCRUB:
7317 handle_scrub(static_cast<MOSDScrub*>(m));
7318 break;
7319
7320 case MSG_COMMAND:
7321 handle_command(static_cast<MCommand*>(m));
7322 return;
7323
7324 // -- need OSDMap --
7325
7326 case MSG_OSD_PG_CREATE:
7327 {
7328 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7329 if (m->trace)
7330 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7331 // no map? starting up?
7332 if (!get_osdmap()) {
7333 dout(7) << "no OSDMap, not booted" << dendl;
7334 logger->inc(l_osd_waiting_for_map);
7335 waiting_for_osdmap.push_back(op);
7336 op->mark_delayed("no osdmap");
7337 break;
7338 }
7339
7340 // need OSDMap
7341 dispatch_op(op);
7342 }
7343 }
7344 }
7345
7346 // remove me post-nautilus
7347 void OSD::handle_scrub(MOSDScrub *m)
7348 {
7349 dout(10) << "handle_scrub " << *m << dendl;
7350 if (!require_mon_or_mgr_peer(m)) {
7351 m->put();
7352 return;
7353 }
7354 if (m->fsid != monc->get_fsid()) {
7355 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid()
7356 << dendl;
7357 m->put();
7358 return;
7359 }
7360
7361 vector<spg_t> spgs;
7362 _get_pgids(&spgs);
7363
7364 if (!m->scrub_pgs.empty()) {
7365 vector<spg_t> v;
7366 for (auto pgid : m->scrub_pgs) {
7367 spg_t pcand;
7368 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
7369 std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) {
7370 v.push_back(pcand);
7371 }
7372 }
7373 spgs.swap(v);
7374 }
7375
7376 for (auto pgid : spgs) {
7377 enqueue_peering_evt(
7378 pgid,
7379 PGPeeringEventRef(
7380 std::make_shared<PGPeeringEvent>(
7381 get_osdmap_epoch(),
7382 get_osdmap_epoch(),
7383 PeeringState::RequestScrub(m->deep, m->repair))));
7384 }
7385
7386 m->put();
7387 }
7388
7389 void OSD::handle_fast_scrub(MOSDScrub2 *m)
7390 {
7391 dout(10) << __func__ << " " << *m << dendl;
7392 if (!require_mon_or_mgr_peer(m)) {
7393 m->put();
7394 return;
7395 }
7396 if (m->fsid != monc->get_fsid()) {
7397 dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
7398 << dendl;
7399 m->put();
7400 return;
7401 }
7402 for (auto pgid : m->scrub_pgs) {
7403 enqueue_peering_evt(
7404 pgid,
7405 PGPeeringEventRef(
7406 std::make_shared<PGPeeringEvent>(
7407 m->epoch,
7408 m->epoch,
7409 PeeringState::RequestScrub(m->deep, m->repair))));
7410 }
7411 m->put();
7412 }
7413
7414 bool OSD::scrub_random_backoff()
7415 {
7416 bool coin_flip = (rand() / (double)RAND_MAX >=
7417 cct->_conf->osd_scrub_backoff_ratio);
7418 if (!coin_flip) {
7419 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7420 return true;
7421 }
7422 return false;
7423 }
7424
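// For a regular (not operator-requested) scrub, the next attempt is
// pushed out by a randomized interval, roughly:
//   sched_time = now + min_interval * (1 + r * randomize_ratio),  r in [0,1]
// and the deadline is now + max_interval (left unset when
// max_interval is 0).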
7425 OSDService::ScrubJob::ScrubJob(CephContext* cct,
7426 const spg_t& pg, const utime_t& timestamp,
7427 double pool_scrub_min_interval,
7428 double pool_scrub_max_interval, bool must)
7429 : cct(cct),
7430 pgid(pg),
7431 sched_time(timestamp),
7432 deadline(timestamp)
7433 {
7434 // if not explicitly requested, postpone the scrub with a random delay
7435 if (!must) {
7436 double scrub_min_interval = pool_scrub_min_interval > 0 ?
7437 pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
7438 double scrub_max_interval = pool_scrub_max_interval > 0 ?
7439 pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;
7440
7441 sched_time += scrub_min_interval;
7442 double r = rand() / (double)RAND_MAX;
7443 sched_time +=
7444 scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
7445 if (scrub_max_interval == 0) {
7446 deadline = utime_t();
7447 } else {
7448 deadline += scrub_max_interval;
7449 }
7450
7451 }
7452 }
7453
7454 bool OSDService::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7455 if (sched_time < rhs.sched_time)
7456 return true;
7457 if (sched_time > rhs.sched_time)
7458 return false;
7459 return pgid < rhs.pgid;
7460 }
7461
7462 void OSDService::dumps_scrub(ceph::Formatter *f)
7463 {
7464 ceph_assert(f != nullptr);
7465 std::lock_guard l(sched_scrub_lock);
7466
7467 f->open_array_section("scrubs");
7468 for (const auto &i: sched_scrub_pg) {
7469 f->open_object_section("scrub");
7470 f->dump_stream("pgid") << i.pgid;
7471 f->dump_stream("sched_time") << i.sched_time;
7472 f->dump_stream("deadline") << i.deadline;
7473 f->dump_bool("forced", i.sched_time == PgScrubber::scrub_must_stamp());
7474 f->close_section();
7475 }
7476 f->close_section();
7477 }
7478
7479 double OSD::scrub_sleep_time(bool must_scrub)
7480 {
7481 if (must_scrub) {
7482 return cct->_conf->osd_scrub_sleep;
7483 }
7484 utime_t now = ceph_clock_now();
7485 if (scrub_time_permit(now)) {
7486 return cct->_conf->osd_scrub_sleep;
7487 }
7488 double normal_sleep = cct->_conf->osd_scrub_sleep;
7489 double extended_sleep = cct->_conf->osd_scrub_extended_sleep;
7490 return std::max(extended_sleep, normal_sleep);
7491 }
7492
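// Check whether the configured scrub window permits scrubbing now.
// Both the week-day and hour ranges are half-open and may wrap
// around; e.g. begin_hour=23 with end_hour=6 allows scrubbing from
// 23:00 through 05:59.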
7493 bool OSD::scrub_time_permit(utime_t now)
7494 {
7495 struct tm bdt;
7496 time_t tt = now.sec();
7497 localtime_r(&tt, &bdt);
7498
7499 bool day_permit = false;
7500 if (cct->_conf->osd_scrub_begin_week_day < cct->_conf->osd_scrub_end_week_day) {
7501 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day && bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7502 day_permit = true;
7503 }
7504 } else {
7505 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day || bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7506 day_permit = true;
7507 }
7508 }
7509
7510 if (!day_permit) {
7511 dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day
7512 << " - " << cct->_conf->osd_scrub_end_week_day
7513 << " now " << bdt.tm_wday << " = no" << dendl;
7514 return false;
7515 }
7516
7517 bool time_permit = false;
7518 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
7519 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7520 time_permit = true;
7521 }
7522 } else {
7523 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7524 time_permit = true;
7525 }
7526 }
7527 if (time_permit) {
7528 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7529 << " - " << cct->_conf->osd_scrub_end_hour
7530 << " now " << bdt.tm_hour << " = yes" << dendl;
7531 } else {
7532 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7533 << " - " << cct->_conf->osd_scrub_end_hour
7534 << " now " << bdt.tm_hour << " = no" << dendl;
7535 }
7536 return time_permit;
7537 }
7538
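// Permit scrubbing based on system load: either the 1-minute loadavg
// per CPU is below osd_scrub_load_threshold, or the current load is
// below both the daily average and the 15-minute average (i.e. load
// is relatively low and trending down).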
7539 bool OSD::scrub_load_below_threshold()
7540 {
7541 double loadavgs[3];
7542 if (getloadavg(loadavgs, 3) != 3) {
7543 dout(10) << __func__ << " couldn't read loadavgs" << dendl;
7544 return false;
7545 }
7546
7547 // allow scrub if below configured threshold
7548 long cpus = sysconf(_SC_NPROCESSORS_ONLN);
7549 double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
7550 if (loadavg_per_cpu < cct->_conf->osd_scrub_load_threshold) {
7551 dout(20) << __func__ << " loadavg per cpu " << loadavg_per_cpu
7552 << " < max " << cct->_conf->osd_scrub_load_threshold
7553 << " = yes" << dendl;
7554 return true;
7555 }
7556
7557 // allow scrub if below daily avg and currently decreasing
7558 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
7559 dout(20) << __func__ << " loadavg " << loadavgs[0]
7560 << " < daily_loadavg " << daily_loadavg
7561 << " and < 15m avg " << loadavgs[2]
7562 << " = yes" << dendl;
7563 return true;
7564 }
7565
7566 dout(20) << __func__ << " loadavg " << loadavgs[0]
7567 << " >= max " << cct->_conf->osd_scrub_load_threshold
7568 << " and ( >= daily_loadavg " << daily_loadavg
7569 << " or >= 15m avg " << loadavgs[2]
7570 << ") = no" << dendl;
7571 return false;
7572 }
7573
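// Walk the scrub queue in sched_time order and try to start one
// scrub. We bail out early if we cannot take another scrub slot, stop
// at the first job scheduled in the future, and skip jobs blocked by
// the time window or load unless their deadline has already passed.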
7574 void OSD::sched_scrub()
7575 {
7576 dout(20) << __func__ << " sched_scrub starts" << dendl;
7577
7578 // if not permitted, fail fast
7579 if (!service.can_inc_scrubs()) {
7580 dout(20) << __func__ << ": OSD cannot inc scrubs" << dendl;
7581 return;
7582 }
7583 bool allow_requested_repair_only = false;
7584 if (service.is_recovery_active() && !cct->_conf->osd_scrub_during_recovery) {
7585 if (!cct->_conf->osd_repair_during_recovery) {
7586 dout(15) << __func__ << ": not scheduling scrubs due to active recovery" << dendl;
7587 return;
7588 }
7589 dout(10) << __func__
7590 << " will only schedule explicitly requested repair due to active recovery"
7591 << dendl;
7592 allow_requested_repair_only = true;
7593 }
7594
7595 utime_t now = ceph_clock_now();
7596 bool time_permit = scrub_time_permit(now);
7597 bool load_is_low = scrub_load_below_threshold();
7598 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
7599
7600 OSDService::ScrubJob scrub_job;
7601 if (service.first_scrub_stamp(&scrub_job)) {
7602 do {
7603 dout(30) << "sched_scrub examine " << scrub_job.pgid << " at " << scrub_job.sched_time << dendl;
7604
7605 if (scrub_job.sched_time > now) {
7606 // save ourselves some effort
7607 dout(20) << "sched_scrub " << scrub_job.pgid << " scheduled at " << scrub_job.sched_time
7608 << " > " << now << dendl;
7609 break;
7610 }
7611
7612 if ((scrub_job.deadline.is_zero() || scrub_job.deadline >= now) && !(time_permit && load_is_low)) {
7613 dout(15) << __func__ << " not scheduling scrub for " << scrub_job.pgid << " due to "
7614 << (!time_permit ? "time not permitted" : "high load") << dendl;
7615 continue;
7616 }
7617
7618 PGRef pg = _lookup_lock_pg(scrub_job.pgid);
7619 if (!pg) {
7620 dout(20) << __func__ << " pg " << scrub_job.pgid << " not found" << dendl;
7621 continue;
7622 }
7623
7624 // This has already started, so go on to the next scrub job
7625 if (pg->is_scrub_active()) {
7626 pg->unlock();
7627 dout(20) << __func__ << ": already in progress pgid " << scrub_job.pgid << dendl;
7628 continue;
7629 }
7630 // Skip other kinds of scrubbing if only explicitly requested repairing is allowed
7631 if (allow_requested_repair_only && !pg->m_planned_scrub.must_repair) {
7632 pg->unlock();
7633 dout(10) << __func__ << " skip " << scrub_job.pgid
7634 << " because repairing is not explicitly requested on it"
7635 << dendl;
7636 continue;
7637 }
7638
7639 // If it is reserving, let it resolve before going to the next scrub job
7640 if (pg->m_scrubber->is_reserving()) {
7641 pg->unlock();
7642 dout(10) << __func__ << ": reserve in progress pgid " << scrub_job.pgid << dendl;
7643 break;
7644 }
7645 dout(15) << "sched_scrub scrubbing " << scrub_job.pgid << " at " << scrub_job.sched_time
7646 << (pg->get_must_scrub() ? ", explicitly requested" :
7647 (load_is_low ? ", load_is_low" : " deadline < now"))
7648 << dendl;
7649 if (pg->sched_scrub()) {
7650 pg->unlock();
7651 dout(10) << __func__ << " scheduled a scrub!" << " (~" << scrub_job.pgid << "~)" << dendl;
7652 break;
7653 }
7654 pg->unlock();
7655 } while (service.next_scrub_stamp(scrub_job, &scrub_job));
7656 }
7657 dout(20) << "sched_scrub done" << dendl;
7658 }
7659
7660 void OSD::resched_all_scrubs()
7661 {
7662 dout(10) << __func__ << ": start" << dendl;
7663 const vector<spg_t> pgs = [this] {
7664 vector<spg_t> pgs;
7665 OSDService::ScrubJob job;
7666 if (service.first_scrub_stamp(&job)) {
7667 do {
7668 pgs.push_back(job.pgid);
7669 } while (service.next_scrub_stamp(job, &job));
7670 }
7671 return pgs;
7672 }();
7673 for (auto& pgid : pgs) {
7674 dout(20) << __func__ << ": examine " << pgid << dendl;
7675 PGRef pg = _lookup_lock_pg(pgid);
7676 if (!pg)
7677 continue;
7678 if (!pg->m_planned_scrub.must_scrub && !pg->m_planned_scrub.need_auto) {
7679 dout(15) << __func__ << ": reschedule " << pgid << dendl;
7680 pg->on_info_history_change();
7681 }
7682 pg->unlock();
7683 }
7684 dout(10) << __func__ << ": done" << dendl;
7685 }
7686
7687 MPGStats* OSD::collect_pg_stats()
7688 {
7689 // This implementation unconditionally sends every is_primary PG's
7690 // stats every time we're called. This has equivalent cost to the
7691 // previous implementation's worst case where all PGs are busy and
7692 // their stats are always enqueued for sending.
7693 std::shared_lock l{map_lock};
7694
7695 osd_stat_t cur_stat = service.get_osd_stat();
7696 cur_stat.os_perf_stat = store->get_cur_stats();
7697
7698 auto m = new MPGStats(monc->get_fsid(), get_osdmap_epoch());
7699 m->osd_stat = cur_stat;
7700
7701 std::lock_guard lec{min_last_epoch_clean_lock};
7702 min_last_epoch_clean = get_osdmap_epoch();
7703 min_last_epoch_clean_pgs.clear();
7704
7705 std::set<int64_t> pool_set;
7706 vector<PGRef> pgs;
7707 _get_pgs(&pgs);
7708 for (auto& pg : pgs) {
7709 auto pool = pg->pg_id.pgid.pool();
7710 pool_set.emplace((int64_t)pool);
7711 if (!pg->is_primary()) {
7712 continue;
7713 }
7714 pg->get_pg_stats([&](const pg_stat_t& s, epoch_t lec) {
7715 m->pg_stat[pg->pg_id.pgid] = s;
7716 min_last_epoch_clean = std::min(min_last_epoch_clean, lec);
7717 min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
7718 });
7719 }
7720 store_statfs_t st;
7721 bool per_pool_stats = false;
7722 bool per_pool_omap_stats = false;
7723 for (auto p : pool_set) {
7724 int r = store->pool_statfs(p, &st, &per_pool_omap_stats);
7725 if (r == -ENOTSUP) {
7726 break;
7727 } else {
7728 assert(r >= 0);
7729 m->pool_stat[p] = st;
7730 per_pool_stats = true;
7731 }
7732 }
7733
7734 // indicate whether we are reporting per-pool stats
7735 m->osd_stat.num_osds = 1;
7736 m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;
7737 m->osd_stat.num_per_pool_omap_osds = per_pool_omap_stats ? 1 : 0;
7738
7739 return m;
7740 }
7741
7742 vector<DaemonHealthMetric> OSD::get_health_metrics()
7743 {
7744 vector<DaemonHealthMetric> metrics;
7745 {
7746 utime_t oldest_secs;
7747 const utime_t now = ceph_clock_now();
7748 auto too_old = now;
7749 too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
7750 int slow = 0;
7751 TrackedOpRef oldest_op;
7752 auto count_slow_ops = [&](TrackedOp& op) {
7753 if (op.get_initiated() < too_old) {
7754 stringstream ss;
7755 ss << "slow request " << op.get_desc()
7756 << " initiated "
7757 << op.get_initiated()
7758 << " currently "
7759 << op.state_string();
7760 lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
7761 clog->warn() << ss.str();
7762 slow++;
7763 if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
7764 oldest_op = &op;
7765 }
7766 return true;
7767 } else {
7768 return false;
7769 }
7770 };
7771 if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
7772 if (slow) {
7773 derr << __func__ << " reporting " << slow << " slow ops, oldest is "
7774 << oldest_op->get_desc() << dendl;
7775 }
7776 metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
7777 } else {
7778 // no news is not good news.
7779 metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
7780 }
7781 }
7782 {
7783 std::lock_guard l(pending_creates_lock);
7784 auto n_primaries = pending_creates_from_mon;
7785 for (const auto& create : pending_creates_from_osd) {
7786 if (create.second) {
7787 n_primaries++;
7788 }
7789 }
7790 metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
7791 }
7792 return metrics;
7793 }
7794
7795 // =====================================================
7796 // MAP
7797
7798 void OSD::wait_for_new_map(OpRequestRef op)
7799 {
7800 // ask?
7801 if (waiting_for_osdmap.empty()) {
7802 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7803 }
7804
7805 logger->inc(l_osd_waiting_for_map);
7806 waiting_for_osdmap.push_back(op);
7807 op->mark_delayed("wait for new map");
7808 }
7809
7810
7811 /** update_map
7812 * assimilate new OSDMap(s). scan pgs, etc.
7813 */
7814
7815 void OSD::note_down_osd(int peer)
7816 {
7817 ceph_assert(ceph_mutex_is_locked(osd_lock));
7818 cluster_messenger->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer));
7819
7820 std::lock_guard l{heartbeat_lock};
7821 failure_queue.erase(peer);
7822 failure_pending.erase(peer);
7823 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7824 if (p != heartbeat_peers.end()) {
7825 p->second.clear_mark_down();
7826 heartbeat_peers.erase(p);
7827 }
7828 }
7829
7830 void OSD::note_up_osd(int peer)
7831 {
7832 heartbeat_set_peers_need_update();
7833 }
7834
7835 struct C_OnMapCommit : public Context {
7836 OSD *osd;
7837 epoch_t first, last;
7838 MOSDMap *msg;
7839 C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
7840 : osd(o), first(f), last(l), msg(m) {}
7841 void finish(int r) override {
7842 osd->_committed_osd_maps(first, last, msg);
7843 msg->put();
7844 }
7845 };
7846
7847 void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7848 {
7849 std::lock_guard l(osdmap_subscribe_lock);
7850 if (latest_subscribed_epoch >= epoch && !force_request)
7851 return;
7852
7853 latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);
7854
7855 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7856 force_request) {
7857 monc->renew_subs();
7858 }
7859 }
7860
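// Trim stored osdmaps older than both 'oldest' and the lowest epoch
// still pinned in the map cache, batching the removals into
// transactions of roughly osd_target_transaction_size operations.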
7861 void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
7862 {
7863 epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
7864 if (min <= superblock.oldest_map)
7865 return;
7866
7867 int num = 0;
7868 ObjectStore::Transaction t;
7869 for (epoch_t e = superblock.oldest_map; e < min; ++e) {
7870 dout(20) << " removing old osdmap epoch " << e << dendl;
7871 t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
7872 t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
7873 superblock.oldest_map = e + 1;
7874 num++;
7875 if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
7876 service.publish_superblock(superblock);
7877 write_superblock(t);
7878 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7879 ceph_assert(tr == 0);
7880 num = 0;
7881 if (!skip_maps) {
7882 // skip_maps leaves us with a range of old maps if we fail to remove all
7883 // of them before moving superblock.oldest_map forward to the first map
7884 // in the incoming MOSDMap msg. so we should continue removing them in
7885 // this case, even though that could mean a huge series of delete
7886 // transactions all at once.
7887 break;
7888 }
7889 }
7890 }
7891 if (num > 0) {
7892 service.publish_superblock(superblock);
7893 write_superblock(t);
7894 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7895 ceph_assert(tr == 0);
7896 }
7897 // we should not remove the cached maps
7898 ceph_assert(min <= service.map_cache.cached_key_lower_bound());
7899 }
7900
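// Ingest a batch of maps from an MOSDMap message: throttle if PGs are
// too far behind, validate the sender, persist each new full or
// incremental map (verifying the full-map CRC), record pg_num and
// pool-deletion history, then commit and finish asynchronously in
// _committed_osd_maps() via C_OnMapCommit.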
7901 void OSD::handle_osd_map(MOSDMap *m)
7902 {
7903 // wait for pgs to catch up
7904 {
7905 // we extend the map cache pins to accommodate pgs slow to consume maps
7906 // for some period, until we hit the max_lag_factor bound, at which point
7907 // we block here to stop ingesting more maps than they are able to keep
7908 // up with.
7909 epoch_t max_lag = cct->_conf->osd_map_cache_size *
7910 m_osd_pg_epoch_max_lag_factor;
7911 ceph_assert(max_lag > 0);
7912 epoch_t osd_min = 0;
7913 for (auto shard : shards) {
7914 epoch_t min = shard->get_min_pg_epoch();
7915 if (osd_min == 0 || min < osd_min) {
7916 osd_min = min;
7917 }
7918 }
7919 epoch_t osdmap_epoch = get_osdmap_epoch();
7920 if (osd_min > 0 &&
7921 osdmap_epoch > max_lag &&
7922 osdmap_epoch - max_lag > osd_min) {
7923 epoch_t need = osdmap_epoch - max_lag;
7924 dout(10) << __func__ << " waiting for pgs to catch up (need " << need
7925 << " max_lag " << max_lag << ")" << dendl;
7926 for (auto shard : shards) {
7927 epoch_t min = shard->get_min_pg_epoch();
7928 if (need > min) {
7929 dout(10) << __func__ << " waiting for pgs to consume " << need
7930 << " (shard " << shard->shard_id << " min " << min
7931 << ", map cache is " << cct->_conf->osd_map_cache_size
7932 << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
7933 << ")" << dendl;
7934 unlock_guard unlock{osd_lock};
7935 shard->wait_min_pg_epoch(need);
7936 }
7937 }
7938 }
7939 }
7940
7941 ceph_assert(ceph_mutex_is_locked(osd_lock));
7942 map<epoch_t,OSDMapRef> added_maps;
7943 map<epoch_t,bufferlist> added_maps_bl;
7944 if (m->fsid != monc->get_fsid()) {
7945 dout(0) << "handle_osd_map fsid " << m->fsid << " != "
7946 << monc->get_fsid() << dendl;
7947 m->put();
7948 return;
7949 }
7950 if (is_initializing()) {
7951 dout(0) << "ignoring osdmap until we have initialized" << dendl;
7952 m->put();
7953 return;
7954 }
7955
7956 auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
7957 if (session && !(session->entity_name.is_mon() ||
7958 session->entity_name.is_osd())) {
7959 //not enough perms!
7960 dout(10) << "got osd map from Session " << session
7961 << " which we can't take maps from (not a mon or osd)" << dendl;
7962 m->put();
7963 return;
7964 }
7965
7966 // share with the objecter
7967 if (!is_preboot())
7968 service.objecter->handle_osd_map(m);
7969
7970 epoch_t first = m->get_first();
7971 epoch_t last = m->get_last();
7972 dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
7973 << superblock.newest_map
7974 << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
7975 << dendl;
7976
7977 logger->inc(l_osd_map);
7978 logger->inc(l_osd_mape, last - first + 1);
7979 if (first <= superblock.newest_map)
7980 logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
7981 if (service.max_oldest_map < m->oldest_map) {
7982 service.max_oldest_map = m->oldest_map;
7983 ceph_assert(service.max_oldest_map >= superblock.oldest_map);
7984 }
7985
7986 // make sure there is something new, here, before we bother flushing
7987 // the queues and such
7988 if (last <= superblock.newest_map) {
7989 dout(10) << " no new maps here, dropping" << dendl;
7990 m->put();
7991 return;
7992 }
7993
7994 // missing some?
7995 bool skip_maps = false;
7996 if (first > superblock.newest_map + 1) {
7997 dout(10) << "handle_osd_map message skips epochs "
7998 << superblock.newest_map + 1 << ".." << (first-1) << dendl;
7999 if (m->oldest_map <= superblock.newest_map + 1) {
8000 osdmap_subscribe(superblock.newest_map + 1, false);
8001 m->put();
8002 return;
8003 }
8004 // always try to get the full range of maps--as many as we can. this
8005 // 1- is good to have
8006 // 2- is at present the only way to ensure that we get a *full* map as
8007 // the first map!
8008 if (m->oldest_map < first) {
8009 osdmap_subscribe(m->oldest_map - 1, true);
8010 m->put();
8011 return;
8012 }
8013 skip_maps = true;
8014 }
8015
8016 ObjectStore::Transaction t;
8017 uint64_t txn_size = 0;
8018
8019 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> purged_snaps;
8020
8021 // store new maps: queue for disk and put in the osdmap cache
8022 epoch_t start = std::max(superblock.newest_map + 1, first);
8023 for (epoch_t e = start; e <= last; e++) {
8024 if (txn_size >= t.get_num_bytes()) {
8025 derr << __func__ << " transaction size overflowed" << dendl;
8026 ceph_assert(txn_size < t.get_num_bytes());
8027 }
8028 txn_size = t.get_num_bytes();
8029 map<epoch_t,bufferlist>::iterator p;
8030 p = m->maps.find(e);
8031 if (p != m->maps.end()) {
8032 dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
8033 OSDMap *o = new OSDMap;
8034 bufferlist& bl = p->second;
8035
8036 o->decode(bl);
8037
8038 purged_snaps[e] = o->get_new_purged_snaps();
8039
8040 ghobject_t fulloid = get_osdmap_pobject_name(e);
8041 t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
8042 added_maps[e] = add_map(o);
8043 added_maps_bl[e] = bl;
8044 got_full_map(e);
8045 continue;
8046 }
8047
8048 p = m->incremental_maps.find(e);
8049 if (p != m->incremental_maps.end()) {
8050 dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
8051 bufferlist& bl = p->second;
8052 ghobject_t oid = get_inc_osdmap_pobject_name(e);
8053 t.write(coll_t::meta(), oid, 0, bl.length(), bl);
8054
8055 OSDMap *o = new OSDMap;
8056 if (e > 1) {
8057 bufferlist obl;
8058 bool got = get_map_bl(e - 1, obl);
8059 if (!got) {
8060 auto p = added_maps_bl.find(e - 1);
8061 ceph_assert(p != added_maps_bl.end());
8062 obl = p->second;
8063 }
8064 o->decode(obl);
8065 }
8066
8067 OSDMap::Incremental inc;
8068 auto p = bl.cbegin();
8069 inc.decode(p);
8070
8071 if (o->apply_incremental(inc) < 0) {
8072 derr << "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc.fsid << dendl;
8073 ceph_abort_msg("bad fsid");
8074 }
8075
8076 bufferlist fbl;
8077 o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
8078
8079 bool injected_failure = false;
8080 if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
8081 (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
8082 derr << __func__ << " injecting map crc failure" << dendl;
8083 injected_failure = true;
8084 }
8085
8086 if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
8087 dout(2) << "got incremental " << e
8088 << " but failed to encode full with correct crc; requesting"
8089 << dendl;
8090 clog->warn() << "failed to encode map e" << e << " with expected crc";
8091 dout(20) << "my encoded map was:\n";
8092 fbl.hexdump(*_dout);
8093 *_dout << dendl;
8094 delete o;
8095 request_full_map(e, last);
8096 last = e - 1;
8097
8098 // don't continue committing if we failed to encode the first inc map
8099 if (last < start) {
8100 dout(10) << __func__ << " bailing because last < start (" << last << "<" << start << ")" << dendl;
8101 m->put();
8102 return;
8103 }
8104 break;
8105 }
8106 got_full_map(e);
8107 purged_snaps[e] = o->get_new_purged_snaps();
8108
8109 ghobject_t fulloid = get_osdmap_pobject_name(e);
8110 t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
8111 added_maps[e] = add_map(o);
8112 added_maps_bl[e] = fbl;
8113 continue;
8114 }
8115
8116 ceph_abort_msg("MOSDMap lied about what maps it had?");
8117 }
8118
8119 // even if this map isn't from a mon, we may have satisfied our subscription
8120 monc->sub_got("osdmap", last);
8121
8122 if (!m->maps.empty() && requested_full_first) {
8123 dout(10) << __func__ << " still missing full maps " << requested_full_first
8124 << ".." << requested_full_last << dendl;
8125 rerequest_full_maps();
8126 }
8127
8128 if (superblock.oldest_map) {
8129 // make sure we at least keep pace with incoming maps
8130 trim_maps(m->oldest_map, last - first + 1, skip_maps);
8131 pg_num_history.prune(superblock.oldest_map);
8132 }
8133
8134 if (!superblock.oldest_map || skip_maps)
8135 superblock.oldest_map = first;
8136 superblock.newest_map = last;
8137 superblock.current_epoch = last;
8138
8139 // note in the superblock that we were clean thru the prior epoch
8140 epoch_t boot_epoch = service.get_boot_epoch();
8141 if (boot_epoch && boot_epoch >= superblock.mounted) {
8142 superblock.mounted = boot_epoch;
8143 superblock.clean_thru = last;
8144 }
8145
8146 // check for pg_num changes and deleted pools
8147 OSDMapRef lastmap;
8148 for (auto& i : added_maps) {
8149 if (!lastmap) {
8150 if (!(lastmap = service.try_get_map(i.first - 1))) {
8151 dout(10) << __func__ << " can't get previous map " << i.first - 1
8152 << " probably first start of this osd" << dendl;
8153 continue;
8154 }
8155 }
8156 ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
8157 for (auto& j : lastmap->get_pools()) {
8158 if (!i.second->have_pg_pool(j.first)) {
8159 pg_num_history.log_pool_delete(i.first, j.first);
8160 dout(10) << __func__ << " recording final pg_pool_t for pool "
8161 << j.first << dendl;
8162 // this information is needed by _make_pg() if we have to restart before
8163 // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
8164 ghobject_t obj = make_final_pool_info_oid(j.first);
8165 bufferlist bl;
8166 encode(j.second, bl, CEPH_FEATURES_ALL);
8167 string name = lastmap->get_pool_name(j.first);
8168 encode(name, bl);
8169 map<string,string> profile;
8170 if (lastmap->get_pg_pool(j.first)->is_erasure()) {
8171 profile = lastmap->get_erasure_code_profile(
8172 lastmap->get_pg_pool(j.first)->erasure_code_profile);
8173 }
8174 encode(profile, bl);
8175 t.write(coll_t::meta(), obj, 0, bl.length(), bl);
8176 } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
8177 new_pg_num != j.second.get_pg_num()) {
8178 dout(10) << __func__ << " recording pool " << j.first << " pg_num "
8179 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
8180 pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
8181 }
8182 }
8183 for (auto& j : i.second->get_pools()) {
8184 if (!lastmap->have_pg_pool(j.first)) {
8185 dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
8186 << j.second.get_pg_num() << dendl;
8187 pg_num_history.log_pg_num_change(i.first, j.first,
8188 j.second.get_pg_num());
8189 }
8190 }
8191 lastmap = i.second;
8192 }
8193 pg_num_history.epoch = last;
8194 {
8195 bufferlist bl;
8196 ::encode(pg_num_history, bl);
8197 t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
8198 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
8199 }
8200
8201 // record new purged_snaps
8202 if (superblock.purged_snaps_last == start - 1) {
8203 SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
8204 make_purged_snaps_oid(), &t,
8205 purged_snaps);
8206 superblock.purged_snaps_last = last;
8207 } else {
8208 dout(10) << __func__ << " superblock purged_snaps_last is "
8209 << superblock.purged_snaps_last
8210 << ", not recording new purged_snaps" << dendl;
8211 }
8212
8213 // superblock and commit
8214 write_superblock(t);
8215 t.register_on_commit(new C_OnMapCommit(this, start, last, m));
8216 store->queue_transaction(
8217 service.meta_ch,
8218 std::move(t));
8219 service.publish_superblock(superblock);
8220 }
8221
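// Called once the new maps are durable. Advance the published osdmap
// epoch by epoch, noting peers that went up or down, then react to
// our own state: transition booting -> active, restart (rebind) if we
// were wrongly marked down, or shut down if we no longer exist in the
// map.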
8222 void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
8223 {
8224 dout(10) << __func__ << " " << first << ".." << last << dendl;
8225 if (is_stopping()) {
8226 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8227 return;
8228 }
8229 std::lock_guard l(osd_lock);
8230 if (is_stopping()) {
8231 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8232 return;
8233 }
8234 map_lock.lock();
8235
8236 ceph_assert(first <= last);
8237
8238 bool do_shutdown = false;
8239 bool do_restart = false;
8240 bool network_error = false;
8241 OSDMapRef osdmap = get_osdmap();
8242
8243 // advance through the new maps
8244 for (epoch_t cur = first; cur <= last; cur++) {
8245 dout(10) << " advance to epoch " << cur
8246 << " (<= last " << last
8247 << " <= newest_map " << superblock.newest_map
8248 << ")" << dendl;
8249
8250 OSDMapRef newmap = get_map(cur);
8251 ceph_assert(newmap); // we just cached it above!
8252
8253 // start blocklisting messages sent to peers that go down.
8254 service.pre_publish_map(newmap);
8255
8256 // kill connections to newly down osds
8257 bool waited_for_reservations = false;
8258 set<int> old;
8259 osdmap = get_osdmap();
8260 osdmap->get_all_osds(old);
8261 for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
8262 if (*p != whoami &&
8263 osdmap->is_up(*p) && // in old map
8264 newmap->is_down(*p)) { // but not the new one
8265 if (!waited_for_reservations) {
8266 service.await_reserved_maps();
8267 waited_for_reservations = true;
8268 }
8269 note_down_osd(*p);
8270 } else if (*p != whoami &&
8271 osdmap->is_down(*p) &&
8272 newmap->is_up(*p)) {
8273 note_up_osd(*p);
8274 }
8275 }
8276
8277 if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
8278 dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
8279 << dendl;
8280 if (is_booting()) {
8281 // this captures the case where we sent the boot message while
8282 // NOUP was being set on the mon and our boot request was
8283 // dropped, and then later it is cleared. it imperfectly
8284 // handles the case where our original boot message was not
8285 // dropped and we restart even though we might have booted, but
8286 // that is harmless (boot will just take slightly longer).
8287 do_restart = true;
8288 }
8289 }
8290
8291 osdmap = std::move(newmap);
8292 set_osdmap(osdmap);
8293 epoch_t up_epoch;
8294 epoch_t boot_epoch;
8295 service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
8296 if (!up_epoch &&
8297 osdmap->is_up(whoami) &&
8298 osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
8299 up_epoch = osdmap->get_epoch();
8300 dout(10) << "up_epoch is " << up_epoch << dendl;
8301 if (!boot_epoch) {
8302 boot_epoch = osdmap->get_epoch();
8303 dout(10) << "boot_epoch is " << boot_epoch << dendl;
8304 }
8305 service.set_epochs(&boot_epoch, &up_epoch, NULL);
8306 }
8307 }
8308
8309 epoch_t _bind_epoch = service.get_bind_epoch();
8310 if (osdmap->is_up(whoami) &&
8311 osdmap->get_addrs(whoami).legacy_equals(
8312 client_messenger->get_myaddrs()) &&
8313 _bind_epoch < osdmap->get_up_from(whoami)) {
8314
8315 if (is_booting()) {
8316 dout(1) << "state: booting -> active" << dendl;
8317 set_state(STATE_ACTIVE);
8318 do_restart = false;
8319
8320 // set incarnation so that osd_reqid_t's we generate for our
8321 // objecter requests are unique across restarts.
8322 service.objecter->set_client_incarnation(osdmap->get_epoch());
8323 cancel_pending_failures();
8324 }
8325 }
8326
8327 if (osdmap->get_epoch() > 0 &&
8328 is_active()) {
8329 if (!osdmap->exists(whoami)) {
8330 derr << "map says i do not exist. shutting down." << dendl;
8331 do_shutdown = true; // don't call shutdown() while we have
8332 // everything paused
8333 } else if (osdmap->is_stop(whoami)) {
8334 derr << "map says i am stopped by admin. shutting down." << dendl;
8335 do_shutdown = true;
8336 } else if (!osdmap->is_up(whoami) ||
8337 !osdmap->get_addrs(whoami).legacy_equals(
8338 client_messenger->get_myaddrs()) ||
8339 !osdmap->get_cluster_addrs(whoami).legacy_equals(
8340 cluster_messenger->get_myaddrs()) ||
8341 !osdmap->get_hb_back_addrs(whoami).legacy_equals(
8342 hb_back_server_messenger->get_myaddrs()) ||
8343 !osdmap->get_hb_front_addrs(whoami).legacy_equals(
8344 hb_front_server_messenger->get_myaddrs())) {
8345 if (!osdmap->is_up(whoami)) {
8346 if (service.is_preparing_to_stop() || service.is_stopping()) {
8347 service.got_stop_ack();
8348 } else {
8349 clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
8350 "but it is still running";
8351 clog->debug() << "map e" << osdmap->get_epoch()
8352 << " wrongly marked me down at e"
8353 << osdmap->get_down_at(whoami);
8354 }
8355 if (monc->monmap.min_mon_release >= ceph_release_t::octopus) {
8356 // note that this is best-effort...
8357 monc->send_mon_message(
8358 new MOSDMarkMeDead(
8359 monc->get_fsid(),
8360 whoami,
8361 osdmap->get_epoch()));
8362 }
8363 } else if (!osdmap->get_addrs(whoami).legacy_equals(
8364 client_messenger->get_myaddrs())) {
8365 clog->error() << "map e" << osdmap->get_epoch()
8366 << " had wrong client addr (" << osdmap->get_addrs(whoami)
8367 << " != my " << client_messenger->get_myaddrs() << ")";
8368 } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
8369 cluster_messenger->get_myaddrs())) {
8370 clog->error() << "map e" << osdmap->get_epoch()
8371 << " had wrong cluster addr ("
8372 << osdmap->get_cluster_addrs(whoami)
8373 << " != my " << cluster_messenger->get_myaddrs() << ")";
8374 } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
8375 hb_back_server_messenger->get_myaddrs())) {
8376 clog->error() << "map e" << osdmap->get_epoch()
8377 << " had wrong heartbeat back addr ("
8378 << osdmap->get_hb_back_addrs(whoami)
8379 << " != my " << hb_back_server_messenger->get_myaddrs()
8380 << ")";
8381 } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
8382 hb_front_server_messenger->get_myaddrs())) {
8383 clog->error() << "map e" << osdmap->get_epoch()
8384 << " had wrong heartbeat front addr ("
8385 << osdmap->get_hb_front_addrs(whoami)
8386 << " != my " << hb_front_server_messenger->get_myaddrs()
8387 << ")";
8388 }
8389
8390 if (!service.is_stopping()) {
8391 epoch_t up_epoch = 0;
8392 epoch_t bind_epoch = osdmap->get_epoch();
8393 service.set_epochs(NULL,&up_epoch, &bind_epoch);
8394 do_restart = true;
8395
8396 //add markdown log
8397 utime_t now = ceph_clock_now();
8398 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
8399 osd_markdown_log.push_back(now);
8400 if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
8401 derr << __func__ << " marked down "
8402 << osd_markdown_log.size()
8403 << " > osd_max_markdown_count "
8404 << cct->_conf->osd_max_markdown_count
8405 << " in last " << grace << " seconds, shutting down"
8406 << dendl;
8407 do_restart = false;
8408 do_shutdown = true;
8409 }
8410
8411 start_waiting_for_healthy();
8412
8413 set<int> avoid_ports;
8414 #if defined(__FreeBSD__)
8415 // prevent FreeBSD from grabbing the client_messenger port during
8416 // rebinding; otherwise the cluster_messenger may also connect
8417 // to the same port
8418 client_messenger->get_myaddrs().get_ports(&avoid_ports);
8419 #endif
8420 cluster_messenger->get_myaddrs().get_ports(&avoid_ports);
8421
8422 int r = cluster_messenger->rebind(avoid_ports);
8423 if (r != 0) {
8424 do_shutdown = true; // FIXME: do_restart?
8425 network_error = true;
8426 derr << __func__ << " marked down:"
8427 << " rebind cluster_messenger failed" << dendl;
8428 }
8429
8430 hb_back_server_messenger->mark_down_all();
8431 hb_front_server_messenger->mark_down_all();
8432 hb_front_client_messenger->mark_down_all();
8433 hb_back_client_messenger->mark_down_all();
8434
8435 reset_heartbeat_peers(true);
8436 }
8437 }
8438 }
8439
8440 map_lock.unlock();
8441
8442 check_osdmap_features();
8443
8444 // yay!
8445 consume_map();
8446
8447 if (is_active() || is_waiting_for_healthy())
8448 maybe_update_heartbeat_peers();
8449
8450 if (is_active()) {
8451 activate_map();
8452 }
8453
8454 if (do_shutdown) {
8455 if (network_error) {
8456 cancel_pending_failures();
8457 }
8458 // trigger shutdown in a different thread
8459 dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
8460 queue_async_signal(SIGINT);
8461 }
8462 else if (m->newest_map && m->newest_map > last) {
8463 dout(10) << " msg say newest map is " << m->newest_map
8464 << ", requesting more" << dendl;
8465 osdmap_subscribe(osdmap->get_epoch()+1, false);
8466 }
8467 else if (is_preboot()) {
8468 if (m->get_source().is_mon())
8469 _preboot(m->oldest_map, m->newest_map);
8470 else
8471 start_boot();
8472 }
8473 else if (do_restart)
8474 start_boot();
8475
8476 }
8477
8478 void OSD::check_osdmap_features()
8479 {
8480 // adjust required feature bits?
8481
8482 // we have to be a bit careful here, because we are accessing the
8483 // Policy structures without taking any lock. in particular, only
8484 // modify integer values that can safely be read by a racing CPU.
8485 // since we are only accessing existing Policy structures at their
8486 // current memory location, and setting or clearing bits in integer
8487 // fields, and we are the only writer, this is not a problem.
8488
8489 const auto osdmap = get_osdmap();
8490 {
8491 Messenger::Policy p = client_messenger->get_default_policy();
8492 uint64_t mask;
8493 uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
8494 if ((p.features_required & mask) != features) {
8495 dout(0) << "crush map has features " << features
8496 << ", adjusting msgr requires for clients" << dendl;
8497 p.features_required = (p.features_required & ~mask) | features;
8498 client_messenger->set_default_policy(p);
8499 }
8500 }
8501 {
8502 Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
8503 uint64_t mask;
8504 uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
8505 if ((p.features_required & mask) != features) {
8506 dout(0) << "crush map has features " << features
8507 << " was " << p.features_required
8508 << ", adjusting msgr requires for mons" << dendl;
8509 p.features_required = (p.features_required & ~mask) | features;
8510 client_messenger->set_policy(entity_name_t::TYPE_MON, p);
8511 }
8512 }
8513 {
8514 Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
8515 uint64_t mask;
8516 uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
8517
8518 if ((p.features_required & mask) != features) {
8519 dout(0) << "crush map has features " << features
8520 << ", adjusting msgr requires for osds" << dendl;
8521 p.features_required = (p.features_required & ~mask) | features;
8522 cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
8523 }
8524
8525 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
8526 dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
8527 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
8528 ObjectStore::Transaction t;
8529 write_superblock(t);
8530 int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
8531 ceph_assert(err == 0);
8532 }
8533 }
8534
8535 if (osdmap->require_osd_release < ceph_release_t::nautilus) {
8536 hb_front_server_messenger->set_require_authorizer(false);
8537 hb_back_server_messenger->set_require_authorizer(false);
8538 } else {
8539 hb_front_server_messenger->set_require_authorizer(true);
8540 hb_back_server_messenger->set_require_authorizer(true);
8541 }
8542
8543 if (osdmap->require_osd_release != last_require_osd_release) {
8544 dout(1) << __func__ << " require_osd_release " << last_require_osd_release
8545 << " -> " << to_string(osdmap->require_osd_release) << dendl;
8546 store->write_meta("require_osd_release",
8547 stringify((int)osdmap->require_osd_release));
8548 last_require_osd_release = osdmap->require_osd_release;
8549 }
8550 }
8551
8552 struct C_FinishSplits : public Context {
8553 OSD *osd;
8554 set<PGRef> pgs;
8555 C_FinishSplits(OSD *osd, const set<PGRef> &in)
8556 : osd(osd), pgs(in) {}
8557 void finish(int r) override {
8558 osd->_finish_splits(pgs);
8559 }
8560 };
8561
8562 void OSD::_finish_splits(set<PGRef>& pgs)
8563 {
8564 dout(10) << __func__ << " " << pgs << dendl;
8565 if (is_stopping())
8566 return;
8567 for (set<PGRef>::iterator i = pgs.begin();
8568 i != pgs.end();
8569 ++i) {
8570 PG *pg = i->get();
8571
8572 PeeringCtx rctx = create_context();
8573 pg->lock();
8574 dout(10) << __func__ << " " << *pg << dendl;
8575 epoch_t e = pg->get_osdmap_epoch();
8576 pg->handle_initialize(rctx);
8577 pg->queue_null(e, e);
8578 dispatch_context(rctx, pg, service.get_osdmap());
8579 pg->unlock();
8580
8581 unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
8582 shards[shard_index]->register_and_wake_split_child(pg);
8583 }
8584 };
8585
8586 bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
8587 unsigned need)
8588 {
8589 std::lock_guard l(merge_lock);
8590 auto& p = merge_waiters[nextmap->get_epoch()][target];
8591 p[src->pg_id] = src;
8592 dout(10) << __func__ << " added merge_waiter " << src->pg_id
8593 << " for " << target << ", have " << p.size() << "/" << need
8594 << dendl;
8595 return p.size() == need;
8596 }
8597
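// Advance a PG through each map epoch up to osd_epoch. Along the way,
// handle pool pg_num changes: a merge source is torn down and parked
// as a merge waiter, a merge target waits for (or consumes) all of
// its sources, and a split queues up the new child PGs. Returns false
// when the PG cannot advance past a pending merge.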
8598 bool OSD::advance_pg(
8599 epoch_t osd_epoch,
8600 PG *pg,
8601 ThreadPool::TPHandle &handle,
8602 PeeringCtx &rctx)
8603 {
8604 if (osd_epoch <= pg->get_osdmap_epoch()) {
8605 return true;
8606 }
8607 ceph_assert(pg->is_locked());
8608 OSDMapRef lastmap = pg->get_osdmap();
8609 set<PGRef> new_pgs; // any split children
8610 bool ret = true;
8611
8612 unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
8613 lastmap->get_pg_num(pg->pg_id.pool()) : 0;
8614 for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
8615 next_epoch <= osd_epoch;
8616 ++next_epoch) {
8617 OSDMapRef nextmap = service.try_get_map(next_epoch);
8618 if (!nextmap) {
8619 dout(20) << __func__ << " missing map " << next_epoch << dendl;
8620 continue;
8621 }
8622
8623 unsigned new_pg_num =
8624 (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
8625 nextmap->get_pg_num(pg->pg_id.pool()) : 0;
8626 if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
8627 // check for merge
8628 if (nextmap->have_pg_pool(pg->pg_id.pool())) {
8629 spg_t parent;
8630 if (pg->pg_id.is_merge_source(
8631 old_pg_num,
8632 new_pg_num,
8633 &parent)) {
8634 // we are merge source
8635 PGRef spg = pg; // carry a ref
8636 dout(1) << __func__ << " " << pg->pg_id
8637 << " is merge source, target is " << parent
8638 << dendl;
8639 pg->write_if_dirty(rctx);
8640 if (!new_pgs.empty()) {
8641 rctx.transaction.register_on_applied(new C_FinishSplits(this,
8642 new_pgs));
8643 new_pgs.clear();
8644 }
8645 dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
8646 pg->ch->flush();
8647 // release backoffs explicitly, since the on_shutdown path
8648 // aggressively tears down backoff state.
8649 if (pg->is_primary()) {
8650 pg->release_pg_backoffs();
8651 }
8652 pg->on_shutdown();
8653 OSDShard *sdata = pg->osd_shard;
8654 {
8655 std::lock_guard l(sdata->shard_lock);
8656 if (pg->pg_slot) {
8657 sdata->_detach_pg(pg->pg_slot);
8658 // update pg count now since we might not get an osdmap
8659 // any time soon.
8660 if (pg->is_primary())
8661 logger->dec(l_osd_pg_primary);
8662 else if (pg->is_nonprimary())
8663 logger->dec(l_osd_pg_replica); // misnomer
8664 else
8665 logger->dec(l_osd_pg_stray);
8666 }
8667 }
8668 pg->unlock();
8669
8670 set<spg_t> children;
8671 parent.is_split(new_pg_num, old_pg_num, &children);
8672 if (add_merge_waiter(nextmap, parent, pg, children.size())) {
8673 enqueue_peering_evt(
8674 parent,
8675 PGPeeringEventRef(
8676 std::make_shared<PGPeeringEvent>(
8677 nextmap->get_epoch(),
8678 nextmap->get_epoch(),
8679 NullEvt())));
8680 }
8681 ret = false;
8682 goto out;
8683 } else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
8684 // we are merge target
8685 set<spg_t> children;
8686 pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
8687 dout(20) << __func__ << " " << pg->pg_id
8688 << " is merge target, sources are " << children
8689 << dendl;
8690 map<spg_t,PGRef> sources;
8691 {
8692 std::lock_guard l(merge_lock);
8693 auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
8694 unsigned need = children.size();
8695 dout(20) << __func__ << " have " << s.size() << "/"
8696 << need << dendl;
8697 if (s.size() == need) {
8698 sources.swap(s);
8699 merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
8700 if (merge_waiters[nextmap->get_epoch()].empty()) {
8701 merge_waiters.erase(nextmap->get_epoch());
8702 }
8703 }
8704 }
8705 if (!sources.empty()) {
8706 unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
8707 unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
8708 dout(1) << __func__ << " merging " << pg->pg_id << dendl;
8709 pg->merge_from(
8710 sources, rctx, split_bits,
8711 nextmap->get_pg_pool(
8712 pg->pg_id.pool())->last_pg_merge_meta);
8713 pg->pg_slot->waiting_for_merge_epoch = 0;
8714 } else {
8715 dout(20) << __func__ << " not ready to merge yet" << dendl;
8716 pg->write_if_dirty(rctx);
8717 if (!new_pgs.empty()) {
8718 rctx.transaction.register_on_applied(new C_FinishSplits(this,
8719 new_pgs));
8720 new_pgs.clear();
8721 }
8722 dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
8723 pg->unlock();
8724 // kick source(s) to get them ready
8725 for (auto& i : children) {
8726 dout(20) << __func__ << " kicking source " << i << dendl;
8727 enqueue_peering_evt(
8728 i,
8729 PGPeeringEventRef(
8730 std::make_shared<PGPeeringEvent>(
8731 nextmap->get_epoch(),
8732 nextmap->get_epoch(),
8733 NullEvt())));
8734 }
8735 ret = false;
8736 goto out;
8737 }
8738 }
8739 }
8740 }
8741
8742 vector<int> newup, newacting;
8743 int up_primary, acting_primary;
8744 nextmap->pg_to_up_acting_osds(
8745 pg->pg_id.pgid,
8746 &newup, &up_primary,
8747 &newacting, &acting_primary);
8748 pg->handle_advance_map(
8749 nextmap, lastmap, newup, up_primary,
8750 newacting, acting_primary, rctx);
8751
8752 auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
8753 auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
8754 if (oldpool != lastmap->get_pools().end()
8755 && newpool != nextmap->get_pools().end()) {
8756 dout(20) << __func__
8757 << " new pool opts " << newpool->second.opts
8758 << " old pool opts " << oldpool->second.opts
8759 << dendl;
8760
8761 double old_min_interval = 0, new_min_interval = 0;
8762 oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
8763 newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);
8764
8765 double old_max_interval = 0, new_max_interval = 0;
8766 oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
8767 newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);
8768
8769 // Assume that if an interval changes from set to unset, or vice versa, the
8770 // actual config is different. Keep it simple even if it is possible to call
8771 // resched_all_scrubs() unnecessarily.
8772 if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
8773 pg->on_info_history_change();
8774 }
8775 }
8776
8777 if (new_pg_num && old_pg_num != new_pg_num) {
8778 // check for split
8779 set<spg_t> children;
8780 if (pg->pg_id.is_split(
8781 old_pg_num,
8782 new_pg_num,
8783 &children)) {
8784 split_pgs(
8785 pg, children, &new_pgs, lastmap, nextmap,
8786 rctx);
8787 }
8788 }
8789
8790 lastmap = nextmap;
8791 old_pg_num = new_pg_num;
8792 handle.reset_tp_timeout();
8793 }
8794 pg->handle_activate_map(rctx);
8795
8796 ret = true;
8797 out:
8798 if (!new_pgs.empty()) {
8799 rctx.transaction.register_on_applied(new C_FinishSplits(this, new_pgs));
8800 }
8801 return ret;
8802 }
8803
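// Publish the new map to the rest of the OSD: prime pending splits
// and merges on each shard, drop pending pg creates that no longer
// map to us, wake sessions waiting on the map, and queue a null
// peering event to every PG so it advances to the new epoch.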
8804 void OSD::consume_map()
8805 {
8806 ceph_assert(ceph_mutex_is_locked(osd_lock));
8807 auto osdmap = get_osdmap();
8808 dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
8809
8810 /** make sure the cluster is speaking in SORTBITWISE, because we don't
8811 * speak the older sorting version any more. Be careful not to force
8812 * a shutdown if we are merely processing old maps, though.
8813 */
8814 if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
8815 derr << __func__ << " SORTBITWISE flag is not set" << dendl;
8816 ceph_abort();
8817 }
8818
8819 service.pre_publish_map(osdmap);
8820 service.await_reserved_maps();
8821 service.publish_map(osdmap);
8822
8823 // prime splits and merges
8824 set<pair<spg_t,epoch_t>> newly_split; // splits, and when
8825 set<pair<spg_t,epoch_t>> merge_pgs; // merge participants, and when
8826 for (auto& shard : shards) {
8827 shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
8828 }
8829 if (!newly_split.empty()) {
8830 for (auto& shard : shards) {
8831 shard->prime_splits(osdmap, &newly_split);
8832 }
8833 ceph_assert(newly_split.empty());
8834 }
8835
8836 // prune sent_ready_to_merge
8837 service.prune_sent_ready_to_merge(osdmap);
8838
8839 // FIXME, maybe: We could race against an incoming peering message
8840 // that instantiates a merge PG after identify_merges() below and
8841 // never set up its peer to complete the merge. An OSD restart
8842 // would clear it up. This is a hard race to resolve,
8843 // extraordinarily rare (we only merge PGs that are stable and
8844 // clean, so it'd have to be an imported PG to an OSD with a
8845 // slightly stale OSDMap...), so I'm ignoring it for now. We plan to
8846 // replace all of this with seastar-based code soon anyway.
8847 if (!merge_pgs.empty()) {
8848 // mark the pgs we already have, or create new and empty merge
8849 // participants for those we are missing. do this all under the
8850 // shard lock so we don't have to worry about racing pg creates
8851 // via _process.
8852 for (auto& shard : shards) {
8853 shard->prime_merges(osdmap, &merge_pgs);
8854 }
8855 ceph_assert(merge_pgs.empty());
8856 }
8857
8858 service.prune_pg_created();
8859
8860 unsigned pushes_to_free = 0;
8861 for (auto& shard : shards) {
8862 shard->consume_map(osdmap, &pushes_to_free);
8863 }
8864
8865 vector<spg_t> pgids;
8866 _get_pgids(&pgids);
8867
8868 // count (FIXME, probably during seastar rewrite)
8869 int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
8870 vector<PGRef> pgs;
8871 _get_pgs(&pgs);
8872 for (auto& pg : pgs) {
8873 // FIXME (probably during seastar rewrite): this is lockless and
8874 // racy, but we don't want to take pg lock here.
8875 if (pg->is_primary())
8876 num_pg_primary++;
8877 else if (pg->is_nonprimary())
8878 num_pg_replica++; // misnomer
8879 else
8880 num_pg_stray++;
8881 }
8882
8883 {
8884 // FIXME (as part of seastar rewrite): move to OSDShard
8885 std::lock_guard l(pending_creates_lock);
8886 for (auto pg = pending_creates_from_osd.begin();
8887 pg != pending_creates_from_osd.end();) {
8888 if (osdmap->get_pg_acting_role(pg->first, whoami) < 0) {
8889 dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
8890 << "discarding pending_create_from_osd" << dendl;
8891 pg = pending_creates_from_osd.erase(pg);
8892 } else {
8893 ++pg;
8894 }
8895 }
8896 }
8897
8898 service.maybe_inject_dispatch_delay();
8899
8900 dispatch_sessions_waiting_on_map();
8901
8902 service.maybe_inject_dispatch_delay();
8903
8904 service.release_reserved_pushes(pushes_to_free);
8905
8906 // queue null events to push maps down to individual PGs
8907 for (auto pgid : pgids) {
8908 enqueue_peering_evt(
8909 pgid,
8910 PGPeeringEventRef(
8911 std::make_shared<PGPeeringEvent>(
8912 osdmap->get_epoch(),
8913 osdmap->get_epoch(),
8914 NullEvt())));
8915 }
8916 logger->set(l_osd_pg, pgids.size());
8917 logger->set(l_osd_pg_primary, num_pg_primary);
8918 logger->set(l_osd_pg_replica, num_pg_replica);
8919 logger->set(l_osd_pg_stray, num_pg_stray);
8920 }
8921
8922 void OSD::activate_map()
8923 {
8924 ceph_assert(ceph_mutex_is_locked(osd_lock));
8925 auto osdmap = get_osdmap();
8926
8927 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8928
8929 // norecover?
8930 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8931 if (!service.recovery_is_paused()) {
8932 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8933 service.pause_recovery();
8934 }
8935 } else {
8936 if (service.recovery_is_paused()) {
8937 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8938 service.unpause_recovery();
8939 }
8940 }
8941
8942 service.activate_map();
8943
8944 // process waiters
8945 take_waiters(waiting_for_osdmap);
8946 }
8947
8948 bool OSD::require_mon_peer(const Message *m)
8949 {
8950 if (!m->get_connection()->peer_is_mon()) {
8951 dout(0) << "require_mon_peer received from non-mon "
8952 << m->get_connection()->get_peer_addr()
8953 << " " << *m << dendl;
8954 return false;
8955 }
8956 return true;
8957 }
8958
8959 bool OSD::require_mon_or_mgr_peer(const Message *m)
8960 {
8961 if (!m->get_connection()->peer_is_mon() &&
8962 !m->get_connection()->peer_is_mgr()) {
8963 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8964 << m->get_connection()->get_peer_addr()
8965 << " " << *m << dendl;
8966 return false;
8967 }
8968 return true;
8969 }
8970
8971 bool OSD::require_osd_peer(const Message *m)
8972 {
8973 if (!m->get_connection()->peer_is_osd()) {
8974 dout(0) << "require_osd_peer received from non-osd "
8975 << m->get_connection()->get_peer_addr()
8976 << " " << *m << dendl;
8977 return false;
8978 }
8979 return true;
8980 }
8981
8982 bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
8983 {
8984 epoch_t up_epoch = service.get_up_epoch();
8985 if (epoch < up_epoch) {
8986 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
8987 return false;
8988 }
8989
8990 if (!is_active()) {
8991 dout(7) << "still in boot state, dropping message " << *m << dendl;
8992 return false;
8993 }
8994
8995 return true;
8996 }
8997
8998 bool OSD::require_same_peer_instance(const Message *m, const OSDMapRef& map,
8999 bool is_fast_dispatch)
9000 {
9001 int from = m->get_source().num();
9002
9003 if (map->is_down(from) ||
9004 (map->get_cluster_addrs(from) != m->get_source_addrs())) {
9005 dout(5) << "from dead osd." << from << ", marking down, "
9006 << " msg was " << m->get_source_inst().addr
9007 << " expected "
9008 << (map->is_up(from) ?
9009 map->get_cluster_addrs(from) : entity_addrvec_t())
9010 << dendl;
9011 ConnectionRef con = m->get_connection();
9012 con->mark_down();
9013 if (auto s = ceph::ref_cast<Session>(con->get_priv()); s) {
9014 if (!is_fast_dispatch)
9015 s->session_dispatch_lock.lock();
9016 clear_session_waiting_on_map(s);
9017 con->set_priv(nullptr); // break ref <-> session cycle, if any
9018 s->con.reset();
9019 if (!is_fast_dispatch)
9020 s->session_dispatch_lock.unlock();
9021 }
9022 return false;
9023 }
9024 return true;
9025 }
9026
9027
9028 /*
9029 * require that we have same (or newer) map, and that
9030 * the source is the pg primary.
9031 */
9032 bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
9033 bool is_fast_dispatch)
9034 {
9035 const Message *m = op->get_req();
9036 const auto osdmap = get_osdmap();
9037 dout(15) << "require_same_or_newer_map " << epoch
9038 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
9039
9040 ceph_assert(ceph_mutex_is_locked(osd_lock));
9041
9042 // do they have a newer map?
9043 if (epoch > osdmap->get_epoch()) {
9044 dout(7) << "waiting for newer map epoch " << epoch
9045 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
9046 wait_for_new_map(op);
9047 return false;
9048 }
9049
9050 if (!require_self_aliveness(op->get_req(), epoch)) {
9051 return false;
9052 }
9053
9054 // ok, our map is same or newer... does the peer still exist?
9055 if (m->get_connection()->get_messenger() == cluster_messenger &&
9056 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
9057 return false;
9058 }
9059
9060 return true;
9061 }
9062
9063
9064
9065
9066
9067 // ----------------------------------------
9068 // pg creation
9069
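// Illustration (hypothetical pool, not from the original source): growing
// a pool from pg_num 8 to 16 makes pg 1.3 split into child 1.b (seed
// 3 + 8); split_pgs() below creates the child PG and divides the
// parent's objects between the two by the new hash bit.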
9070 void OSD::split_pgs(
9071 PG *parent,
9072 const set<spg_t> &childpgids, set<PGRef> *out_pgs,
9073 OSDMapRef curmap,
9074 OSDMapRef nextmap,
9075 PeeringCtx &rctx)
9076 {
9077 unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
9078 parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));
9079
9080 vector<object_stat_sum_t> updated_stats;
9081 parent->start_split_stats(childpgids, &updated_stats);
9082
9083 vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
9084 for (set<spg_t>::const_iterator i = childpgids.begin();
9085 i != childpgids.end();
9086 ++i, ++stat_iter) {
9087 ceph_assert(stat_iter != updated_stats.end());
9088 dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
9089 PG* child = _make_pg(nextmap, *i);
9090 child->lock(true);
9091 out_pgs->insert(child);
9092 child->ch = store->create_new_collection(child->coll);
9093
9094 {
9095 uint32_t shard_index = i->hash_to_shard(shards.size());
9096 ceph_assert(shards[shard_index]);
9097 store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
9098 }
9099
9100 unsigned split_bits = i->get_split_bits(pg_num);
9101 dout(10) << " pg_num is " << pg_num
9102 << ", m_seed " << i->ps()
9103 << ", split_bits is " << split_bits << dendl;
9104 parent->split_colls(
9105 *i,
9106 split_bits,
9107 i->ps(),
9108 &child->get_pool().info,
9109 rctx.transaction);
9110 parent->split_into(
9111 i->pgid,
9112 child,
9113 split_bits);
9114
9115 child->init_collection_pool_opts();
9116
9117 child->finish_split_stats(*stat_iter, rctx.transaction);
9118 child->unlock();
9119 }
9120 ceph_assert(stat_iter != updated_stats.end());
9121 parent->finish_split_stats(*stat_iter, rctx.transaction);
9122 }
9123
9124 /*
9125 * holding osd_lock
9126 */
9127 void OSD::handle_pg_create(OpRequestRef op)
9128 {
9129 // NOTE: this can be removed in P release (mimic is the last version to
9130 // send MOSDPGCreate messages).
9131
9132 auto m = op->get_req<MOSDPGCreate>();
9133 ceph_assert(m->get_type() == MSG_OSD_PG_CREATE);
9134
9135 dout(10) << "handle_pg_create " << *m << dendl;
9136
9137 if (!require_mon_peer(op->get_req())) {
9138 return;
9139 }
9140
9141 if (!require_same_or_newer_map(op, m->epoch, false))
9142 return;
9143
9144 op->mark_started();
9145
9146 const auto osdmap = get_osdmap();
9147 map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
9148 for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
9149 p != m->mkpg.end();
9150 ++p, ++ci) {
9151 ceph_assert(ci != m->ctimes.end() && ci->first == p->first);
9152 epoch_t created = p->second.created;
9153 if (p->second.split_bits) // Skip split pgs
9154 continue;
9155 pg_t on = p->first;
9156
9157 if (!osdmap->have_pg_pool(on.pool())) {
9158 dout(20) << "ignoring pg on deleted pool " << on << dendl;
9159 continue;
9160 }
9161
9162 dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;
9163
9164 spg_t pgid;
9165 bool mapped = osdmap->get_primary_shard(on, &pgid);
9166 ceph_assert(mapped);
9167
9168 // is it still ours?
9169 vector<int> up, acting;
9170 int up_primary = -1;
9171 int acting_primary = -1;
9172 osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
9173 int role = osdmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
9174
9175 if (acting_primary != whoami) {
9176 dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
9177 << "), my role=" << role << ", skipping" << dendl;
9178 continue;
9179 }
9180
9181
9182 PastIntervals pi;
9183 pg_history_t history;
9184 build_initial_pg_history(pgid, created, ci->second, &history, &pi);
9185
9186 // The mon won't resend unless the primary changed, so we ignore
9187 // same_interval_since. We'll pass this history with the current
9188 // epoch as the event.
9189 if (history.same_primary_since > m->epoch) {
9190 dout(10) << __func__ << ": got obsolete pg create on pgid "
9191 << pgid << " from epoch " << m->epoch
9192 << ", primary changed in " << history.same_primary_since
9193 << dendl;
9194 continue;
9195 }
9196 enqueue_peering_evt(
9197 pgid,
9198 PGPeeringEventRef(
9199 std::make_shared<PGPeeringEvent>(
9200 osdmap->get_epoch(),
9201 osdmap->get_epoch(),
9202 NullEvt(),
9203 true,
9204 new PGCreateInfo(
9205 pgid,
9206 osdmap->get_epoch(),
9207 history,
9208 pi,
9209 true)
9210 )));
9211 }
9212
9213 {
9214 std::lock_guard l(pending_creates_lock);
9215 if (pending_creates_from_mon == 0) {
9216 last_pg_create_epoch = m->epoch;
9217 }
9218 }
9219
9220 maybe_update_heartbeat_peers();
9221 }
9222
9223
9224 // ----------------------------------------
9225 // peering and recovery
9226
9227 PeeringCtx OSD::create_context()
9228 {
9229 return PeeringCtx(get_osdmap()->require_osd_release);
9230 }
9231
9232 void OSD::dispatch_context(PeeringCtx &ctx, PG *pg, OSDMapRef curmap,
9233 ThreadPool::TPHandle *handle)
9234 {
9235 if (!service.get_osdmap()->is_up(whoami)) {
9236 dout(20) << __func__ << " not up in osdmap" << dendl;
9237 } else if (!is_active()) {
9238 dout(20) << __func__ << " not active" << dendl;
9239 } else {
9240 for (auto& [osd, ls] : ctx.message_map) {
9241 if (!curmap->is_up(osd)) {
9242 dout(20) << __func__ << " skipping down osd." << osd << dendl;
9243 continue;
9244 }
9245 ConnectionRef con = service.get_con_osd_cluster(
9246 osd, curmap->get_epoch());
9247 if (!con) {
9248 dout(20) << __func__ << " skipping osd." << osd << " (NULL con)"
9249 << dendl;
9250 continue;
9251 }
9252 service.maybe_share_map(con.get(), curmap);
9253 for (auto m : ls) {
9254 con->send_message2(m);
9255 }
9256 ls.clear();
9257 }
9258 }
9259 if ((!ctx.transaction.empty() || ctx.transaction.has_contexts()) && pg) {
9260 int tr = store->queue_transaction(
9261 pg->ch,
9262 std::move(ctx.transaction), TrackedOpRef(),
9263 handle);
9264 ceph_assert(tr == 0);
9265 }
9266 }
9267
9268 void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
9269 {
9270 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9271 if (!require_mon_peer(m)) {
9272 m->put();
9273 return;
9274 }
9275 for (auto& p : m->pgs) {
9276 spg_t pgid = p.first;
9277 epoch_t created = p.second.first;
9278 utime_t created_stamp = p.second.second;
9279 auto q = m->pg_extra.find(pgid);
9280 if (q == m->pg_extra.end()) {
9281 dout(20) << __func__ << " " << pgid << " e" << created
9282 << "@" << created_stamp
9283 << " (no history or past_intervals)" << dendl;
9284 // pre-octopus ... no pg history. this can be removed in Q release.
9285 enqueue_peering_evt(
9286 pgid,
9287 PGPeeringEventRef(
9288 std::make_shared<PGPeeringEvent>(
9289 m->epoch,
9290 m->epoch,
9291 NullEvt(),
9292 true,
9293 new PGCreateInfo(
9294 pgid,
9295 created,
9296 pg_history_t(created, created_stamp),
9297 PastIntervals(),
9298 true)
9299 )));
9300 } else {
9301 dout(20) << __func__ << " " << pgid << " e" << created
9302 << "@" << created_stamp
9303 << " history " << q->second.first
9304 << " pi " << q->second.second << dendl;
9305 if (!q->second.second.empty() &&
9306 m->epoch < q->second.second.get_bounds().second) {
9307 clog->error() << "got pg_create on " << pgid << " epoch " << m->epoch
9308 << " and unmatched past_intervals " << q->second.second
9309 << " (history " << q->second.first << ")";
9310 } else {
9311 enqueue_peering_evt(
9312 pgid,
9313 PGPeeringEventRef(
9314 std::make_shared<PGPeeringEvent>(
9315 m->epoch,
9316 m->epoch,
9317 NullEvt(),
9318 true,
9319 new PGCreateInfo(
9320 pgid,
9321 m->epoch,
9322 q->second.first,
9323 q->second.second,
9324 true)
9325 )));
9326 }
9327 }
9328 }
9329
9330 {
9331 std::lock_guard l(pending_creates_lock);
9332 if (pending_creates_from_mon == 0) {
9333 last_pg_create_epoch = m->epoch;
9334 }
9335 }
9336
9337 m->put();
9338 }
9339
9340 void OSD::handle_fast_pg_query(MOSDPGQuery *m)
9341 {
9342 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9343 if (!require_osd_peer(m)) {
9344 m->put();
9345 return;
9346 }
9347 int from = m->get_source().num();
9348 for (auto& p : m->pg_list) {
9349 enqueue_peering_evt(
9350 p.first,
9351 PGPeeringEventRef(
9352 std::make_shared<PGPeeringEvent>(
9353 p.second.epoch_sent, p.second.epoch_sent,
9354 MQuery(
9355 p.first,
9356 pg_shard_t(from, p.second.from),
9357 p.second,
9358 p.second.epoch_sent),
9359 false))
9360 );
9361 }
9362 m->put();
9363 }
9364
9365 void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
9366 {
9367 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9368 if (!require_osd_peer(m)) {
9369 m->put();
9370 return;
9371 }
9372 int from = m->get_source().num();
9373 for (auto& p : m->get_pg_list()) {
9374 spg_t pgid(p.info.pgid.pgid, p.to);
9375 enqueue_peering_evt(
9376 pgid,
9377 PGPeeringEventRef(
9378 std::make_shared<PGPeeringEvent>(
9379 p.epoch_sent,
9380 p.query_epoch,
9381 MNotifyRec(
9382 pgid, pg_shard_t(from, p.from),
9383 p,
9384 m->get_connection()->get_features()),
9385 true,
9386 new PGCreateInfo(
9387 pgid,
9388 p.query_epoch,
9389 p.info.history,
9390 p.past_intervals,
9391 false)
9392 )));
9393 }
9394 m->put();
9395 }
9396
9397 void OSD::handle_fast_pg_info(MOSDPGInfo* m)
9398 {
9399 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9400 if (!require_osd_peer(m)) {
9401 m->put();
9402 return;
9403 }
9404 int from = m->get_source().num();
9405 for (auto& p : m->pg_list) {
9406 enqueue_peering_evt(
9407 spg_t(p.info.pgid.pgid, p.to),
9408 PGPeeringEventRef(
9409 std::make_shared<PGPeeringEvent>(
9410 p.epoch_sent, p.query_epoch,
9411 MInfoRec(
9412 pg_shard_t(from, p.from),
9413 p.info,
9414 p.epoch_sent)))
9415 );
9416 }
9417 m->put();
9418 }
9419
9420 void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
9421 {
9422 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9423 if (!require_osd_peer(m)) {
9424 m->put();
9425 return;
9426 }
9427 for (auto& pgid : m->pg_list) {
9428 enqueue_peering_evt(
9429 pgid,
9430 PGPeeringEventRef(
9431 std::make_shared<PGPeeringEvent>(
9432 m->get_epoch(), m->get_epoch(),
9433 PeeringState::DeleteStart())));
9434 }
9435 m->put();
9436 }
9437
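// Illustration (example commands, not from the original source): forced
// recovery/backfill typically originates from
//   ceph osd pool force-recovery <pool>
//   ceph osd pool cancel-force-backfill <pool>
// which reach the OSD as MOSDForceRecovery with the OFR_* flags below.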
9438 void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
9439 {
9440 dout(10) << __func__ << " " << *m << dendl;
9441 if (!require_mon_or_mgr_peer(m)) {
9442 m->put();
9443 return;
9444 }
9445 epoch_t epoch = get_osdmap_epoch();
9446 for (auto pgid : m->forced_pgs) {
9447 if (m->options & OFR_BACKFILL) {
9448 if (m->options & OFR_CANCEL) {
9449 enqueue_peering_evt(
9450 pgid,
9451 PGPeeringEventRef(
9452 std::make_shared<PGPeeringEvent>(
9453 epoch, epoch,
9454 PeeringState::UnsetForceBackfill())));
9455 } else {
9456 enqueue_peering_evt(
9457 pgid,
9458 PGPeeringEventRef(
9459 std::make_shared<PGPeeringEvent>(
9460 epoch, epoch,
9461 PeeringState::SetForceBackfill())));
9462 }
9463 } else if (m->options & OFR_RECOVERY) {
9464 if (m->options & OFR_CANCEL) {
9465 enqueue_peering_evt(
9466 pgid,
9467 PGPeeringEventRef(
9468 std::make_shared<PGPeeringEvent>(
9469 epoch, epoch,
9470 PeeringState::UnsetForceRecovery())));
9471 } else {
9472 enqueue_peering_evt(
9473 pgid,
9474 PGPeeringEventRef(
9475 std::make_shared<PGPeeringEvent>(
9476 epoch, epoch,
9477 PeeringState::SetForceRecovery())));
9478 }
9479 }
9480 }
9481 m->put();
9482 }
9483
9484 void OSD::handle_pg_query_nopg(const MQuery& q)
9485 {
9486 spg_t pgid = q.pgid;
9487 dout(10) << __func__ << " " << pgid << dendl;
9488
9489 OSDMapRef osdmap = get_osdmap();
9490 if (!osdmap->have_pg_pool(pgid.pool()))
9491 return;
9492
9493 dout(10) << " pg " << pgid << " dne" << dendl;
9494 pg_info_t empty(spg_t(pgid.pgid, q.query.to));
9495 ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
9496 if (con) {
9497 Message *m;
9498 if (q.query.type == pg_query_t::LOG ||
9499 q.query.type == pg_query_t::FULLLOG) {
9500 m = new MOSDPGLog(
9501 q.query.from, q.query.to,
9502 osdmap->get_epoch(), empty,
9503 q.query.epoch_sent);
9504 } else {
9505 vector<pg_notify_t> ls;
9506 ls.push_back(
9507 pg_notify_t(
9508 q.query.from, q.query.to,
9509 q.query.epoch_sent,
9510 osdmap->get_epoch(),
9511 empty,
9512 PastIntervals()));
9513 m = new MOSDPGNotify(osdmap->get_epoch(), std::move(ls));
9514 }
9515 service.maybe_share_map(con.get(), osdmap);
9516 con->send_message(m);
9517 }
9518 }
9519
9520 void OSDService::queue_check_readable(spg_t spgid,
9521 epoch_t lpr,
9522 ceph::signedspan delay)
9523 {
9524 if (delay == ceph::signedspan::zero()) {
9525 osd->enqueue_peering_evt(
9526 spgid,
9527 PGPeeringEventRef(
9528 std::make_shared<PGPeeringEvent>(
9529 lpr, lpr,
9530 PeeringState::CheckReadable())));
9531 } else {
9532 mono_timer.add_event(
9533 delay,
9534 [this, spgid, lpr]() {
9535 queue_check_readable(spgid, lpr);
9536 });
9537 }
9538 }
9539
9540
9541 // =========================================================
9542 // RECOVERY
9543
9544 void OSDService::_maybe_queue_recovery() {
9545 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
9546 uint64_t available_pushes;
9547 while (!awaiting_throttle.empty() &&
9548 _recover_now(&available_pushes)) {
9549 uint64_t to_start = std::min(
9550 available_pushes,
9551 cct->_conf->osd_recovery_max_single_start);
9552 _queue_for_recovery(awaiting_throttle.front(), to_start);
9553 awaiting_throttle.pop_front();
9554 dout(10) << __func__ << " starting " << to_start
9555 << ", recovery_ops_reserved " << recovery_ops_reserved
9556 << " -> " << (recovery_ops_reserved + to_start) << dendl;
9557 recovery_ops_reserved += to_start;
9558 }
9559 }
9560
9561 bool OSDService::_recover_now(uint64_t *available_pushes)
9562 {
9563 if (available_pushes)
9564 *available_pushes = 0;
9565
9566 if (ceph_clock_now() < defer_recovery_until) {
9567 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9568 return false;
9569 }
9570
9571 if (recovery_paused) {
9572 dout(15) << __func__ << " paused" << dendl;
9573 return false;
9574 }
9575
9576 uint64_t max = osd->get_recovery_max_active();
9577 if (max <= recovery_ops_active + recovery_ops_reserved) {
9578 dout(15) << __func__ << " active " << recovery_ops_active
9579 << " + reserved " << recovery_ops_reserved
9580 << " >= max " << max << dendl;
9581 return false;
9582 }
9583
9584 if (available_pushes)
9585 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
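// e.g. (illustrative numbers): max = 3 with one active and one reserved
// op leaves *available_pushes = 1, so one more recovery op may start.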
9586
9587 return true;
9588 }
9589
9590 unsigned OSDService::get_target_pg_log_entries() const
9591 {
9592 auto num_pgs = osd->get_num_pgs();
9593 auto target = cct->_conf->osd_target_pg_log_entries_per_osd;
9594 if (num_pgs > 0 && target > 0) {
9595 // target an even spread of our budgeted log entries across all
9596 // PGs. note that while we only get to control the entry count
9597 // for primary PGs, we'll normally be responsible for a mix of
9598 // primary and replica PGs (for the same pool(s) even), so this
9599 // will work out.
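// Worked example (hypothetical values): with
// osd_target_pg_log_entries_per_osd = 300000 and 1000 PGs on this OSD,
// the per-PG budget is 300000 / 1000 = 300 entries, which is then
// clamped into [osd_min_pg_log_entries, osd_max_pg_log_entries].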
9600 return std::max<unsigned>(
9601 std::min<unsigned>(target / num_pgs,
9602 cct->_conf->osd_max_pg_log_entries),
9603 cct->_conf->osd_min_pg_log_entries);
9604 } else {
9605 // fall back to a per-pg value.
9606 return cct->_conf->osd_min_pg_log_entries;
9607 }
9608 }
9609
9610 void OSD::do_recovery(
9611 PG *pg, epoch_t queued, uint64_t reserved_pushes,
9612 ThreadPool::TPHandle &handle)
9613 {
9614 uint64_t started = 0;
9615
9616 /*
9617 * When the value of osd_recovery_sleep is set greater than zero, recovery
9618 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9619 * recovery event's schedule time. This is done by adding a
9620 * recovery_requeue_callback event, which re-queues the recovery op using
9621 * queue_recovery_after_sleep.
9622 */
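// For example (hypothetical timing): with osd_recovery_sleep = 0.1, each
// recovery op is re-queued ~100ms after the previous op's schedule time,
// spacing recovery events at least 100ms apart.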
9623 float recovery_sleep = get_osd_recovery_sleep();
9624 {
9625 std::lock_guard l(service.sleep_lock);
9626 if (recovery_sleep > 0 && service.recovery_needs_sleep) {
9627 PGRef pgref(pg);
9628 auto recovery_requeue_callback = new LambdaContext([this, pgref, queued, reserved_pushes](int r) {
9629 dout(20) << "do_recovery wake up at "
9630 << ceph_clock_now()
9631 << ", re-queuing recovery" << dendl;
9632 std::lock_guard l(service.sleep_lock);
9633 service.recovery_needs_sleep = false;
9634 service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
9635 });
9636
9637 // This is true for the first recovery op, and when the previous recovery
9638 // op was scheduled in the past. In that case the next recovery op is
9639 // scheduled one sleep interval from now.
9640
9641 if (auto now = ceph::real_clock::now();
9642 service.recovery_schedule_time < now) {
9643 service.recovery_schedule_time = now;
9644 }
9645 service.recovery_schedule_time += ceph::make_timespan(recovery_sleep);
9646 service.sleep_timer.add_event_at(service.recovery_schedule_time,
9647 recovery_requeue_callback);
9648 dout(20) << "Recovery event scheduled at "
9649 << service.recovery_schedule_time << dendl;
9650 return;
9651 }
9652 }
9653
9654 {
9655 {
9656 std::lock_guard l(service.sleep_lock);
9657 service.recovery_needs_sleep = true;
9658 }
9659
9660 if (pg->pg_has_reset_since(queued)) {
9661 goto out;
9662 }
9663
9664 dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
9665 #ifdef DEBUG_RECOVERY_OIDS
9666 dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
9667 #endif
9668
9669 bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
9670 dout(10) << "do_recovery started " << started << "/" << reserved_pushes
9671 << " on " << *pg << dendl;
9672
9673 if (do_unfound) {
9674 PeeringCtx rctx = create_context();
9675 rctx.handle = &handle;
9676 pg->find_unfound(queued, rctx);
9677 dispatch_context(rctx, pg, pg->get_osdmap());
9678 }
9679 }
9680
9681 out:
9682 ceph_assert(started <= reserved_pushes);
9683 service.release_reserved_pushes(reserved_pushes);
9684 }
9685
9686 void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
9687 {
9688 std::lock_guard l(recovery_lock);
9689 dout(10) << "start_recovery_op " << *pg << " " << soid
9690 << " (" << recovery_ops_active << "/"
9691 << osd->get_recovery_max_active() << " rops)"
9692 << dendl;
9693 recovery_ops_active++;
9694
9695 #ifdef DEBUG_RECOVERY_OIDS
9696 dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
9697 ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
9698 recovery_oids[pg->pg_id].insert(soid);
9699 #endif
9700 }
9701
9702 void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
9703 {
9704 std::lock_guard l(recovery_lock);
9705 dout(10) << "finish_recovery_op " << *pg << " " << soid
9706 << " dequeue=" << dequeue
9707 << " (" << recovery_ops_active << "/"
9708 << osd->get_recovery_max_active() << " rops)"
9709 << dendl;
9710
9711 // adjust count
9712 ceph_assert(recovery_ops_active > 0);
9713 recovery_ops_active--;
9714
9715 #ifdef DEBUG_RECOVERY_OIDS
9716 dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
9717 ceph_assert(recovery_oids[pg->pg_id].count(soid));
9718 recovery_oids[pg->pg_id].erase(soid);
9719 #endif
9720
9721 _maybe_queue_recovery();
9722 }
9723
9724 bool OSDService::is_recovery_active()
9725 {
9726 if (cct->_conf->osd_debug_pretend_recovery_active) {
9727 return true;
9728 }
9729 return local_reserver.has_reservation() || remote_reserver.has_reservation();
9730 }
9731
9732 void OSDService::release_reserved_pushes(uint64_t pushes)
9733 {
9734 std::lock_guard l(recovery_lock);
9735 dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
9736 << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
9737 << dendl;
9738 ceph_assert(recovery_ops_reserved >= pushes);
9739 recovery_ops_reserved -= pushes;
9740 _maybe_queue_recovery();
9741 }
9742
9743 // =========================================================
9744 // OPS
9745
9746 bool OSD::op_is_discardable(const MOSDOp *op)
9747 {
9748 // drop the client request if the client is no longer connected and
9749 // thus can't receive the reply anyway.
9750 if (!op->get_connection()->is_connected()) {
9751 return true;
9752 }
9753 return false;
9754 }
9755
9756 void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
9757 {
9758 const utime_t stamp = op->get_req()->get_recv_stamp();
9759 const utime_t latency = ceph_clock_now() - stamp;
9760 const unsigned priority = op->get_req()->get_priority();
9761 const int cost = op->get_req()->get_cost();
9762 const uint64_t owner = op->get_req()->get_source().num();
9763 const int type = op->get_req()->get_type();
9764
9765 dout(15) << "enqueue_op " << op << " prio " << priority
9766 << " type " << type
9767 << " cost " << cost
9768 << " latency " << latency
9769 << " epoch " << epoch
9770 << " " << *(op->get_req()) << dendl;
9771 op->osd_trace.event("enqueue op");
9772 op->osd_trace.keyval("priority", priority);
9773 op->osd_trace.keyval("cost", cost);
9774 #ifdef HAVE_JAEGER
9775 if (op->osd_parent_span) {
9776 auto enqueue_span = jaeger_tracing::child_span(__func__, op->osd_parent_span);
9777 enqueue_span->Log({
9778 {"priority", priority},
9779 {"cost", cost},
9780 {"epoch", epoch},
9781 {"owner", owner},
9782 {"type", type}
9783 });
9784 }
9785 #endif
9786 op->mark_queued_for_pg();
9787 logger->tinc(l_osd_op_before_queue_op_lat, latency);
9788 if (type == MSG_OSD_PG_PUSH ||
9789 type == MSG_OSD_PG_PUSH_REPLY) {
9790 op_shardedwq.queue(
9791 OpSchedulerItem(
9792 unique_ptr<OpSchedulerItem::OpQueueable>(new PGRecoveryMsg(pg, std::move(op))),
9793 cost, priority, stamp, owner, epoch));
9794 } else {
9795 op_shardedwq.queue(
9796 OpSchedulerItem(
9797 unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
9798 cost, priority, stamp, owner, epoch));
9799 }
9800 }
9801
9802 void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
9803 {
9804 dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
9805 op_shardedwq.queue(
9806 OpSchedulerItem(
9807 unique_ptr<OpSchedulerItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
9808 10,
9809 cct->_conf->osd_peering_op_priority,
9810 utime_t(),
9811 0,
9812 evt->get_epoch_sent()));
9813 }
9814
9815 /*
9816 * NOTE: dequeue called in worker thread, with pg lock
9817 */
9818 void OSD::dequeue_op(
9819 PGRef pg, OpRequestRef op,
9820 ThreadPool::TPHandle &handle)
9821 {
9822 const Message *m = op->get_req();
9823
9824 FUNCTRACE(cct);
9825 OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_BEGIN", false);
9826
9827 utime_t now = ceph_clock_now();
9828 op->set_dequeued_time(now);
9829
9830 utime_t latency = now - m->get_recv_stamp();
9831 dout(10) << "dequeue_op " << op << " prio " << m->get_priority()
9832 << " cost " << m->get_cost()
9833 << " latency " << latency
9834 << " " << *m
9835 << " pg " << *pg << dendl;
9836
9837 logger->tinc(l_osd_op_before_dequeue_op_lat, latency);
9838
9839 service.maybe_share_map(m->get_connection().get(),
9840 pg->get_osdmap(),
9841 op->sent_epoch);
9842
9843 if (pg->is_deleting())
9844 return;
9845
9846 op->mark_reached_pg();
9847 op->osd_trace.event("dequeue_op");
9848
9849 pg->do_request(op, handle);
9850
9851 // finish
9852 dout(10) << "dequeue_op " << op << " finish" << dendl;
9853 OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_END", false);
9854 }
9855
9856
9857 void OSD::dequeue_peering_evt(
9858 OSDShard *sdata,
9859 PG *pg,
9860 PGPeeringEventRef evt,
9861 ThreadPool::TPHandle& handle)
9862 {
9863 PeeringCtx rctx = create_context();
9864 auto curmap = sdata->get_osdmap();
9865 bool need_up_thru = false;
9866 epoch_t same_interval_since = 0;
9867 if (!pg) {
9868 if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
9869 handle_pg_query_nopg(*q);
9870 } else {
9871 derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
9872 ceph_abort();
9873 }
9874 } else if (advance_pg(curmap->get_epoch(), pg, handle, rctx)) {
9875 pg->do_peering_event(evt, rctx);
9876 if (pg->is_deleted()) {
9877 pg->unlock();
9878 return;
9879 }
9880 dispatch_context(rctx, pg, curmap, &handle);
9881 need_up_thru = pg->get_need_up_thru();
9882 same_interval_since = pg->get_same_interval_since();
9883 pg->unlock();
9884 }
9885
9886 if (need_up_thru) {
9887 queue_want_up_thru(same_interval_since);
9888 }
9889
9890 service.send_pg_temp();
9891 }
9892
9893 void OSD::dequeue_delete(
9894 OSDShard *sdata,
9895 PG *pg,
9896 epoch_t e,
9897 ThreadPool::TPHandle& handle)
9898 {
9899 dequeue_peering_evt(
9900 sdata,
9901 pg,
9902 PGPeeringEventRef(
9903 std::make_shared<PGPeeringEvent>(
9904 e, e,
9905 PeeringState::DeleteSome())),
9906 handle);
9907 }
9908
9909
9910
9911 // --------------------------------
9912
9913 const char** OSD::get_tracked_conf_keys() const
9914 {
9915 static const char* KEYS[] = {
9916 "osd_max_backfills",
9917 "osd_min_recovery_priority",
9918 "osd_max_trimming_pgs",
9919 "osd_op_complaint_time",
9920 "osd_op_log_threshold",
9921 "osd_op_history_size",
9922 "osd_op_history_duration",
9923 "osd_op_history_slow_op_size",
9924 "osd_op_history_slow_op_threshold",
9925 "osd_enable_op_tracker",
9926 "osd_map_cache_size",
9927 "osd_pg_epoch_max_lag_factor",
9928 "osd_pg_epoch_persisted_max_stale",
9929 "osd_recovery_sleep",
9930 "osd_recovery_sleep_hdd",
9931 "osd_recovery_sleep_ssd",
9932 "osd_recovery_sleep_hybrid",
9933 "osd_delete_sleep",
9934 "osd_delete_sleep_hdd",
9935 "osd_delete_sleep_ssd",
9936 "osd_delete_sleep_hybrid",
9937 "osd_snap_trim_sleep",
9938 "osd_snap_trim_sleep_hdd",
9939 "osd_snap_trim_sleep_ssd",
9940 "osd_snap_trim_sleep_hybrid"
9941 "osd_scrub_sleep",
9942 "osd_recovery_max_active",
9943 "osd_recovery_max_active_hdd",
9944 "osd_recovery_max_active_ssd",
9945 // clog & admin clog
9946 "clog_to_monitors",
9947 "clog_to_syslog",
9948 "clog_to_syslog_facility",
9949 "clog_to_syslog_level",
9950 "osd_objectstore_fuse",
9951 "clog_to_graylog",
9952 "clog_to_graylog_host",
9953 "clog_to_graylog_port",
9954 "host",
9955 "fsid",
9956 "osd_recovery_delay_start",
9957 "osd_client_message_size_cap",
9958 "osd_client_message_cap",
9959 "osd_heartbeat_min_size",
9960 "osd_heartbeat_interval",
9961 "osd_object_clean_region_max_num_intervals",
9962 "osd_scrub_min_interval",
9963 "osd_scrub_max_interval",
9964 NULL
9965 };
9966 return KEYS;
9967 }
9968
9969 void OSD::handle_conf_change(const ConfigProxy& conf,
9970 const std::set <std::string> &changed)
9971 {
9972 std::lock_guard l{osd_lock};
9973
9974 if (changed.count("osd_max_backfills") ||
9975 changed.count("osd_delete_sleep") ||
9976 changed.count("osd_delete_sleep_hdd") ||
9977 changed.count("osd_delete_sleep_ssd") ||
9978 changed.count("osd_delete_sleep_hybrid") ||
9979 changed.count("osd_snap_trim_sleep") ||
9980 changed.count("osd_snap_trim_sleep_hdd") ||
9981 changed.count("osd_snap_trim_sleep_ssd") ||
9982 changed.count("osd_snap_trim_sleep_hybrid") ||
9983 changed.count("osd_scrub_sleep") ||
9984 changed.count("osd_recovery_sleep") ||
9985 changed.count("osd_recovery_sleep_hdd") ||
9986 changed.count("osd_recovery_sleep_ssd") ||
9987 changed.count("osd_recovery_sleep_hybrid") ||
9988 changed.count("osd_recovery_max_active") ||
9989 changed.count("osd_recovery_max_active_hdd") ||
9990 changed.count("osd_recovery_max_active_ssd")) {
9991 if (!maybe_override_options_for_qos() &&
9992 changed.count("osd_max_backfills")) {
9993 // Scheduler is not "mclock". Fallback to earlier behavior
9994 service.local_reserver.set_max(cct->_conf->osd_max_backfills);
9995 service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
9996 }
9997 }
9998 if (changed.count("osd_min_recovery_priority")) {
9999 service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
10000 service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
10001 }
10002 if (changed.count("osd_max_trimming_pgs")) {
10003 service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
10004 }
10005 if (changed.count("osd_op_complaint_time") ||
10006 changed.count("osd_op_log_threshold")) {
10007 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
10008 cct->_conf->osd_op_log_threshold);
10009 }
10010 if (changed.count("osd_op_history_size") ||
10011 changed.count("osd_op_history_duration")) {
10012 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
10013 cct->_conf->osd_op_history_duration);
10014 }
10015 if (changed.count("osd_op_history_slow_op_size") ||
10016 changed.count("osd_op_history_slow_op_threshold")) {
10017 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
10018 cct->_conf->osd_op_history_slow_op_threshold);
10019 }
10020 if (changed.count("osd_enable_op_tracker")) {
10021 op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
10022 }
10023 if (changed.count("osd_map_cache_size")) {
10024 service.map_cache.set_size(cct->_conf->osd_map_cache_size);
10025 service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
10026 service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
10027 }
10028 if (changed.count("clog_to_monitors") ||
10029 changed.count("clog_to_syslog") ||
10030 changed.count("clog_to_syslog_level") ||
10031 changed.count("clog_to_syslog_facility") ||
10032 changed.count("clog_to_graylog") ||
10033 changed.count("clog_to_graylog_host") ||
10034 changed.count("clog_to_graylog_port") ||
10035 changed.count("host") ||
10036 changed.count("fsid")) {
10037 update_log_config();
10038 }
10039 if (changed.count("osd_pg_epoch_max_lag_factor")) {
10040 m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
10041 "osd_pg_epoch_max_lag_factor");
10042 }
10043
10044 #ifdef HAVE_LIBFUSE
10045 if (changed.count("osd_objectstore_fuse")) {
10046 if (store) {
10047 enable_disable_fuse(false);
10048 }
10049 }
10050 #endif
10051
10052 if (changed.count("osd_recovery_delay_start")) {
10053 service.defer_recovery(cct->_conf->osd_recovery_delay_start);
10054 service.kick_recovery_queue();
10055 }
10056
10057 if (changed.count("osd_client_message_cap")) {
10058 uint64_t newval = cct->_conf->osd_client_message_cap;
10059 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
10060 if (pol.throttler_messages && newval > 0) {
10061 pol.throttler_messages->reset_max(newval);
10062 }
10063 }
10064 if (changed.count("osd_client_message_size_cap")) {
10065 uint64_t newval = cct->_conf->osd_client_message_size_cap;
10066 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
10067 if (pol.throttler_bytes && newval > 0) {
10068 pol.throttler_bytes->reset_max(newval);
10069 }
10070 }
10071 if (changed.count("osd_object_clean_region_max_num_intervals")) {
10072 ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
10073 }
10074
10075 if (changed.count("osd_scrub_min_interval") ||
10076 changed.count("osd_scrub_max_interval")) {
10077 resched_all_scrubs();
10078 dout(0) << __func__ << ": scrub interval change" << dendl;
10079 }
10080 check_config();
10081 if (changed.count("osd_asio_thread_count")) {
10082 service.poolctx.stop();
10083 service.poolctx.start(conf.get_val<std::uint64_t>("osd_asio_thread_count"));
10084 }
10085 }
10086
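// Illustration (example command, not from the original source): the
// mclock scheduler is selected via the osd_op_queue option, e.g.
//   ceph config set osd osd_op_queue mclock_scheduler
// after which the overrides below hand recovery/backfill throttling
// over to mclock's QoS scheduling.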
10087 bool OSD::maybe_override_options_for_qos()
10088 {
10089 // If the scheduler enabled is mclock, override the recovery, backfill
10090 // and sleep options so that mclock can meet the QoS goals.
10091 if (cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler") {
10092 dout(1) << __func__
10093 << ": Changing recovery/backfill/sleep settings for QoS" << dendl;
10094
10095 // Set high value for recovery max active
10096 uint32_t rec_max_active = 1000;
10097 cct->_conf.set_val(
10098 "osd_recovery_max_active", std::to_string(rec_max_active));
10099 cct->_conf.set_val(
10100 "osd_recovery_max_active_hdd", std::to_string(rec_max_active));
10101 cct->_conf.set_val(
10102 "osd_recovery_max_active_ssd", std::to_string(rec_max_active));
10103
10104 // Set high value for osd_max_backfill
10105 uint32_t max_backfills = 1000;
10106 cct->_conf.set_val("osd_max_backfills", std::to_string(max_backfills));
10107 service.local_reserver.set_max(max_backfills);
10108 service.remote_reserver.set_max(max_backfills);
10109
10110 // Disable recovery sleep
10111 cct->_conf.set_val("osd_recovery_sleep", std::to_string(0));
10112 cct->_conf.set_val("osd_recovery_sleep_hdd", std::to_string(0));
10113 cct->_conf.set_val("osd_recovery_sleep_ssd", std::to_string(0));
10114 cct->_conf.set_val("osd_recovery_sleep_hybrid", std::to_string(0));
10115
10116 // Disable delete sleep
10117 cct->_conf.set_val("osd_delete_sleep", std::to_string(0));
10118 cct->_conf.set_val("osd_delete_sleep_hdd", std::to_string(0));
10119 cct->_conf.set_val("osd_delete_sleep_ssd", std::to_string(0));
10120 cct->_conf.set_val("osd_delete_sleep_hybrid", std::to_string(0));
10121
10122 // Disable snap trim sleep
10123 cct->_conf.set_val("osd_snap_trim_sleep", std::to_string(0));
10124 cct->_conf.set_val("osd_snap_trim_sleep_hdd", std::to_string(0));
10125 cct->_conf.set_val("osd_snap_trim_sleep_ssd", std::to_string(0));
10126 cct->_conf.set_val("osd_snap_trim_sleep_hybrid", std::to_string(0));
10127
10128 // Disable scrub sleep
10129 cct->_conf.set_val("osd_scrub_sleep", std::to_string(0));
10130 return true;
10131 }
10132 return false;
10133 }
10134
10135 void OSD::update_log_config()
10136 {
10137 map<string,string> log_to_monitors;
10138 map<string,string> log_to_syslog;
10139 map<string,string> log_channel;
10140 map<string,string> log_prio;
10141 map<string,string> log_to_graylog;
10142 map<string,string> log_to_graylog_host;
10143 map<string,string> log_to_graylog_port;
10144 uuid_d fsid;
10145 string host;
10146
10147 if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
10148 log_channel, log_prio, log_to_graylog,
10149 log_to_graylog_host, log_to_graylog_port,
10150 fsid, host) == 0)
10151 clog->update_config(log_to_monitors, log_to_syslog,
10152 log_channel, log_prio, log_to_graylog,
10153 log_to_graylog_host, log_to_graylog_port,
10154 fsid, host);
10155 derr << "log_to_monitors " << log_to_monitors << dendl;
10156 }
10157
10158 void OSD::check_config()
10159 {
10160 // some sanity checks
10161 if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
10162 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
10163 << " is not > osd_pg_epoch_persisted_max_stale ("
10164 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
10165 }
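// e.g. (hypothetical values): osd_map_cache_size = 50 with
// osd_pg_epoch_persisted_max_stale = 40 passes the check above, since
// 50 > 40 + 2.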
10166 if (cct->_conf->osd_object_clean_region_max_num_intervals < 0) {
10167 clog->warn() << "osd_object_clean_region_max_num_intervals ("
10168 << cct->_conf->osd_object_clean_region_max_num_intervals
10169 << ") is < 0";
10170 }
10171 }
10172
10173 // --------------------------------
10174
10175 void OSD::get_latest_osdmap()
10176 {
10177 dout(10) << __func__ << " -- start" << dendl;
10178
10179 boost::system::error_code ec;
10180 service.objecter->wait_for_latest_osdmap(ceph::async::use_blocked[ec]);
10181
10182 dout(10) << __func__ << " -- finish" << dendl;
10183 }
10184
10185 // --------------------------------
10186
10187 void OSD::set_perf_queries(const ConfigPayload &config_payload) {
10188 const OSDConfigPayload &osd_config_payload = boost::get<OSDConfigPayload>(config_payload);
10189 const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries = osd_config_payload.config;
10190 dout(10) << "setting " << queries.size() << " queries" << dendl;
10191
10192 std::list<OSDPerfMetricQuery> supported_queries;
10193 for (auto &it : queries) {
10194 auto &query = it.first;
10195 if (!query.key_descriptor.empty()) {
10196 supported_queries.push_back(query);
10197 }
10198 }
10199 if (supported_queries.size() < queries.size()) {
10200 dout(1) << queries.size() - supported_queries.size()
10201 << " unsupported queries" << dendl;
10202 }
10203 {
10204 std::lock_guard locker{m_perf_queries_lock};
10205 m_perf_queries = supported_queries;
10206 m_perf_limits = queries;
10207 }
10208 std::vector<PGRef> pgs;
10209 _get_pgs(&pgs);
10210 for (auto& pg : pgs) {
10211 std::scoped_lock l{*pg};
10212 pg->set_dynamic_perf_stats_queries(supported_queries);
10213 }
10214 }
10215
10216 MetricPayload OSD::get_perf_reports() {
10217 OSDMetricPayload payload;
10218 std::map<OSDPerfMetricQuery, OSDPerfMetricReport> &reports = payload.report;
10219
10220 std::vector<PGRef> pgs;
10221 _get_pgs(&pgs);
10222 DynamicPerfStats dps;
10223 for (auto& pg : pgs) {
10224 // m_perf_queries can be modified only in set_perf_queries by mgr client
10225 // request, and it is protected by the mgr client's lock, which is held
10226 // when set_perf_queries/get_perf_reports are called, so we need not hold
10227 // m_perf_queries_lock here.
10228 DynamicPerfStats pg_dps(m_perf_queries);
10229 pg->lock();
10230 pg->get_dynamic_perf_stats(&pg_dps);
10231 pg->unlock();
10232 dps.merge(pg_dps);
10233 }
10234 dps.add_to_reports(m_perf_limits, &reports);
10235 dout(20) << "reports for " << reports.size() << " queries" << dendl;
10236
10237 return payload;
10238 }
10239
10240 // =============================================================
10241
10242 #undef dout_context
10243 #define dout_context cct
10244 #undef dout_prefix
10245 #define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
10246
10247 void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
10248 {
10249 dout(10) << pg->pg_id << " " << pg << dendl;
10250 slot->pg = pg;
10251 pg->osd_shard = this;
10252 pg->pg_slot = slot;
10253 osd->inc_num_pgs();
10254
10255 slot->epoch = pg->get_osdmap_epoch();
10256 pg_slots_by_epoch.insert(*slot);
10257 }
10258
10259 void OSDShard::_detach_pg(OSDShardPGSlot *slot)
10260 {
10261 dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
10262 slot->pg->osd_shard = nullptr;
10263 slot->pg->pg_slot = nullptr;
10264 slot->pg = nullptr;
10265 osd->dec_num_pgs();
10266
10267 pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
10268 slot->epoch = 0;
10269 if (waiting_for_min_pg_epoch) {
10270 min_pg_epoch_cond.notify_all();
10271 }
10272 }
10273
10274 void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
10275 {
10276 std::lock_guard l(shard_lock);
10277 dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
10278 << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
10279 pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
10280 dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
10281 slot->epoch = e;
10282 pg_slots_by_epoch.insert(*slot);
10283 dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
10284 << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
10285 if (waiting_for_min_pg_epoch) {
10286 min_pg_epoch_cond.notify_all();
10287 }
10288 }
10289
10290 epoch_t OSDShard::get_min_pg_epoch()
10291 {
10292 std::lock_guard l(shard_lock);
10293 auto p = pg_slots_by_epoch.begin();
10294 if (p == pg_slots_by_epoch.end()) {
10295 return 0;
10296 }
10297 return p->epoch;
10298 }
10299
10300 void OSDShard::wait_min_pg_epoch(epoch_t need)
10301 {
10302 std::unique_lock l{shard_lock};
10303 ++waiting_for_min_pg_epoch;
10304 min_pg_epoch_cond.wait(l, [need, this] {
10305 if (pg_slots_by_epoch.empty()) {
10306 return true;
10307 } else if (pg_slots_by_epoch.begin()->epoch >= need) {
10308 return true;
10309 } else {
10310 dout(10) << need << " waiting on "
10311 << pg_slots_by_epoch.begin()->epoch << dendl;
10312 return false;
10313 }
10314 });
10315 --waiting_for_min_pg_epoch;
10316 }
10317
10318 epoch_t OSDShard::get_max_waiting_epoch()
10319 {
10320 std::lock_guard l(shard_lock);
10321 epoch_t r = 0;
10322 for (auto& i : pg_slots) {
10323 if (!i.second->waiting_peering.empty()) {
10324 r = std::max(r, i.second->waiting_peering.rbegin()->first);
10325 }
10326 }
10327 return r;
10328 }
10329
10330 void OSDShard::consume_map(
10331 const OSDMapRef& new_osdmap,
10332 unsigned *pushes_to_free)
10333 {
10334 std::lock_guard l(shard_lock);
10335 OSDMapRef old_osdmap;
10336 {
10337 std::lock_guard l(osdmap_lock);
10338 old_osdmap = std::move(shard_osdmap);
10339 shard_osdmap = new_osdmap;
10340 }
10341 dout(10) << new_osdmap->get_epoch()
10342 << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
10343 << dendl;
10344 bool queued = false;
10345
10346 // check slots
10347 auto p = pg_slots.begin();
10348 while (p != pg_slots.end()) {
10349 OSDShardPGSlot *slot = p->second.get();
10350 const spg_t& pgid = p->first;
10351 dout(20) << __func__ << " " << pgid << dendl;
10352 if (!slot->waiting_for_split.empty()) {
10353 dout(20) << __func__ << " " << pgid
10354 << " waiting for split " << slot->waiting_for_split << dendl;
10355 ++p;
10356 continue;
10357 }
10358 if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
10359 dout(20) << __func__ << " " << pgid
10360 << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
10361 << dendl;
10362 ++p;
10363 continue;
10364 }
10365 if (!slot->waiting_peering.empty()) {
10366 epoch_t first = slot->waiting_peering.begin()->first;
10367 if (first <= new_osdmap->get_epoch()) {
10368 dout(20) << __func__ << " " << pgid
10369 << " pending_peering first epoch " << first
10370 << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
10371 _wake_pg_slot(pgid, slot);
10372 queued = true;
10373 }
10374 ++p;
10375 continue;
10376 }
10377 if (!slot->waiting.empty()) {
10378 if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
10379 dout(20) << __func__ << " " << pgid << " maps to us, keeping"
10380 << dendl;
10381 ++p;
10382 continue;
10383 }
10384 while (!slot->waiting.empty() &&
10385 slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
10386 auto& qi = slot->waiting.front();
10387 dout(20) << __func__ << " " << pgid
10388 << " waiting item " << qi
10389 << " epoch " << qi.get_map_epoch()
10390 << " <= " << new_osdmap->get_epoch()
10391 << ", "
10392 << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
10393 "misdirected")
10394 << ", dropping" << dendl;
10395 *pushes_to_free += qi.get_reserved_pushes();
10396 slot->waiting.pop_front();
10397 }
10398 }
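// (illustrative) an item stamped e900 against a new e905 map is "stale";
// one stamped exactly e905 whose pg no longer maps here is "misdirected".
// Either way it is dropped and its reserved pushes are freed above.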
10399 if (slot->waiting.empty() &&
10400 slot->num_running == 0 &&
10401 slot->waiting_for_split.empty() &&
10402 !slot->pg) {
10403 dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
10404 p = pg_slots.erase(p);
10405 continue;
10406 }
10407
10408 ++p;
10409 }
10410 if (queued) {
10411 std::lock_guard l{sdata_wait_lock};
10412 sdata_cond.notify_one();
10413 }
10414 }
10415
10416 void OSDShard::_wake_pg_slot(
10417 spg_t pgid,
10418 OSDShardPGSlot *slot)
10419 {
10420 dout(20) << __func__ << " " << pgid
10421 << " to_process " << slot->to_process
10422 << " waiting " << slot->waiting
10423 << " waiting_peering " << slot->waiting_peering << dendl;
10424 for (auto i = slot->to_process.rbegin();
10425 i != slot->to_process.rend();
10426 ++i) {
10427 scheduler->enqueue_front(std::move(*i));
10428 }
10429 slot->to_process.clear();
10430 for (auto i = slot->waiting.rbegin();
10431 i != slot->waiting.rend();
10432 ++i) {
10433 scheduler->enqueue_front(std::move(*i));
10434 }
10435 slot->waiting.clear();
10436 for (auto i = slot->waiting_peering.rbegin();
10437 i != slot->waiting_peering.rend();
10438 ++i) {
10439 // this is overkill; we requeue everything, even if some of these
10440 // items are waiting for maps we don't have yet. FIXME, maybe,
10441 // someday, if we decide this inefficiency matters
10442 for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
10443 scheduler->enqueue_front(std::move(*j));
10444 }
10445 }
10446 slot->waiting_peering.clear();
10447 ++slot->requeue_seq;
10448 }
10449
10450 void OSDShard::identify_splits_and_merges(
10451 const OSDMapRef& as_of_osdmap,
10452 set<pair<spg_t,epoch_t>> *split_pgs,
10453 set<pair<spg_t,epoch_t>> *merge_pgs)
10454 {
10455 std::lock_guard l(shard_lock);
10456 if (shard_osdmap) {
10457 for (auto& i : pg_slots) {
10458 const spg_t& pgid = i.first;
10459 auto *slot = i.second.get();
10460 if (slot->pg) {
10461 osd->service.identify_splits_and_merges(
10462 shard_osdmap, as_of_osdmap, pgid,
10463 split_pgs, merge_pgs);
10464 } else if (!slot->waiting_for_split.empty()) {
10465 osd->service.identify_splits_and_merges(
10466 shard_osdmap, as_of_osdmap, pgid,
10467 split_pgs, nullptr);
10468 } else {
10469 dout(20) << __func__ << " slot " << pgid
10470 << " has no pg and waiting_for_split " << dendl;
10471 }
10472 }
10473 }
10474 }
10475
10476 void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
10477 set<pair<spg_t,epoch_t>> *pgids)
10478 {
10479 std::lock_guard l(shard_lock);
10480 _prime_splits(pgids);
10481 if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
10482 set<pair<spg_t,epoch_t>> newer_children;
10483 for (auto i : *pgids) {
10484 osd->service.identify_splits_and_merges(
10485 as_of_osdmap, shard_osdmap, i.first,
10486 &newer_children, nullptr);
10487 }
10488 newer_children.insert(pgids->begin(), pgids->end());
10489 dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
10490 << shard_osdmap->get_epoch() << ", new children " << newer_children
10491 << dendl;
10492 _prime_splits(&newer_children);
10493 // note: we don't care what is left over here for other shards.
10494 // if this shard's map is ahead of the caller's, e.g., one thread is
10495 // calling into prime_splits via _process (due to a newly created
10496 // pg) while this shard has a newer map due to a racing consume_map,
10497 // then any grandchildren left here will be identified (or were
10498 // identified) when the slower shard's osdmap is advanced.
10499 // _prime_splits() will tolerate the case where the pgid is
10500 // already primed.
10501 }
10502 }
10503
10504 void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
10505 {
10506 dout(10) << *pgids << dendl;
10507 auto p = pgids->begin();
10508 while (p != pgids->end()) {
10509 unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
10510 if (shard_index == shard_id) {
10511 auto r = pg_slots.emplace(p->first, nullptr);
10512 if (r.second) {
10513 dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
10514 r.first->second = make_unique<OSDShardPGSlot>();
10515 r.first->second->waiting_for_split.insert(p->second);
10516 } else {
10517 auto q = r.first;
10518 ceph_assert(q != pg_slots.end());
10519 dout(10) << "priming (existing) slot " << p->first << " e" << p->second
10520 << dendl;
10521 q->second->waiting_for_split.insert(p->second);
10522 }
10523 p = pgids->erase(p);
10524 } else {
10525 ++p;
10526 }
10527 }
10528 }
10529
10530 void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
10531 set<pair<spg_t,epoch_t>> *merge_pgs)
10532 {
10533 std::lock_guard l(shard_lock);
10534 dout(20) << __func__ << " checking shard " << shard_id
10535 << " for remaining merge pgs " << merge_pgs << dendl;
10536 auto p = merge_pgs->begin();
10537 while (p != merge_pgs->end()) {
10538 spg_t pgid = p->first;
10539 epoch_t epoch = p->second;
10540 unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
10541 if (shard_index != shard_id) {
10542 ++p;
10543 continue;
10544 }
10545 OSDShardPGSlot *slot;
10546 auto r = pg_slots.emplace(pgid, nullptr);
10547 if (r.second) {
10548 r.first->second = make_unique<OSDShardPGSlot>();
10549 }
10550 slot = r.first->second.get();
10551 if (slot->pg) {
10552 // already have pg
10553 dout(20) << __func__ << " have merge participant pg " << pgid
10554 << " " << slot->pg << dendl;
10555 } else if (!slot->waiting_for_split.empty() &&
10556 *slot->waiting_for_split.begin() < epoch) {
10557 dout(20) << __func__ << " pending split on merge participant pg " << pgid
10558 << " " << slot->waiting_for_split << dendl;
10559 } else {
10560 dout(20) << __func__ << " creating empty merge participant " << pgid
10561 << " for merge in " << epoch << dendl;
10562 // leave history zeroed; PG::merge_from() will fill it in.
10563 pg_history_t history;
10564 PGCreateInfo cinfo(pgid, epoch - 1,
10565 history, PastIntervals(), false);
10566 PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
10567 _attach_pg(r.first->second.get(), pg.get());
10568 _wake_pg_slot(pgid, slot);
10569 pg->unlock();
10570 }
10571 // mark slot for merge
10572 dout(20) << __func__ << " marking merge participant " << pgid << dendl;
10573 slot->waiting_for_merge_epoch = epoch;
10574 p = merge_pgs->erase(p);
10575 }
10576 }
10577
10578 void OSDShard::register_and_wake_split_child(PG *pg)
10579 {
10580 epoch_t epoch;
10581 {
10582 std::lock_guard l(shard_lock);
10583 dout(10) << pg->pg_id << " " << pg << dendl;
10584 auto p = pg_slots.find(pg->pg_id);
10585 ceph_assert(p != pg_slots.end());
10586 auto *slot = p->second.get();
10587 dout(20) << pg->pg_id << " waiting_for_split " << slot->waiting_for_split
10588 << dendl;
10589 ceph_assert(!slot->pg);
10590 ceph_assert(!slot->waiting_for_split.empty());
10591 _attach_pg(slot, pg);
10592
10593 epoch = pg->get_osdmap_epoch();
10594 ceph_assert(slot->waiting_for_split.count(epoch));
10595 slot->waiting_for_split.erase(epoch);
10596 if (slot->waiting_for_split.empty()) {
10597 _wake_pg_slot(pg->pg_id, slot);
10598 } else {
10599 dout(10) << __func__ << " still waiting for split on "
10600 << slot->waiting_for_split << dendl;
10601 }
10602 }
10603
10604 // kick child to ensure it pulls up to the latest osdmap
10605 osd->enqueue_peering_evt(
10606 pg->pg_id,
10607 PGPeeringEventRef(
10608 std::make_shared<PGPeeringEvent>(
10609 epoch,
10610 epoch,
10611 NullEvt())));
10612
10613 std::lock_guard l{sdata_wait_lock};
10614 sdata_cond.notify_one();
10615 }
10616
10617 void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
10618 {
10619 std::lock_guard l(shard_lock);
10620 vector<spg_t> to_delete;
10621 for (auto& i : pg_slots) {
10622 if (i.first != parent &&
10623 i.first.get_ancestor(old_pg_num) == parent) {
10624 dout(10) << __func__ << " parent " << parent << " clearing " << i.first
10625 << dendl;
10626 _wake_pg_slot(i.first, i.second.get());
10627 to_delete.push_back(i.first);
10628 }
10629 }
10630 for (auto pgid : to_delete) {
10631 pg_slots.erase(pgid);
10632 }
10633 }
10634
10635 OSDShard::OSDShard(
10636 int id,
10637 CephContext *cct,
10638 OSD *osd)
10639 : shard_id(id),
10640 cct(cct),
10641 osd(osd),
10642 shard_name(string("OSDShard.") + stringify(id)),
10643 sdata_wait_lock_name(shard_name + "::sdata_wait_lock"),
10644 sdata_wait_lock{make_mutex(sdata_wait_lock_name)},
10645 osdmap_lock{make_mutex(shard_name + "::osdmap_lock")},
10646 shard_lock_name(shard_name + "::shard_lock"),
10647 shard_lock{make_mutex(shard_lock_name)},
10648 scheduler(ceph::osd::scheduler::make_scheduler(
10649 cct, osd->num_shards, osd->store->is_rotational())),
10650 context_queue(sdata_wait_lock, sdata_cond)
10651 {
10652 dout(0) << "using op scheduler " << *scheduler << dendl;
10653 }
10654
10655
10656 // =============================================================
10657
10658 #undef dout_context
10659 #define dout_context osd->cct
10660 #undef dout_prefix
10661 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10662
10663 void OSD::ShardedOpWQ::_add_slot_waiter(
10664 spg_t pgid,
10665 OSDShardPGSlot *slot,
10666 OpSchedulerItem&& qi)
10667 {
10668 if (qi.is_peering()) {
10669 dout(20) << __func__ << " " << pgid
10670 << " peering, item epoch is "
10671 << qi.get_map_epoch()
10672 << ", will wait on " << qi << dendl;
10673 slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
10674 } else {
10675 dout(20) << __func__ << " " << pgid
10676 << " item epoch is "
10677 << qi.get_map_epoch()
10678 << ", will wait on " << qi << dendl;
10679 slot->waiting.push_back(std::move(qi));
10680 }
10681 }
10682
10683 #undef dout_prefix
10684 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10685
10686 void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
10687 {
10688 uint32_t shard_index = thread_index % osd->num_shards;
10689 auto& sdata = osd->shards[shard_index];
10690 ceph_assert(sdata);
10691
10692 // If every thread in every shard ran oncommit callbacks, they could
10693 // complete out of order. So we designate the thread with the smallest
10694 // thread_index (thread_index < num_shards) in each shard to run the
10695 // oncommit callbacks.
10696 bool is_smallest_thread_index = thread_index < osd->num_shards;
10697
10698 // check for work; if both the scheduler and context queue are empty, wait
10699 sdata->shard_lock.lock();
10700 if (sdata->scheduler->empty() &&
10701 (!is_smallest_thread_index || sdata->context_queue.empty())) {
10702 std::unique_lock wait_lock{sdata->sdata_wait_lock};
10703 if (is_smallest_thread_index && !sdata->context_queue.empty()) {
10704 // we raced with a context_queue addition, don't wait
10705 wait_lock.unlock();
10706 } else if (!sdata->stop_waiting) {
10707 dout(20) << __func__ << " empty q, waiting" << dendl;
10708 osd->cct->get_heartbeat_map()->clear_timeout(hb);
10709 sdata->shard_lock.unlock();
10710 sdata->sdata_cond.wait(wait_lock);
10711 wait_lock.unlock();
10712 sdata->shard_lock.lock();
10713 if (sdata->scheduler->empty() &&
10714 !(is_smallest_thread_index && !sdata->context_queue.empty())) {
10715 sdata->shard_lock.unlock();
10716 return;
10717 }
10718 // found a work item; reapply default wq timeouts
10719 osd->cct->get_heartbeat_map()->reset_timeout(hb,
10720 timeout_interval, suicide_interval);
10721 } else {
10722 dout(20) << __func__ << " need to return immediately" << dendl;
10723 wait_lock.unlock();
10724 sdata->shard_lock.unlock();
10725 return;
10726 }
10727 }
10728
10729 list<Context *> oncommits;
10730 if (is_smallest_thread_index) {
10731 sdata->context_queue.move_to(oncommits);
10732 }
10733
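// Loop until the scheduler yields an actual OpSchedulerItem; it may
// instead return a time at which the next item becomes ready.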
10734 WorkItem work_item;
10735 while (!std::get_if<OpSchedulerItem>(&work_item)) {
10736 if (sdata->scheduler->empty()) {
10737 if (osd->is_stopping()) {
10738 sdata->shard_lock.unlock();
10739 for (auto c : oncommits) {
10740 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10741 delete c;
10742 }
10743 return; // OSD shutdown, discard.
10744 }
10745 sdata->shard_lock.unlock();
10746 handle_oncommits(oncommits);
10747 return;
10748 }
10749
10750 work_item = sdata->scheduler->dequeue();
10751 if (osd->is_stopping()) {
10752 sdata->shard_lock.unlock();
10753 for (auto c : oncommits) {
10754 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10755 delete c;
10756 }
10757 return; // OSD shutdown, discard.
10758 }
10759
10760 // If the work item is scheduled in the future, wait until
10761 // the time returned in the dequeue response before retrying.
10762 if (auto when_ready = std::get_if<double>(&work_item)) {
10763 if (is_smallest_thread_index) {
10764 sdata->shard_lock.unlock();
10765 handle_oncommits(oncommits);
10766 return;
10767 }
10768 std::unique_lock wait_lock{sdata->sdata_wait_lock};
10769 auto future_time = ceph::real_clock::from_double(*when_ready);
10770 dout(10) << __func__ << " dequeue future request at " << future_time << dendl;
10771 sdata->shard_lock.unlock();
10772 ++sdata->waiting_threads;
10773 sdata->sdata_cond.wait_until(wait_lock, future_time);
10774 --sdata->waiting_threads;
10775 wait_lock.unlock();
10776 sdata->shard_lock.lock();
10777 }
10778 } // while
10779
10780 // Access the stored item
10781 auto item = std::move(std::get<OpSchedulerItem>(work_item));
10782 if (osd->is_stopping()) {
10783 sdata->shard_lock.unlock();
10784 for (auto c : oncommits) {
10785 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10786 delete c;
10787 }
10788 return; // OSD shutdown, discard.
10789 }
10790
10791 const auto token = item.get_ordering_token();
10792 auto r = sdata->pg_slots.emplace(token, nullptr);
10793 if (r.second) {
10794 r.first->second = make_unique<OSDShardPGSlot>();
10795 }
10796 OSDShardPGSlot *slot = r.first->second.get();
10797 dout(20) << __func__ << " " << token
10798 << (r.second ? " (new)" : "")
10799 << " to_process " << slot->to_process
10800 << " waiting " << slot->waiting
10801 << " waiting_peering " << slot->waiting_peering
10802 << dendl;
10803 slot->to_process.push_back(std::move(item));
10804 dout(20) << __func__ << " " << slot->to_process.back()
10805 << " queued" << dendl;
10806
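// Resolve the slot's pg and take its lock. Because shard_lock must be
// dropped while acquiring the pg lock, we revalidate the slot afterwards
// and jump back here if it was re-attached in the meantime.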
10807 retry_pg:
10808 PGRef pg = slot->pg;
10809
10810 // lock pg (if we have it)
10811 if (pg) {
10812 // note the requeue seq now...
10813 uint64_t requeue_seq = slot->requeue_seq;
10814 ++slot->num_running;
10815
10816 sdata->shard_lock.unlock();
10817 osd->service.maybe_inject_dispatch_delay();
10818 pg->lock();
10819 osd->service.maybe_inject_dispatch_delay();
10820 sdata->shard_lock.lock();
10821
10822 auto q = sdata->pg_slots.find(token);
10823 if (q == sdata->pg_slots.end()) {
10824 // this can happen if we race with pg removal.
10825 dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
10826 pg->unlock();
10827 sdata->shard_lock.unlock();
10828 handle_oncommits(oncommits);
10829 return;
10830 }
10831 slot = q->second.get();
10832 --slot->num_running;
10833
10834 if (slot->to_process.empty()) {
10835 // raced with _wake_pg_slot or consume_map
10836 dout(20) << __func__ << " " << token
10837 << " nothing queued" << dendl;
10838 pg->unlock();
10839 sdata->shard_lock.unlock();
10840 handle_oncommits(oncommits);
10841 return;
10842 }
10843 if (requeue_seq != slot->requeue_seq) {
10844 dout(20) << __func__ << " " << token
10845 << " requeue_seq " << slot->requeue_seq << " > our "
10846 << requeue_seq << ", we raced with _wake_pg_slot"
10847 << dendl;
10848 pg->unlock();
10849 sdata->shard_lock.unlock();
10850 handle_oncommits(oncommits);
10851 return;
10852 }
10853 if (slot->pg != pg) {
10854 // this can happen if we race with pg removal.
10855 dout(20) << __func__ << " slot " << token << " no longer attached to "
10856 << pg << dendl;
10857 pg->unlock();
10858 goto retry_pg;
10859 }
10860 }
10861
10862 dout(20) << __func__ << " " << token
10863 << " to_process " << slot->to_process
10864 << " waiting " << slot->waiting
10865 << " waiting_peering " << slot->waiting_peering << dendl;
10866
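// the TPHandle lets the running item periodically reset its heartbeat
// timeout so long-running work is not flagged as hung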
10867 ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
10868 suicide_interval);
10869
10870 // take next item
10871 auto qi = std::move(slot->to_process.front());
10872 slot->to_process.pop_front();
10873 dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
10874 set<pair<spg_t,epoch_t>> new_children;
10875 OSDMapRef osdmap;
10876
10877 while (!pg) {
10878 // should this pg shard exist on this osd in this (or a later) epoch?
10879 osdmap = sdata->shard_osdmap;
10880 const PGCreateInfo *create_info = qi.creates_pg();
10881 if (!slot->waiting_for_split.empty()) {
10882 dout(20) << __func__ << " " << token
10883 << " splitting " << slot->waiting_for_split << dendl;
10884 _add_slot_waiter(token, slot, std::move(qi));
10885 } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
10886 dout(20) << __func__ << " " << token
10887 << " map " << qi.get_map_epoch() << " > "
10888 << osdmap->get_epoch() << dendl;
10889 _add_slot_waiter(token, slot, std::move(qi));
10890 } else if (qi.is_peering()) {
10891 if (!qi.peering_requires_pg()) {
10892 // for pg-less events, we run them under the ordering lock, since
10893 // we don't have the pg lock to keep them ordered.
10894 qi.run(osd, sdata, pg, tp_handle);
10895 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
10896 if (create_info) {
10897 if (create_info->by_mon &&
10898 osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
10899 dout(20) << __func__ << " " << token
10900 << " no pg, no longer primary, ignoring mon create on "
10901 << qi << dendl;
10902 } else {
10903 dout(20) << __func__ << " " << token
10904 << " no pg, should create on " << qi << dendl;
10905 pg = osd->handle_pg_create_info(osdmap, create_info);
10906 if (pg) {
10907 // we created the pg! drop out and continue "normally"!
10908 sdata->_attach_pg(slot, pg.get());
10909 sdata->_wake_pg_slot(token, slot);
10910
10911 // identify split children between create epoch and shard epoch.
10912 osd->service.identify_splits_and_merges(
10913 pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
10914 sdata->_prime_splits(&new_children);
10915 // distribute remaining split children to other shards below!
10916 break;
10917 }
10918 dout(20) << __func__ << " ignored create on " << qi << dendl;
10919 }
10920 } else {
10921 dout(20) << __func__ << " " << token
10922 << " no pg, peering, !create, discarding " << qi << dendl;
10923 }
10924 } else {
10925 dout(20) << __func__ << " " << token
10926 << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
10927 << ", discarding " << qi
10928 << dendl;
10929 }
10930 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
10931 dout(20) << __func__ << " " << token
10932 << " no pg, should exist e" << osdmap->get_epoch()
10933 << ", will wait on " << qi << dendl;
10934 _add_slot_waiter(token, slot, std::move(qi));
10935 } else {
10936 dout(20) << __func__ << " " << token
10937 << " no pg, shouldn't exist e" << osdmap->get_epoch()
10938 << ", dropping " << qi << dendl;
10939 // share map with client?
10940 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
10941 osd->service.maybe_share_map((*_op)->get_req()->get_connection().get(),
10942 sdata->shard_osdmap,
10943 (*_op)->sent_epoch);
10944 }
10945 unsigned pushes_to_free = qi.get_reserved_pushes();
10946 if (pushes_to_free > 0) {
10947 sdata->shard_lock.unlock();
10948 osd->service.release_reserved_pushes(pushes_to_free);
10949 handle_oncommits(oncommits);
10950 return;
10951 }
10952 }
10953 sdata->shard_lock.unlock();
10954 handle_oncommits(oncommits);
10955 return;
10956 }
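// we now hold both shard_lock and the pg lock; a peering item whose
// epoch is ahead of this shard's map must still wait for the map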
10957 if (qi.is_peering()) {
10958 OSDMapRef osdmap = sdata->shard_osdmap;
10959 if (qi.get_map_epoch() > osdmap->get_epoch()) {
10960 _add_slot_waiter(token, slot, std::move(qi));
10961 sdata->shard_lock.unlock();
10962 pg->unlock();
10963 handle_oncommits(oncommits);
10964 return;
10965 }
10966 }
10967 sdata->shard_lock.unlock();
10968
10969 if (!new_children.empty()) {
10970 for (auto shard : osd->shards) {
10971 shard->prime_splits(osdmap, &new_children);
10972 }
10973 ceph_assert(new_children.empty());
10974 }
10975
10976 // osd_opwq_process marks the point at which an operation has been dequeued
10977 // and will begin to be handled by a worker thread.
10978 {
10979 #ifdef WITH_LTTNG
10980 osd_reqid_t reqid;
10981 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
10982 reqid = (*_op)->get_reqid();
10983 }
10984 #endif
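// tracepoint() is assumed to compile to a no-op when LTTNG tracing is
// disabled, so reqid only needs to be populated under WITH_LTTNG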
10985 tracepoint(osd, opwq_process_start, reqid.name._type,
10986 reqid.name._num, reqid.tid, reqid.inc);
10987 }
10988
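// at debug level 30, dump the full queue state before running the item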
10989 lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
10990 Formatter *f = Formatter::create("json");
10991 f->open_object_section("q");
10992 dump(f);
10993 f->close_section();
10994 f->flush(*_dout);
10995 delete f;
10996 *_dout << dendl;
10997
10998 qi.run(osd, sdata, pg, tp_handle);
10999
11000 {
11001 #ifdef WITH_LTTNG
11002 osd_reqid_t reqid;
11003 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
11004 reqid = (*_op)->get_reqid();
11005 }
11006 #endif
11007 tracepoint(osd, opwq_process_finish, reqid.name._type,
11008 reqid.name._num, reqid.tid, reqid.inc);
11009 }
11010
11011 handle_oncommits(oncommits);
11012 }
11013
11014 void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
11015 uint32_t shard_index =
11016 item.get_ordering_token().hash_to_shard(osd->shards.size());
11017
11018 dout(20) << __func__ << " " << item << dendl;
11019
11020 OSDShard* sdata = osd->shards[shard_index];
11021 ceph_assert(sdata);
11022
11023 bool empty = true;
11024 {
11025 std::lock_guard l{sdata->shard_lock};
11026 empty = sdata->scheduler->empty();
11027 sdata->scheduler->enqueue(std::move(item));
11028 }
11029
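// If the queue was empty, every worker may be asleep in the empty-queue
// wait, so wake them all; otherwise a single waiter (one blocked on a
// future-scheduled item) is enough.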
11030 {
11031 std::lock_guard l{sdata->sdata_wait_lock};
11032 if (empty) {
11033 sdata->sdata_cond.notify_all();
11034 } else if (sdata->waiting_threads) {
11035 sdata->sdata_cond.notify_one();
11036 }
11037 }
11038 }
11039
11040 void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
11041 {
11042 auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
11043 auto& sdata = osd->shards[shard_index];
11044 ceph_assert(sdata);
11045 sdata->shard_lock.lock();
11046 auto p = sdata->pg_slots.find(item.get_ordering_token());
11047 if (p != sdata->pg_slots.end() &&
11048 !p->second->to_process.empty()) {
11049 // we may be racing with _process, which has dequeued a new item
11050 // from scheduler, put it on to_process, and is now busy taking the
11051 // pg lock. ensure this old requeued item is ordered before any
11052 // such newer item in to_process.
11053 p->second->to_process.push_front(std::move(item));
11054 item = std::move(p->second->to_process.back());
11055 p->second->to_process.pop_back();
11056 dout(20) << __func__
11057 << " " << p->second->to_process.front()
11058 << " shuffled w/ " << item << dendl;
11059 } else {
11060 dout(20) << __func__ << " " << item << dendl;
11061 }
11062 sdata->scheduler->enqueue_front(std::move(item));
11063 sdata->shard_lock.unlock();
11064 std::lock_guard l{sdata->sdata_wait_lock};
11065 sdata->sdata_cond.notify_one();
11066 }
11067
11068 namespace ceph::osd_cmds {
11069
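// Handler for the OSD's heap-profiler admin command: forwards the
// subcommand (and optional value) to tcmalloc's heap profiler.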
11070 int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f,
11071 std::ostream& os)
11072 {
11073 if (!ceph_using_tcmalloc()) {
11074 os << "could not issue heap profiler command -- not using tcmalloc!";
11075 return -EOPNOTSUPP;
11076 }
11077
11078 string cmd;
11079 if (!cmd_getval(cmdmap, "heapcmd", cmd)) {
11080 os << "unable to get value for command \"heapcmd\"";
11081 return -EINVAL;
11082 }
11083
11084 std::vector<std::string> cmd_vec;
11085 get_str_vec(cmd, cmd_vec);
11086
11087 string val;
11088 if (cmd_getval(cmdmap, "value", val)) {
11089 cmd_vec.push_back(val);
11090 }
11091
11092 ceph_heap_profiler_handle_command(cmd_vec, os);
11093
11094 return 0;
11095 }
11096
11097 } // namespace ceph::osd_cmds